diff mbox

[ovs-dev,v2,04/15] conntrack: New userspace connection tracker.

Message ID 1460764967-110943-5-git-send-email-diproiettod@vmware.com
State Changes Requested
Headers show

Commit Message

Daniele Di Proietto April 16, 2016, 12:02 a.m. UTC
This commit adds the conntrack module.

It is a connection tracker that resides entirely in userspace.  Its
primary user will be the dpif-netdev datapath.

The module's main goal is to provide conntrack_execute(), which offers a
convenient interface to implement the datapath ct() action.

The conntrack module uses two submodules to deal with the l4 protocol
details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
subsystem, therefore it's BSD licensed.  It has been slightly altered to
match the OVS coding style and to allow the pickup of already
established connections.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
---
 COPYING                 |   1 +
 debian/copyright.in     |   4 +
 lib/automake.mk         |   5 +
 lib/conntrack-other.c   |  91 ++++++
 lib/conntrack-private.h |  77 +++++
 lib/conntrack-tcp.c     | 476 +++++++++++++++++++++++++++
 lib/conntrack.c         | 851 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/conntrack.h         | 144 ++++++++
 8 files changed, 1649 insertions(+)
 create mode 100644 lib/conntrack-other.c
 create mode 100644 lib/conntrack-private.h
 create mode 100644 lib/conntrack-tcp.c
 create mode 100644 lib/conntrack.c
 create mode 100644 lib/conntrack.h

Comments

Joe Stringer April 19, 2016, 9:36 p.m. UTC | #1
On 15 April 2016 at 17:02, Daniele Di Proietto <diproiettod@vmware.com> wrote:
> This commit adds the conntrack module.
>
> It is a connection tracker that resides entirely in userspace.  Its
> primary user will be the dpif-netdev datapath.
>
> The module main goal is to provide conntrack_execute(), which offers a
> convenient interface to implement the datapath ct() action.
>
> The conntrack module uses two submodules to deal with the l4 protocol
> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).
>
> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
> subsystem, therefore it's BSD licensed.  It has been slightly altered to
> match the OVS coding style and to allow the pickup of already
> established connections.
>
> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

Thanks for submitting this, I know there's a few people excited about
this feature.

I notice that there is no NEWS item about this. Were you intending to
hold off on announcing it until there is feature parity with the
kernel, e.g. support for IPv[46] fragments, ALGs, and NAT?


> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
> new file mode 100644
> index 0000000..65d02a9
> --- /dev/null
> +++ b/lib/conntrack-other.c
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright (c) 2015, 2016 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +
> +#include "conntrack-private.h"
> +#include "dp-packet.h"
> +
> +enum other_state {
> +    OTHERS_FIRST,
> +    OTHERS_MULTIPLE,
> +    OTHERS_BIDIR,
> +};
> +
> +struct conn_other {
> +    struct conn up;
> +    enum other_state state;
> +};
> +
> +static const long long other_timeouts[] = {
> +    [OTHERS_FIRST] = 60 * 1000,
> +    [OTHERS_MULTIPLE] = 60 * 1000,
> +    [OTHERS_BIDIR] = 30 * 1000,
> +};

Are these timeouts in milliseconds? (a comment or #define might help with that)

<snip>

> diff --git a/lib/conntrack.c b/lib/conntrack.c
> new file mode 100644
> index 0000000..840335b
> --- /dev/null
> +++ b/lib/conntrack.c
...
> +static struct ct_l4_proto *l4_protos[] = {
> +    [IPPROTO_TCP] = &ct_proto_tcp,
> +    [IPPROTO_UDP] = &ct_proto_other,
> +    [IPPROTO_ICMP] = &ct_proto_other,
> +};

I see that conntrack-other is shared by UDP and ICMP. In the Linux
datapath, the UDP handler also checks UDP length and UDP checksum - I
wonder if we can share most of the code here for these protocols but
add these additional checks for the UDP case?

> +static struct conn *
> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,
> +               long long now)
> +{
> +    unsigned bucket = hash_to_bucket(ctx->hash);
> +    struct conn *nc = NULL;
> +
> +    if (!valid_new(pkt, &ctx->key)) {
> +        *state |= CS_INVALID;
> +        return nc;
> +    }
> +
> +    *state |= CS_NEW;
> +
> +    if (commit) {
> +        nc = new_conn(pkt, &ctx->key, now);
> +
> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
> +
> +        conn_key_reverse(&nc->rev_key);
> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);
> +    }
> +
> +    return nc;
> +}

This function can return NULL....

> +static struct conn *
> +process_one(struct conntrack *ct, struct dp_packet *pkt,
> +            struct conn_lookup_ctx *ctx, uint16_t zone,
> +            bool commit, long long now)
> +{
> +    unsigned bucket = hash_to_bucket(ctx->hash);
> +    struct conn *conn = ctx->conn;
> +    uint8_t state = 0;
> +
> +    if (conn) {
> +        if (ctx->related) {
> +            state |= CS_RELATED;
> +            if (ctx->reply) {
> +                state |= CS_REPLY_DIR;
> +            }
> +        } else {
> +            enum ct_update_res res;
> +
> +            res = conn_update(conn, pkt, ctx->reply, now);
> +
> +            switch (res) {
> +            case CT_UPDATE_VALID:
> +                state |= CS_ESTABLISHED;
> +                if (ctx->reply) {
> +                    state |= CS_REPLY_DIR;
> +                }
> +                break;
> +            case CT_UPDATE_INVALID:
> +                state |= CS_INVALID;
> +                break;
> +            case CT_UPDATE_NEW:
> +                hmap_remove(&ct->connections[bucket], &conn->node);
> +                delete_conn(conn);
> +                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

...which is then stored here...

> +                break;
> +            }
> +        }
> +
> +        pkt->md.ct_label = conn->label;
> +        pkt->md.ct_mark = conn->mark;
> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);

...and dereferenced here. Is it possible to send invalid packets for
an existing connection and cause a NULL pointer dereference?


> +    } else {
> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});

Maybe we can define OVS_U128_MIN (ovs_u128) {{0}} in a header somewhere?


> +/* Cleans up old connection entries.  Should be called periodically. */
> +void
> +conntrack_run(struct conntrack *ct)
> +{
> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);
> +    uint32_t inner_bucket = ct->purge_inner_bucket,
> +             inner_offset = ct->purge_inner_offset;

Style seems a little unusual for the above 'inner_offset'.


> +static inline bool
> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)
> +{
> +    const struct udp_header *udp = data;
> +
> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
> +        return false;
> +    }

Maybe my comment earlier about UDP len and checksum could be addressed here?

...


> +static bool
> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
> +                 struct conn_lookup_ctx *ctx, uint16_t zone)
> +{
> +    const struct eth_header *l2 = dp_packet_l2(pkt);
> +    const struct ip_header *l3 = dp_packet_l3(pkt);
> +    const char *l4 = dp_packet_l4(pkt);
> +    const char *tail = dp_packet_tail(pkt);
> +    bool ok;
> +
> +    memset(ctx, 0, sizeof *ctx);
> +
> +    if (!l2 || !l3 || !l4) {
> +        return false;
> +    }
> +
> +    ctx->key.zone = zone;
> +
> +    /* XXX In this function we parse the packet (again, it has already
> +     * gone through miniflow_extract()) for two reasons:
> +     *
> +     * 1) To extract the l3 addresses and l4 ports.
> +     *    We already have the l3 and l4 headers' pointers.  Extracting
> +     *    the l3 addresses and the l4 ports is really cheap, since they
> +     *    can be found at fixed locations.
> +     * 2) To extract the l3 and l4 types.
> +     *    Extracting the l3 and l4 types (especially the l3[1]) on the
> +     *    other hand is quite expensive, because they're not at a
> +     *    fixed location.
> +     *
> +     * Here's a way to avoid (2) with the help of the datapath.
> +     * The datapath doesn't keep the packet's extracted flow[2], so
> +     * using that is not an option.  We could use the packet's matching
> +     * megaflow, but we have to make sure that the l3 and l4 types
> +     * are unwildcarded.  This means either:
> +     *
> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow
> +     *    is installed if the actions contains ct().  This is what the
> +     *    kernel datapath does.  It is not so straightforward, though.
> +     *
> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when
> +     *    translating a ct() action.  This is already done in different
> +     *    actions and since both the userspace and the kernel datapath
> +     *    would benefit from it, it seems an appropriate place to do
> +     *    it.
> +     *
> +     * ---
> +     * [1] A simple benchmark (running only the connection tracker
> +     *     over and over on the same packets) shows that if the
> +     *     l3 type is already provided we are 15% faster (running the
> +     *     connection tracker over a couple of DPDK devices with a
> +     *     stream of UDP 64-bytes packets shows that we are 4% faster).
> +     *
> +     * [2] The reasons for this are that keeping the flow increases
> +     *     (slightly) the cache footprint and increases computation
> +     *     time as we move the packet around. Most importantly, the flow
> +     *     should be updated by the actions and this can be slow, as
> +     *     we use a sparse representation (miniflow).
> +     *
> +     */

I discussed the above offline with Jarno; we will always unwildcard
the l3 -- see xlate_wc_init().

Do we have a benchmark on how costly the options are for figuring out
the l4 type?


> +static void
> +conn_key_lookup(struct conntrack *ct,
> +                struct conn_lookup_ctx *ctx,
> +                unsigned bucket,
> +                long long now)
> +{
> +    struct conn *conn, *found = NULL;
> +    uint32_t hash = ctx->hash;
> +    bool reply;
> +
> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct->connections[bucket]) {
> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {
> +            found = conn;
> +            reply = false;
> +            break;
> +        }
> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) {
> +            found = conn;
> +            reply = true;
> +            break;
> +        }
> +    }
> +
> +    if (found) {
> +        if (conn_expired(found, now)) {
> +            found = NULL;
> +        } else {
> +            ctx->reply = reply;
> +        }
> +    }

Should this check be within the HMAP_FOR_EACH_WITH_HASH?


Separate question — and maybe something to be followed up on separately
from this series. Today, with the current userspace + kernel
implementation, we can install a flow like:

arp,actions=ct(table=1)

At the OpenFlow layer we'll accept this (although maybe we shouldn't).
When processing an upcall, we'll translate this (though we have enough
information to at least complain, if not more).
Then, when installing/executing into the kernel datapath, we'll reject
the command while parsing the action.

I'm considering that maybe we should either a) Restrict this all the
way up at OpenFlow, or b) loosen the restriction in kernel datapath
and allow such flows (but mark the traffic as invalid early in
conntrack processing and skip to the next action).

So, to my question: How does the userspace handle a case like this?
Fischetti, Antonio April 25, 2016, 4:14 p.m. UTC | #2
Hi Daniele, 
some comments inline prefixed with [Antonio F].

Regards,
Antonio

> -----Original Message-----

> From: dev [mailto:dev-bounces@openvswitch.org] On Behalf Of Daniele Di

> Proietto

> Sent: Saturday, April 16, 2016 1:03 AM

> To: dev@openvswitch.org

> Subject: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace connection

> tracker.

> 

> This commit adds the conntrack module.

> 

> It is a connection tracker that resides entirely in userspace.  Its

> primary user will be the dpif-netdev datapath.

> 

> The module main goal is to provide conntrack_execute(), which offers a

> convenient interface to implement the datapath ct() action.

> 

> The conntrack module uses two submodules to deal with the l4 protocol

> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

> 

> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf

> subsystem, therefore it's BSD licensed.  It has been slightly altered to

> match the OVS coding style and to allow the pickup of already

> established connections.

> 

> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

> ---

>  COPYING                 |   1 +

>  debian/copyright.in     |   4 +

>  lib/automake.mk         |   5 +

>  lib/conntrack-other.c   |  91 ++++++

>  lib/conntrack-private.h |  77 +++++

>  lib/conntrack-tcp.c     | 476 +++++++++++++++++++++++++++

>  lib/conntrack.c         | 851

> ++++++++++++++++++++++++++++++++++++++++++++++++

>  lib/conntrack.h         | 144 ++++++++

>  8 files changed, 1649 insertions(+)

>  create mode 100644 lib/conntrack-other.c

>  create mode 100644 lib/conntrack-private.h

>  create mode 100644 lib/conntrack-tcp.c

>  create mode 100644 lib/conntrack.c

>  create mode 100644 lib/conntrack.h

> 

> diff --git a/COPYING b/COPYING

> index 308e3ea..afb98b9 100644

> --- a/COPYING

> +++ b/COPYING

> @@ -25,6 +25,7 @@ License, version 2.

>  The following files are licensed under the 2-clause BSD license.

>      include/windows/getopt.h

>      lib/getopt_long.c

> +    lib/conntrack-tcp.c

> 

>  The following files are licensed under the 3-clause BSD-license

>      include/windows/netinet/icmp6.h

> diff --git a/debian/copyright.in b/debian/copyright.in

> index 57d007a..a15f4dd 100644

> --- a/debian/copyright.in

> +++ b/debian/copyright.in

> @@ -21,6 +21,9 @@ Upstream Copyright Holders:

>  	Copyright (c) 2014 Michael Chapman

>  	Copyright (c) 2014 WindRiver, Inc.

>  	Copyright (c) 2014 Avaya, Inc.

> +	Copyright (c) 2001 Daniel Hartmeier

> +	Copyright (c) 2002 - 2008 Henning Brauer

> +	Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

> 

>  License:

> 

> @@ -90,6 +93,7 @@ License:

>  	lib/getopt_long.c

>  	include/windows/getopt.h

>  	datapath-windows/ovsext/Conntrack-tcp.c

> +	lib/conntrack-tcp.c

> 

>  * The following files are licensed under the 3-clause BSD-license

> 

> diff --git a/lib/automake.mk b/lib/automake.mk

> index 1ec2115..ba30442 100644

> --- a/lib/automake.mk

> +++ b/lib/automake.mk

> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \

>  	lib/compiler.h \

>  	lib/connectivity.c \

>  	lib/connectivity.h \

> +	lib/conntrack-private.h \

> +	lib/conntrack-tcp.c \

> +	lib/conntrack-other.c \

> +	lib/conntrack.c \

> +	lib/conntrack.h \

>  	lib/coverage.c \

>  	lib/coverage.h \

>  	lib/crc32c.c \

> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c

> new file mode 100644

> index 0000000..65d02a9

> --- /dev/null

> +++ b/lib/conntrack-other.c

> @@ -0,0 +1,91 @@

> +/*

> + * Copyright (c) 2015, 2016 Nicira, Inc.

> + *

> + * Licensed under the Apache License, Version 2.0 (the "License");

> + * you may not use this file except in compliance with the License.

> + * You may obtain a copy of the License at:

> + *

> + *     http://www.apache.org/licenses/LICENSE-2.0

> + *

> + * Unless required by applicable law or agreed to in writing, software

> + * distributed under the License is distributed on an "AS IS" BASIS,

> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

> implied.

> + * See the License for the specific language governing permissions and

> + * limitations under the License.

> + */

> +

> +#include <config.h>

> +

> +#include "conntrack-private.h"

> +#include "dp-packet.h"

> +

> +enum other_state {

> +    OTHERS_FIRST,

> +    OTHERS_MULTIPLE,

> +    OTHERS_BIDIR,

> +};

> +

> +struct conn_other {

> +    struct conn up;

> +    enum other_state state;

> +};

> +

> +static const long long other_timeouts[] = {

> +    [OTHERS_FIRST] = 60 * 1000,

> +    [OTHERS_MULTIPLE] = 60 * 1000,

> +    [OTHERS_BIDIR] = 30 * 1000,

> +};

> +

> +static struct conn_other *

> +conn_other_cast(const struct conn *conn)

> +{

> +    return CONTAINER_OF(conn, struct conn_other, up);

> +}

> +

> +static void

> +update_expiration(struct conn_other *conn, long long now)

> +{

> +    conn->up.expiration = now + other_timeouts[conn->state];

> +}

> +

> +static enum ct_update_res

> +other_conn_update(struct conn *conn_, struct dp_packet *pkt

> OVS_UNUSED,

> +                bool reply, long long now)

> +{

> +    struct conn_other *conn = conn_other_cast(conn_);

> +

> +    if (reply && conn->state != OTHERS_BIDIR) {

> +        conn->state = OTHERS_BIDIR;

> +    } else if (conn->state == OTHERS_FIRST) {

> +        conn->state = OTHERS_MULTIPLE;

> +    }

> +

> +    update_expiration(conn, now);

> +

> +    return CT_UPDATE_VALID;

> +}

> +

> +static bool

> +other_valid_new(struct dp_packet *pkt OVS_UNUSED)

> +{

> +    return true;

> +}

> +

> +static struct conn *

> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)

> +{

> +    struct conn_other *conn;

> +

> +    conn = xzalloc(sizeof(struct conn_other));

> +    conn->state = OTHERS_FIRST;

> +

> +    update_expiration(conn, now);

> +

> +    return &conn->up;

> +}

> +

> +struct ct_l4_proto ct_proto_other = {

> +    .new_conn = other_new_conn,

> +    .valid_new = other_valid_new,

> +    .conn_update = other_conn_update,

> +};

> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h

> new file mode 100644

> index 0000000..e668c44

> --- /dev/null

> +++ b/lib/conntrack-private.h

> @@ -0,0 +1,77 @@

> +/*

> + * Copyright (c) 2015, 2016 Nicira, Inc.

> + *

> + * Licensed under the Apache License, Version 2.0 (the "License");

> + * you may not use this file except in compliance with the License.

> + * You may obtain a copy of the License at:

> + *

> + *     http://www.apache.org/licenses/LICENSE-2.0

> + *

> + * Unless required by applicable law or agreed to in writing, software

> + * distributed under the License is distributed on an "AS IS" BASIS,

> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

> implied.

> + * See the License for the specific language governing permissions and

> + * limitations under the License.

> + */

> +

> +#ifndef CONNTRACK_PRIVATE_H

> +#define CONNTRACK_PRIVATE_H 1

> +

> +#include <sys/types.h>

> +#include <netinet/in.h>

> +#include <netinet/ip6.h>

> +

> +#include "hmap.h"

> +#include "openvswitch/types.h"

> +#include "packets.h"

> +#include "unaligned.h"

> +

> +struct ct_addr {

> +    union {

> +        ovs_16aligned_be32 ipv4;

> +        union ovs_16aligned_in6_addr ipv6;

> +        ovs_be32 ipv4_aligned;

> +        struct in6_addr ipv6_aligned;

> +    };

> +};

> +

> +struct ct_endpoint {

> +    struct ct_addr addr;

> +    ovs_be16 port;

> +};

> +

> +struct conn_key {

> +    struct ct_endpoint src;

> +    struct ct_endpoint dst;

> +

> +    ovs_be16 dl_type;

> +    uint8_t nw_proto;

> +    uint16_t zone;

> +};

> +

> +struct conn {

> +    struct conn_key key;

> +    struct conn_key rev_key;

> +    long long expiration;

> +    struct hmap_node node;

> +    uint32_t mark;

> +    ovs_u128 label;

> +};

> +

> +enum ct_update_res {

> +    CT_UPDATE_INVALID,

> +    CT_UPDATE_VALID,

> +    CT_UPDATE_NEW,

> +};

> +

> +struct ct_l4_proto {

> +    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);

> +    bool (*valid_new)(struct dp_packet *pkt);

> +    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet

> *pkt,

> +                                      bool reply, long long now);

> +};

> +

> +extern struct ct_l4_proto ct_proto_tcp;

> +extern struct ct_l4_proto ct_proto_other;

> +

> +#endif /* conntrack-private.h */

> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c

> new file mode 100644

> index 0000000..4d80038

> --- /dev/null

> +++ b/lib/conntrack-tcp.c

> @@ -0,0 +1,476 @@

> +/*-

> + * Copyright (c) 2001 Daniel Hartmeier

> + * Copyright (c) 2002 - 2008 Henning Brauer

> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

> + * Copyright (c) 2015, 2016 Nicira, Inc.

> + * All rights reserved.

> + *

> + * Redistribution and use in source and binary forms, with or without

> + * modification, are permitted provided that the following conditions

> + * are met:

> + *

> + *    - Redistributions of source code must retain the above copyright

> + *      notice, this list of conditions and the following disclaimer.

> + *    - Redistributions in binary form must reproduce the above

> + *      copyright notice, this list of conditions and the following

> + *      disclaimer in the documentation and/or other materials provided

> + *      with the distribution.

> + *

> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND

> CONTRIBUTORS

> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

> FITNESS

> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,

> INDIRECT,

> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

> (INCLUDING,

> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR

> SERVICES;

> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

> STRICT

> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

> + * POSSIBILITY OF SUCH DAMAGE.

> + *

> + * Effort sponsored in part by the Defense Advanced Research Projects

> + * Agency (DARPA) and Air Force Research Laboratory, Air Force

> + * Materiel Command, USAF, under agreement number F30602-01-2-0537.

> + *

> + *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $

> + */

> +

> +#include <config.h>

> +

> +#include "conntrack-private.h"

> +#include "ct-dpif.h"

> +#include "dp-packet.h"

> +#include "util.h"

> +

> +struct tcp_peer {

> +    enum ct_dpif_tcp_state state;

> +    uint32_t               seqlo;          /* Max sequence number sent     */

> +    uint32_t               seqhi;          /* Max the other end ACKd + win */

> +    uint16_t               max_win;        /* largest window (pre scaling) */

> +    uint8_t                wscale;         /* window scaling factor        */

> +};

> +

> +struct conn_tcp {

> +    struct conn up;

> +    struct tcp_peer peer[2];

> +};

> +

> +enum {

> +    TCPOPT_EOL,

> +    TCPOPT_NOP,

> +    TCPOPT_WINDOW = 3,

> +};

> +

> +/* TCP sequence numbers are 32 bit integers operated

> + * on with modular arithmetic.  These macros can be

> + * used to compare such integers. */

> +#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)

> +#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)

> +#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)

> +#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)

> +

> +#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))

> +#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))

> +

> +static struct conn_tcp*

> +conn_tcp_cast(const struct conn* conn)

> +{

> +    return CONTAINER_OF(conn, struct conn_tcp, up);

> +}

> +

> +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub

> + * is enabled.  We're not scrubbing, but this check seems reasonable.  */

> +static bool

> +tcp_invalid_flags(uint16_t flags)

> +{

> +

> +    if (flags & TCP_SYN) {

> +        if (flags & TCP_RST) {

> +            return true;

> +        }

> +        if (flags & TCP_FIN) {

> +            /* Here pf removes the fin flag.  We simply mark the packet as

> +             * invalid */

> +            return true;

> +        }

> +    } else {

> +        /* Illegal packet */

> +        if (!(flags & (TCP_ACK|TCP_RST))) {

> +            return true;

> +        }

> +    }

> +

> +    if (!(flags & TCP_ACK)) {

> +        /* These flags are only valid if ACK is set */

> +        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {

> +            return true;

> +        }

> +    }

> +

> +    return false;

> +}

> +

> +#define TCP_MAX_WSCALE 14

> +#define CT_WSCALE_FLAG 0x80

> +#define CT_WSCALE_UNKNOWN 0x40

> +#define CT_WSCALE_MASK 0xf

> +

> +static uint8_t

> +tcp_get_wscale(const struct tcp_header *tcp)

> +{

> +    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;

> +    const uint8_t *opt = (const uint8_t *)(tcp + 1);

> +    uint8_t wscale = 0;

> +    uint8_t optlen;

> +

> +    while (len >= 3) {

> +        if (*opt == TCPOPT_EOL) {

> +            break;

> +        }

> +        switch (*opt) {

> +        case TCPOPT_NOP:

> +            opt++;

> +            len--;

> +            break;

> +        case TCPOPT_WINDOW:

> +            wscale = MIN(opt[2], TCP_MAX_WSCALE);

> +            wscale |= CT_WSCALE_FLAG;

> +            /* fall through */

> +        default:

> +            optlen = opt[1];

> +            if (optlen < 2) {

> +                optlen = 2;

> +            }

> +            len -= optlen;

> +            opt += optlen;

> +        }

> +    }

> +

> +    return wscale;

> +}

> +

> +static uint32_t

> +tcp_payload_length(struct dp_packet *pkt)

> +{

> +    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)

> +           - (char *) dp_packet_get_tcp_payload(pkt);

> +}

> +

> +static void

> +update_expiration(struct conn_tcp *conn, long long now, long long interval)

> +{

> +    conn->up.expiration = now + interval;

> +}

> +

> +

> +static enum ct_update_res

> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,

> +                long long now)

> +{

> +    struct conn_tcp *conn = conn_tcp_cast(conn_);

> +    struct tcp_header *tcp = dp_packet_l4(pkt);

> +    /* The peer that sent 'pkt' */

> +    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];

> +    /* The peer that should receive 'pkt' */

> +    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];

> +    uint8_t sws = 0, dws = 0;

> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

> +

> +    uint16_t win = ntohs(tcp->tcp_winsz);

> +    uint32_t ack, end, seq, orig_seq;

> +    uint32_t p_len = tcp_payload_length(pkt);

> +    int ackskew;

> +

> +    if (tcp_invalid_flags(tcp_flags)) {

> +        return CT_UPDATE_INVALID;

> +    }

> +

> +    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&

> +            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&

> +            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

> +        src->state = dst->state = CT_DPIF_TCPS_CLOSED;

> +        return CT_UPDATE_NEW;

> +    }

> +

> +    if (src->wscale & CT_WSCALE_FLAG

> +        && dst->wscale & CT_WSCALE_FLAG

> +        && !(tcp_flags & TCP_SYN)) {

> +

> +        sws = src->wscale & CT_WSCALE_MASK;

> +        dws = dst->wscale & CT_WSCALE_MASK;

> +

> +    } else if (src->wscale & CT_WSCALE_UNKNOWN

> +        && dst->wscale & CT_WSCALE_UNKNOWN

> +        && !(tcp_flags & TCP_SYN)) {

> +

> +        sws = TCP_MAX_WSCALE;

> +        dws = TCP_MAX_WSCALE;

> +    }

> +

> +    /*

> +     * Sequence tracking algorithm from Guido van Rooij's paper:

> +     *   http://www.madison-gurkha.com/publications/tcp_filtering/

> +     *      tcp_filtering.ps

> +     */

> +

> +    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));

> +    if (src->state < CT_DPIF_TCPS_SYN_SENT) {

> +        /* First packet from this end. Set its state */

> +

> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

> +

> +        end = seq + p_len;

> +        if (tcp_flags & TCP_SYN) {

> +            end++;

> +            if (dst->wscale & CT_WSCALE_FLAG) {

> +                src->wscale = tcp_get_wscale(tcp);

> +                if (src->wscale & CT_WSCALE_FLAG) {

> +                    /* Remove scale factor from initial window */

> +                    sws = src->wscale & CT_WSCALE_MASK;

> +                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);

> +                    dws = dst->wscale & CT_WSCALE_MASK;

> +                } else {

> +                    /* fixup other window */

> +                    dst->max_win <<= dst->wscale &

> +                        CT_WSCALE_MASK;

> +                    /* in case of a retrans SYN|ACK */

> +                    dst->wscale = 0;

> +                }

> +            }

> +        }

> +        if (tcp_flags & TCP_FIN) {

> +            end++;

> +        }

> +

> +        src->seqlo = seq;

> +        src->state = CT_DPIF_TCPS_SYN_SENT;

> +        /*

> +         * May need to slide the window (seqhi may have been set by

> +         * the crappy stack check or if we picked up the connection

> +         * after establishment)

> +         */

> +        if (src->seqhi == 1 ||

> +                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {

> +            src->seqhi = end + MAX(1, dst->max_win << dws);

> +        }

> +        if (win > src->max_win) {

> +            src->max_win = win;

> +        }

> +

> +    } else {

> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

> +        end = seq + p_len;

> +        if (tcp_flags & TCP_SYN) {

> +            end++;

> +        }

> +        if (tcp_flags & TCP_FIN) {

> +            end++;

> +        }

> +    }

> +

> +    if ((tcp_flags & TCP_ACK) == 0) {

> +        /* Let it pass through the ack skew check */

> +        ack = dst->seqlo;

> +    } else if ((ack == 0

> +                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))

> +               /* broken tcp stacks do not set ack */) {

> +        /* Many stacks (ours included) will set the ACK number in an

> +         * FIN|ACK if the SYN times out -- no sequence to ACK. */

> +        ack = dst->seqlo;

> +    }

> +

> +    if (seq == end) {

> +        /* Ease sequencing restrictions on no data packets */

> +        seq = src->seqlo;

> +        end = seq;

> +    }

> +

> +    ackskew = dst->seqlo - ack;

> +#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge

> factor */

> +    if (SEQ_GEQ(src->seqhi, end)

> +        /* Last octet inside other's window space */

> +        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))

> +        /* Retrans: not more than one window back */

> +        && (ackskew >= -MAXACKWINDOW)

> +        /* Acking not more than one reassembled fragment backwards */

> +        && (ackskew <= (MAXACKWINDOW << sws))

> +        /* Acking not more than one window forward */

> +        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo

> +            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {

> +        /* Require an exact/+1 sequence match on resets when possible */

> +

> +        /* update max window */

> +        if (src->max_win < win) {

> +            src->max_win = win;

> +        }

> +        /* synchronize sequencing */

> +        if (SEQ_GT(end, src->seqlo)) {

> +            src->seqlo = end;

> +        }

> +        /* slide the window of what the other end can send */

> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

> +            dst->seqhi = ack + MAX((win << sws), 1);

> +        }

> +

> +        /* update states */

> +        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {

> +                src->state = CT_DPIF_TCPS_SYN_SENT;

> +        }

> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {

> +                src->state = CT_DPIF_TCPS_CLOSING;

> +        }

> +        if (tcp_flags & TCP_ACK) {

> +            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {

> +                dst->state = CT_DPIF_TCPS_ESTABLISHED;

> +            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {

> +                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;

> +            }

> +        }

> +        if (tcp_flags & TCP_RST) {

> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

> +        }

> +

> +        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2

> +            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

> +            update_expiration(conn, now, 30 * 1000);

> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

> +                   && dst->state >= CT_DPIF_TCPS_CLOSING) {

> +            update_expiration(conn, now, 45 * 1000);

> +        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED

> +                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {

> +            update_expiration(conn, now, 30 * 1000);

> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

> +                   || dst->state >= CT_DPIF_TCPS_CLOSING) {

> +            update_expiration(conn, now, 15 * 60 * 1000);

> +        } else {

> +            update_expiration(conn, now, 24 * 60 * 60 * 1000);

> +        }

> +    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT

> +                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2

> +                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)

> +               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)

> +               /* Within a window forward of the originating packet */

> +               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {

> +               /* Within a window backward of the originating packet */

> +

> +        /*

> +         * This currently handles three situations:

> +         *  1) Stupid stacks will shotgun SYNs before their peer

> +         *     replies.

> +         *  2) When PF catches an already established stream (the

> +         *     firewall rebooted, the state table was flushed, routes

> +         *     changed...)

> +         *  3) Packets get funky immediately after the connection

> +         *     closes (this should catch Solaris spurious ACK|FINs

> +         *     that web servers like to spew after a close)

> +         *

> +         * This must be a little more careful than the above code

> +         * since packet floods will also be caught here. We don't

> +         * update the TTL here to mitigate the damage of a packet

> +         * flood and so the same code can handle awkward establishment

> +         * and a loosened connection close.

> +         * In the establishment case, a correct peer response will

> +         * validate the connection, go through the normal state code

> +         * and keep updating the state TTL.

> +         */

> +

> +        /* update max window */

> +        if (src->max_win < win) {

> +            src->max_win = win;

> +        }

> +        /* synchronize sequencing */

> +        if (SEQ_GT(end, src->seqlo)) {

> +            src->seqlo = end;

> +        }

> +        /* slide the window of what the other end can send */

> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

> +            dst->seqhi = ack + MAX((win << sws), 1);

> +        }

> +

> +        /*

> +         * Cannot set dst->seqhi here since this could be a shotgunned

> +         * SYN and not an already established connection.

> +         */

> +

> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {

> +            src->state = CT_DPIF_TCPS_CLOSING;

> +        }

> +

> +        if (tcp_flags & TCP_RST) {

> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

> +        }

> +    } else {

> +        return CT_UPDATE_INVALID;

> +    }

> +

> +    return CT_UPDATE_VALID;

> +}

> +

> +static bool

> +tcp_valid_new(struct dp_packet *pkt)

> +{

> +    struct tcp_header *tcp = dp_packet_l4(pkt);

> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

> +

> +    if (tcp_invalid_flags(tcp_flags)) {

> +        return false;

> +    }

> +

> +    /* A syn+ack is not allowed to create a connection.  We want to allow

> +     * totally new connections (syn) or already established, not partially

> +     * open (syn+ack). */

> +    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {

> +        return false;

> +    }

> +

> +    return true;

> +}

> +

> +static struct conn *

> +tcp_new_conn(struct dp_packet *pkt, long long now)

> +{

> +    struct conn_tcp* newconn = NULL;

> +    struct tcp_header *tcp = dp_packet_l4(pkt);

> +    struct tcp_peer *src, *dst;

> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

> +

> +    newconn = xzalloc(sizeof(struct conn_tcp));

> +

> +    src = &newconn->peer[0];

> +    dst = &newconn->peer[1];

> +

> +    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));

> +    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;

> +

> +    if (tcp_flags & TCP_SYN) {

> +        src->seqhi++;

> +        src->wscale = tcp_get_wscale(tcp);

> +    } else {

> +        src->wscale = CT_WSCALE_UNKNOWN;

> +        dst->wscale = CT_WSCALE_UNKNOWN;

> +    }

> +    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);

> +    if (src->wscale & CT_WSCALE_MASK) {

> +        /* Remove scale factor from initial window */

> +        uint8_t sws = src->wscale & CT_WSCALE_MASK;

> +        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);

> +    }

> +    if (tcp_flags & TCP_FIN) {

> +        src->seqhi++;

> +    }

> +    dst->seqhi = 1;

> +    dst->max_win = 1;

> +    src->state = CT_DPIF_TCPS_SYN_SENT;

> +    dst->state = CT_DPIF_TCPS_CLOSED;

> +

> +    update_expiration(newconn, now, 30 * 1000);

> +

> +    return &newconn->up;

> +}

> +

> +struct ct_l4_proto ct_proto_tcp = {

> +    .new_conn = tcp_new_conn,

> +    .valid_new = tcp_valid_new,

> +    .conn_update = tcp_conn_update,

> +};

> diff --git a/lib/conntrack.c b/lib/conntrack.c

> new file mode 100644

> index 0000000..840335b

> --- /dev/null

> +++ b/lib/conntrack.c

> @@ -0,0 +1,851 @@

> +/*

> + * Copyright (c) 2015, 2016 Nicira, Inc.

> + *

> + * Licensed under the Apache License, Version 2.0 (the "License");

> + * you may not use this file except in compliance with the License.

> + * You may obtain a copy of the License at:

> + *

> + *     http://www.apache.org/licenses/LICENSE-2.0

> + *

> + * Unless required by applicable law or agreed to in writing, software

> + * distributed under the License is distributed on an "AS IS" BASIS,

> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

> implied.

> + * See the License for the specific language governing permissions and

> + * limitations under the License.

> + */

> +

> +#include <config.h>

> +#include "conntrack.h"

> +

> +#include <errno.h>

> +#include <sys/types.h>

> +#include <netinet/in.h>

> +#include <netinet/icmp6.h>

> +

> +#include "bitmap.h"

> +#include "conntrack-private.h"

> +#include "dp-packet.h"

> +#include "flow.h"

> +#include "hmap.h"

> +#include "netdev.h"

> +#include "odp-netlink.h"

> +#include "openvswitch/vlog.h"

> +#include "ovs-rcu.h"

> +#include "random.h"

> +#include "timeval.h"

> +

> +VLOG_DEFINE_THIS_MODULE(conntrack);

> +

> +struct conn_lookup_ctx {

> +    struct conn_key key;

> +    struct conn *conn;

> +    uint32_t hash;

> +    bool reply;

> +    bool related;

> +};

> +

> +static bool conn_key_extract(struct conntrack *, struct dp_packet *,

> +                             struct conn_lookup_ctx *, uint16_t zone);

> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);

> +static void conn_key_reverse(struct conn_key *);

> +static void conn_key_lookup(struct conntrack *ct,

> +                            struct conn_lookup_ctx *ctx,

> +                            unsigned bucket,

> +                            long long now);

> +static bool valid_new(struct dp_packet *pkt, struct conn_key *);

> +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,

> +                             long long now);

> +static void delete_conn(struct conn *);

> +static enum ct_update_res conn_update(struct conn *, struct dp_packet*,

> +                                      bool reply, long long now);

> +static bool conn_expired(struct conn *, long long now);

> +static void set_mark(struct dp_packet *, struct conn *,

> +                     uint32_t val, uint32_t mask);

> +static void set_label(struct dp_packet *, struct conn *,

> +                      const struct ovs_key_ct_labels *val,

> +                      const struct ovs_key_ct_labels *mask);

> +

> +static struct ct_l4_proto *l4_protos[] = {

> +    [IPPROTO_TCP] = &ct_proto_tcp,

> +    [IPPROTO_UDP] = &ct_proto_other,

> +    [IPPROTO_ICMP] = &ct_proto_other,

> +};

> +

> +/* Initializes the connection tracker 'ct'.  The caller is responbile for

> + * calling 'conntrack_destroy()', when the instance is not needed anymore */

> +void

> +conntrack_init(struct conntrack *ct)

> +{

> +    unsigned i;

> +

> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

> +        ct_lock_init(&ct->locks[i]);

> +        ct_lock_lock(&ct->locks[i]);

> +        hmap_init(&ct->connections[i]);

> +        ct_lock_unlock(&ct->locks[i]);

> +    }

> +    ct->hash_basis = random_uint32();

> +    ct->purge_bucket = 0;

> +    ct->purge_inner_bucket = 0;

> +    ct->purge_inner_offset = 0;

> +}

> +

> +/* Destroys the connection tracker 'ct' and frees all the allocated memory. */

> +void

> +conntrack_destroy(struct conntrack *ct)

> +{

> +    unsigned i;

> +

> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

> +        struct conn *conn, *next;

> +

> +        ct_lock_lock(&ct->locks[i]);

> +        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) {

> +            hmap_remove(&ct->connections[i], &conn->node);

> +            delete_conn(conn);

> +        }

> +        hmap_destroy(&ct->connections[i]);

> +        ct_lock_unlock(&ct->locks[i]);

> +        ct_lock_destroy(&ct->locks[i]);

> +    }

> +}

> +


> +static unsigned hash_to_bucket(uint32_t hash)

> +{

> +    /* Extracts the most significant bits in hash. The least significant bits

> +     * are already used internally by the hmap implementation. */

> +    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 &&

> CONNTRACK_BUCKETS_SHIFT >= 1);

> +

> +    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) %

> CONNTRACK_BUCKETS;

> +}

> +

> +static void

> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,

> +            uint32_t mark, ovs_u128 label)

> +{

> +    pkt->md.ct_state = state | CS_TRACKED;

> +    pkt->md.ct_zone = zone;

> +    pkt->md.ct_mark = mark;

> +    pkt->md.ct_label = label;

> +}

> +

> +static struct conn *

> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,

> +               long long now)

> +{

> +    unsigned bucket = hash_to_bucket(ctx->hash);

> +    struct conn *nc = NULL;

> +

> +    if (!valid_new(pkt, &ctx->key)) {

> +        *state |= CS_INVALID;

> +        return nc;

> +    }

> +

> +    *state |= CS_NEW;

> +

> +    if (commit) {

> +        nc = new_conn(pkt, &ctx->key, now);

> +

> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

> +

> +        conn_key_reverse(&nc->rev_key);

> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);

> +    }

> +

> +    return nc;

> +}

> +

> +static struct conn *

> +process_one(struct conntrack *ct, struct dp_packet *pkt,

> +            struct conn_lookup_ctx *ctx, uint16_t zone,

> +            bool commit, long long now)

> +{

> +    unsigned bucket = hash_to_bucket(ctx->hash);

> +    struct conn *conn = ctx->conn;

> +    uint8_t state = 0;

> +

> +    if (conn) {

> +        if (ctx->related) {

> +            state |= CS_RELATED;

> +            if (ctx->reply) {

> +                state |= CS_REPLY_DIR;

> +            }

> +        } else {

> +            enum ct_update_res res;

> +

> +            res = conn_update(conn, pkt, ctx->reply, now);

> +

> +            switch (res) {

> +            case CT_UPDATE_VALID:

> +                state |= CS_ESTABLISHED;

> +                if (ctx->reply) {

> +                    state |= CS_REPLY_DIR;

> +                }

> +                break;

> +            case CT_UPDATE_INVALID:

> +                state |= CS_INVALID;

> +                break;

> +            case CT_UPDATE_NEW:

> +                hmap_remove(&ct->connections[bucket], &conn->node);

> +                delete_conn(conn);

> +                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

> +                break;


[Antonio F]
I see that conn_update() can currently only return
CT_UPDATE_VALID/INVALID/NEW, but should we still add a 'default' case
here, to be safe against future additions to the enum?  E.g.:
default:
    state |= CS_INVALID;
    break;


> +            }

> +        }

> +

> +        pkt->md.ct_label = conn->label;

> +        pkt->md.ct_mark = conn->mark;


[Antonio F]
Can we drop the previous two lines?  They appear redundant, since the
write_ct_md() call just below performs the same assignments.


> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);

> +    } else {

> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});

> +    }

> +

> +    return conn;

> +}

> +

> +/* Sends a group of 'cnt' packets ('pkts') through the connection tracker

> + * 'ct'.  If 'commit' is true, the packets are allowed to create new entries

> + * in the connection tables.  'setmark', if not NULL, should point to a two

> + * elements array containing a value and a mask to set the connection mark.

> + * 'setlabel' behaves similarly for the connection label.*/

> +int

> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt,

> +                  bool commit, uint16_t zone, const uint32_t *setmark,

> +                  const struct ovs_key_ct_labels *setlabel,

> +                  const char *helper)

> +{

> +#if !defined(__CHECKER__) && !defined(_WIN32)

> +    const size_t KEY_ARRAY_SIZE = cnt;

> +#else

> +    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };

> +#endif

> +    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];

> +    int8_t bucket_list[CONNTRACK_BUCKETS];

> +    struct {

> +        unsigned bucket;

> +        unsigned long maps;

> +    } arr[KEY_ARRAY_SIZE];

> +    long long now = time_msec();

> +    size_t i = 0;

> +    uint8_t arrcnt = 0;

> +

> +    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >=

> NETDEV_MAX_BURST);

> +

> +    if (helper) {

> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

> +

> +        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);

> +        /* Continue without the helper */

> +    }

> +

> +    memset(bucket_list, INT8_C(-1), sizeof bucket_list);

> +    for (i = 0; i < cnt; i++) {

> +        unsigned bucket;

> +

> +        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {

> +            write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}});

> +            continue;

> +        }

> +

> +        bucket = hash_to_bucket(ctxs[i].hash);

> +        if (bucket_list[bucket] == INT8_C(-1)) {

> +            bucket_list[bucket] = arrcnt;

> +

> +            arr[arrcnt].maps = 0;

> +            ULLONG_SET1(arr[arrcnt].maps, i);

> +            arr[arrcnt++].bucket = bucket;

> +        } else {

> +            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);

> +            arr[bucket_list[bucket]].maps |= 1UL << i;

> +        }

> +    }

> +

> +    for (i = 0; i < arrcnt; i++) {

> +        size_t j;

> +

> +        ct_lock_lock(&ct->locks[arr[i].bucket]);

> +

> +        ULLONG_FOR_EACH_1(j, arr[i].maps) {

> +            struct conn *conn;

> +

> +            conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now);

> +

> +            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);

> +

> +            if (conn && setmark) {

> +                set_mark(pkts[j], conn, setmark[0], setmark[1]);

> +            }

> +

> +            if (conn && setlabel) {

> +                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);

> +            }

> +        }

> +        ct_lock_unlock(&ct->locks[arr[i].bucket]);

> +    }

> +

> +    return 0;

> +}

> +

> +static void

> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t

> mask)

> +{

> +    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));

> +    conn->mark = pkt->md.ct_mark;

> +}

> +

> +static void

> +set_label(struct dp_packet *pkt, struct conn *conn,

> +          const struct ovs_key_ct_labels *val,

> +          const struct ovs_key_ct_labels *mask)

> +{

> +    ovs_u128 v, m;

> +

> +    memcpy(&v, val, sizeof v);

> +    memcpy(&m, mask, sizeof m);

> +

> +    pkt->md.ct_label.u64.lo = v.u64.lo

> +                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));

> +    pkt->md.ct_label.u64.hi = v.u64.hi

> +                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));

> +    conn->label = pkt->md.ct_label;

> +}

> +


> +#define CONNTRACK_PURGE_NUM 256

> +

> +static void

> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,

> +             uint32_t *inner_offset, unsigned *left, long long now)

> +{

> +    while (*left != 0) {

> +        struct hmap_node *node;

> +        struct conn *conn;

> +

> +        node = hmap_at_position(bucket, inner_bucket, inner_offset);

> +

> +        if (!node) {

> +            hmap_shrink(bucket);

> +            break;

> +        }

> +

> +        INIT_CONTAINER(conn, node, node);

> +        if (conn_expired(conn, now)) {

> +            hmap_remove(bucket, &conn->node);

> +            delete_conn(conn);

> +            (*left)--;

> +        }

> +    }

> +}

> +

> +/* Cleans up old connection entries.  Should be called periodically. */

> +void

> +conntrack_run(struct conntrack *ct)

> +{

> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);

> +    uint32_t inner_bucket = ct->purge_inner_bucket,

> +             inner_offset = ct->purge_inner_offset;

> +    unsigned left = CONNTRACK_PURGE_NUM;

> +    long long now = time_msec();

> +

> +    while (bucket < CONNTRACK_BUCKETS) {

> +        ct_lock_lock(&ct->locks[bucket]);

> +        sweep_bucket(&ct->connections[bucket],

> +                     &inner_bucket, &inner_offset,

> +                     &left, now);

> +        ct_lock_unlock(&ct->locks[bucket]);

> +

> +        if (left == 0) {

> +            break;

> +        } else {

> +            bucket++;

> +        }

> +    }

> +

> +    ct->purge_bucket = bucket;

> +    ct->purge_inner_bucket = inner_bucket;

> +    ct->purge_inner_offset = inner_offset;

> +}

> +


> +/* Key extraction */

> +

> +/* The function stores a pointer to the first byte after the header in

> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is

> + * not interested in the header's tail,  meaning that the header has

> + * already been parsed (e.g. by flow_extract): we take this as a hint to

> + * save a few checks. */

> +static inline bool

> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,

> +                const char **new_data)

> +{

> +    const struct ip_header *ip = data;

> +

> +    if (new_data) {

> +        size_t ip_len;

> +

> +        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {

> +            return false;

> +        }

> +        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

> +

> +        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {

> +            return false;

> +        }

> +        if (OVS_UNLIKELY(size < ip_len)) {

> +            return false;

> +        }

> +

> +        *new_data = (char *) data + ip_len;

> +    }

> +

> +    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {

> +        return false;

> +    }

> +

> +    key->src.addr.ipv4 = ip->ip_src;

> +    key->dst.addr.ipv4 = ip->ip_dst;

> +    key->nw_proto = ip->ip_proto;

> +

> +    return true;

> +}

> +

> +/* The function stores a pointer to the first byte after the header in

> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is

> + * not interested in the header's tail,  meaning that the header has

> + * already been parsed (e.g. by flow_extract): we take this as a hint to

> + * save a few checks. */

> +static inline bool

> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,

> +                const char **new_data)

> +{

> +    const struct ovs_16aligned_ip6_hdr *ip6 = data;

> +    uint8_t nw_proto = ip6->ip6_nxt;

> +    uint8_t nw_frag = 0;

> +

> +    if (new_data) {

> +        if (OVS_UNLIKELY(size < sizeof *ip6)) {

> +            return false;

> +        }

> +    }

> +

> +    data = ip6 + 1;

> +    size -=  sizeof *ip6;

> +

> +    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {

> +        return false;

> +    }

> +

> +    if (new_data) {

> +        *new_data = data;

> +    }

> +

> +    if (nw_frag) {

> +        return false;

> +    }

> +

> +    key->src.addr.ipv6 = ip6->ip6_src;

> +    key->dst.addr.ipv6 = ip6->ip6_dst;

> +    key->nw_proto = nw_proto;

> +

> +    return true;

> +}

> +

> +static inline bool

> +check_l4_tcp(const void *data, size_t size)

> +{

> +    const struct tcp_header *tcp = data;

> +    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

> +

> +    if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) {

> +        return true;

> +    }

> +

> +    return false;

> +}

> +

> +static inline bool

> +extract_l4_tcp(struct conn_key *key, const void *data, size_t size)

> +{

> +    const struct tcp_header *tcp = data;

> +

> +    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {

> +        return false;

> +    }

> +

> +    key->src.port = tcp->tcp_src;

> +    key->dst.port = tcp->tcp_dst;

> +

> +    return true;

> +}

> +

> +static inline bool

> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)

> +{

> +    const struct udp_header *udp = data;

> +

> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {

> +        return false;

> +    }

> +

> +    key->src.port = udp->udp_src;

> +    key->dst.port = udp->udp_dst;

> +

> +    return true;

> +}

> +

> +static inline bool extract_l4(struct conn_key *key, const void *data,

> +                              size_t size, bool *related);

> +

> +/* If 'related' is not NULL and the function is processing an ICMP

> + * error packet, extract the l3 and l4 fields from the nested header

> + * instead and set *related to true.  If 'related' is NULL we're

> + * already processing a nested header and no such recursion is

> + * possible */

> +static inline int

> +extract_l4_icmp(struct conn_key *key, const void *data, size_t size,

> +                bool *related)

> +{

> +    const struct icmp_header *icmp = data;

> +

> +    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {

> +        return false;

> +    }

> +

> +    switch (icmp->icmp_type) {

> +    case ICMP4_ECHO_REQUEST:

> +    case ICMP4_ECHO_REPLY:

> +    case ICMP4_TIMESTAMP:

> +    case ICMP4_TIMESTAMPREPLY:

> +    case ICMP4_INFOREQUEST:

> +    case ICMP4_INFOREPLY:

> +        /* Separate ICMP connection: identified using id */

> +        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;

> +        break;

> +    case ICMP4_DST_UNREACH:

> +    case ICMP4_TIME_EXCEEDED:

> +    case ICMP4_PARAM_PROB:

> +    case ICMP4_SOURCEQUENCH:

> +    case ICMP4_REDIRECT: {

> +        /* ICMP packet part of another connection. We should

> +         * extract the key from embedded packet header */

> +        struct conn_key inner_key;

> +        const char *l3 = (const char *) (icmp + 1);

> +        const char *tail = (const char *) data + size;

> +        const char *l4;

> +        bool ok;

> +

> +        if (!related) {

> +            return false;

> +        }

> +        *related = true;

> +

> +        memset(&inner_key, 0, sizeof inner_key);

> +        inner_key.dl_type = htons(ETH_TYPE_IP);

> +        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);

> +        if (!ok) {

> +            return false;

> +        }

> +

> +        /* pf doesn't do this, but it seems a good idea */

> +        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned

> +            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {

> +            return false;

> +        }

> +

> +        key->src = inner_key.src;

> +        key->dst = inner_key.dst;

> +        key->nw_proto = inner_key.nw_proto;

> +

> +        ok = extract_l4(key, l4, tail - l4, NULL);

> +        if (ok) {

> +            conn_key_reverse(key);

> +        }

> +        return ok;

> +    }

> +    default:

> +        return false;

> +    }

> +

> +    return true;

> +}

> +

> +/* If 'related' is not NULL and the function is processing an ICMP

> + * error packet, extract the l3 and l4 fields from the nested header

> + * instead and set *related to true.  If 'related' is NULL we're

> + * already processing a nested header and no such recursion is

> + * possible */

> +static inline bool

> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

> +                 bool *related)

> +{

> +    const struct icmp6_header *icmp6 = data;

> +

> +    /* All the messages that we support need at least 4 bytes after

> +     * the header */

> +    if (size < sizeof *icmp6 + 4) {

> +        return false;

> +    }

> +

> +    switch (icmp6->icmp6_type) {

> +    case ICMP6_ECHO_REQUEST:

> +    case ICMP6_ECHO_REPLY:

> +        /* Separate ICMP connection: identified using id */

> +        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);

> +        break;

> +    case ICMP6_DST_UNREACH:

> +    case ICMP6_PACKET_TOO_BIG:

> +    case ICMP6_TIME_EXCEEDED:

> +    case ICMP6_PARAM_PROB:{

> +        /* ICMP packet part of another connection. We should

> +         * extract the key from embedded packet header */

> +        struct conn_key inner_key;

> +        const char *l3 = (const char *) icmp6 + 8;

> +        const char *tail = (const char *) data + size;

> +        const char *l4 = NULL;

> +        bool ok;

> +

> +        if (!related) {

> +            return false;

> +        }

> +        *related = true;

> +

> +        memset(&inner_key, 0, sizeof inner_key);

> +        inner_key.dl_type = htons(ETH_TYPE_IPV6);

> +        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);

> +        if (!ok) {

> +            return false;

> +        }

> +

> +        /* pf doesn't do this, but it seems a good idea */

> +        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,

> +                              &key->dst.addr.ipv6_aligned)

> +            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,

> +                                 &key->src.addr.ipv6_aligned)) {

> +            return false;

> +        }

> +

> +        key->src = inner_key.src;

> +        key->dst = inner_key.dst;

> +        key->nw_proto = inner_key.nw_proto;

> +

> +        ok = extract_l4(key, l4, tail - l4, NULL);

> +        if (ok) {

> +            conn_key_reverse(key);

> +        }

> +        return ok;

> +    }

> +    default:

> +        return false;

> +    }

> +

> +    return true;

> +}

> +

> +/* Extract l4 fields into 'key', which must already contain valid l3

> + * members.  If 'related' is not NULL and an ICMP error packet is being

> + * processed, the function will extract the key from the packet nested

> + * in the ICMP paylod and set '*related' to true.  If 'related' is NULL,

> + * nested parsing isn't allowed.  This is necessary to limit the

> + * recursion level. */

> +static inline bool

> +extract_l4(struct conn_key *key, const void *data, size_t size, bool *related)

> +{

> +    if (key->nw_proto == IPPROTO_TCP) {

> +        return extract_l4_tcp(key, data, size)

> +               && (!related || check_l4_tcp(data, size));

> +    } else if (key->nw_proto == IPPROTO_UDP) {

> +        return extract_l4_udp(key, data, size);

> +    } else if (key->dl_type == htons(ETH_TYPE_IP)

> +               && key->nw_proto == IPPROTO_ICMP) {

> +        return extract_l4_icmp(key, data, size, related);

> +    } else if (key->dl_type == htons(ETH_TYPE_IPV6)

> +               && key->nw_proto == IPPROTO_ICMPV6) {

> +        return extract_l4_icmp6(key, data, size, related);

> +    } else {

> +        return false;

> +    }

> +}

> +

> +static bool

> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,

> +                 struct conn_lookup_ctx *ctx, uint16_t zone)

> +{

> +    const struct eth_header *l2 = dp_packet_l2(pkt);

> +    const struct ip_header *l3 = dp_packet_l3(pkt);

> +    const char *l4 = dp_packet_l4(pkt);

> +    const char *tail = dp_packet_tail(pkt);

> +    bool ok;

> +

> +    memset(ctx, 0, sizeof *ctx);

> +

> +    if (!l2 || !l3 || !l4) {

> +        return false;

> +    }

> +

> +    ctx->key.zone = zone;

> +

> +    /* XXX In this function we parse the packet (again, it has already

> +     * gone through miniflow_extract()) for two reasons:

> +     *

> +     * 1) To extract the l3 addresses and l4 ports.

> +     *    We already have the l3 and l4 headers' pointers.  Extracting

> +     *    the l3 addresses and the l4 ports is really cheap, since they

> +     *    can be found at fixed locations.

> +     * 2) To extract the l3 and l4 types.

> +     *    Extracting the l3 and l4 types (especially the l3[1]) on the

> +     *    other hand is quite expensive, because they're not at a

> +     *    fixed location.

> +     *

> +     * Here's a way to avoid (2) with the help of the datapath.

> +     * The datapath doesn't keep the packet's extracted flow[2], so

> +     * using that is not an option.  We could use the packet's matching

> +     * megaflow, but we have to make sure that the l3 and l4 types

> +     * are unwildcarded.  This means either:

> +     *

> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow

> +     *    is installed if the actions contains ct().  This is what the

> +     *    kernel datapath does.  It is not so straightforward, though.

> +     *

> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when

> +     *    translating a ct() action.  This is already done in different

> +     *    actions and since both the userspace and the kernel datapath

> +     *    would benefit from it, it seems an appropriate place to do

> +     *    it.

> +     *

> +     * ---

> +     * [1] A simple benchmark (running only the connection tracker

> +     *     over and over on the same packets) shows that if the

> +     *     l3 type is already provided we are 15% faster (running the

> +     *     connection tracker over a couple of DPDK devices with a

> +     *     stream of UDP 64-bytes packets shows that we are 4% faster).

> +     *

> +     * [2] The reasons for this are that keeping the flow increases

> +     *     (slightly) the cache footprint and increases computation

> +     *     time as we move the packet around. Most importantly, the flow

> +     *     should be updated by the actions and this can be slow, as

> +     *     we use a sparse representation (miniflow).

> +     *

> +     */

> +    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);

> +    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {


[Antonio F] In case we specify a flow with VLAN + CT like
add_flow brX table=0,in_port=1,vlan_tci=0xf123,tcp,ct_state=-trk,action=ct(commit,zone=9),2
should we also consider ETH_TYPE_VLAN in the previous conditional statement?


> +        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL);

> +    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {

> +        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);

> +    } else {

> +        ok = false;

> +    }

> +

> +    if (ok) {

> +        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) {

> +            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);

> +            return true;

> +        }

> +    }

> +

> +    return false;

> +}

> +


> +/* Symmetric */

> +static uint32_t

> +conn_key_hash(const struct conn_key *key, uint32_t basis)

> +{

> +    uint32_t hsrc, hdst, hash;

> +    int i;

> +

> +    hsrc = hdst = basis;

> +

> +    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {

> +        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);

> +        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);

> +    }

> +

> +    hash = hsrc ^ hdst;

> +

> +    hash = hash_words((uint32_t *) &key->dst + 1,

> +                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),

> +                      hash);


[Antonio F]
Is that to involve dl_type, nw_proto and zone in the hash computation?


> +

> +    return hash;

> +}

> +

> +static void

> +conn_key_reverse(struct conn_key *key)

> +{

> +    struct ct_endpoint tmp;

> +    tmp = key->src;

> +    key->src = key->dst;

> +    key->dst = tmp;

> +}

> +

> +static void

> +conn_key_lookup(struct conntrack *ct,

> +                struct conn_lookup_ctx *ctx,

> +                unsigned bucket,

> +                long long now)

> +{

> +    struct conn *conn, *found = NULL;

> +    uint32_t hash = ctx->hash;

> +    bool reply;

> +

> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct->connections[bucket]) {

> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {

> +            found = conn;

> +            reply = false;

> +            break;

> +        }

> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) {

> +            found = conn;

> +            reply = true;

> +            break;

> +        }

> +    }

> +

> +    if (found) {

> +        if (conn_expired(found, now)) {

> +            found = NULL;

> +        } else {

> +            ctx->reply = reply;

> +        }

> +    }

> +

> +    ctx->conn = found;

> +}

> +

> +static enum ct_update_res

> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,

> +            long long now)

> +{

> +    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);

> +}

> +

> +static bool

> +conn_expired(struct conn *conn, long long now)

> +{

> +    return now >= conn->expiration;

> +}

> +

> +static bool

> +valid_new(struct dp_packet *pkt, struct conn_key *key)

> +{

> +    return l4_protos[key->nw_proto]->valid_new(pkt);

> +}

> +

> +static struct conn *

> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)

> +{

> +    struct conn *newconn;

> +

> +    newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);

> +

> +    if (newconn) {

> +        newconn->key = *key;

> +    }

> +

> +    return newconn;

> +}

> +

> +static void

> +delete_conn(struct conn *conn)

> +{

> +    free(conn);

> +}

> diff --git a/lib/conntrack.h b/lib/conntrack.h

> new file mode 100644

> index 0000000..8561273

> --- /dev/null

> +++ b/lib/conntrack.h

> @@ -0,0 +1,144 @@

> +/*

> + * Copyright (c) 2015, 2016 Nicira, Inc.

> + *

> + * Licensed under the Apache License, Version 2.0 (the "License");

> + * you may not use this file except in compliance with the License.

> + * You may obtain a copy of the License at:

> + *

> + *     http://www.apache.org/licenses/LICENSE-2.0

> + *

> + * Unless required by applicable law or agreed to in writing, software

> + * distributed under the License is distributed on an "AS IS" BASIS,

> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

> + * See the License for the specific language governing permissions and

> + * limitations under the License.

> + */

> +

> +#ifndef CONNTRACK_H

> +#define CONNTRACK_H 1

> +

> +#include <stdbool.h>

> +

> +#include "hmap.h"

> +#include "netdev-dpdk.h"

> +#include "odp-netlink.h"

> +#include "openvswitch/thread.h"

> +#include "openvswitch/types.h"

> +

> +

> +struct dp_packet;

> +

> +/* Userspace connection tracker

> + * ============================

> + *

> + * This is a connection tracking module that keeps all the state in userspace.

> + *

> + * Usage

> + * =====

> + *

> + *     struct conntrack ct;

> + *

> + * Initialization:

> + *

> + *     conntrack_init(&ct);

> + *

> + * It is necessary to periodically issue a call to

> + *

> + *     conntrack_run(&ct);

> + *

> + * to allow the module to clean up expired connections.

> + *

> + * To send a group of packets through the connection tracker:

> + *

> + *     conntrack_execute(&ct, pkts, n_pkts, ...);

> + *

> + * Thread-safety

> + * =============

> + *

> + * conntrack_execute() can be called by multiple threads simultaneously.

> + */

> +

> +struct conntrack;

> +

> +void conntrack_init(struct conntrack *);

> +void conntrack_run(struct conntrack *);

> +void conntrack_destroy(struct conntrack *);

> +

> +int conntrack_execute(struct conntrack *, struct dp_packet **, size_t,

> +                      bool commit, uint16_t zone, const uint32_t *setmark,

> +                      const struct ovs_key_ct_labels *setlabel,

> +                      const char *helper);

> +


> +/* struct ct_lock is a standard mutex or a spinlock when using DPDK */

> +

> +#ifdef DPDK_NETDEV

> +struct OVS_LOCKABLE ct_lock {

> +    rte_spinlock_t lock;

> +};

> +

> +static inline void ct_lock_init(struct ct_lock *lock)

> +{

> +    rte_spinlock_init(&lock->lock);

> +}

> +

> +static inline void ct_lock_lock(struct ct_lock *lock)

> +    OVS_ACQUIRES(lock)

> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> +{

> +    rte_spinlock_lock(&lock->lock);

> +}

> +

> +static inline void ct_lock_unlock(struct ct_lock *lock)

> +    OVS_RELEASES(lock)

> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> +{

> +    rte_spinlock_unlock(&lock->lock);

> +}

> +

> +static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED)

> +{

> +}

> +#else

> +struct OVS_LOCKABLE ct_lock {

> +    struct ovs_mutex lock;

> +};

> +

> +static inline void ct_lock_init(struct ct_lock *lock)

> +{

> +    ovs_mutex_init(&lock->lock);

> +}

> +

> +static inline void ct_lock_lock(struct ct_lock *lock)

> +    OVS_ACQUIRES(lock)

> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> +{

> +    ovs_mutex_lock(&lock->lock);

> +}

> +

> +static inline void ct_lock_unlock(struct ct_lock *lock)

> +    OVS_RELEASES(lock)

> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> +{

> +    ovs_mutex_unlock(&lock->lock);

> +}

> +

> +static inline void ct_lock_destroy(struct ct_lock *lock)

> +{

> +    ovs_mutex_destroy(&lock->lock);

> +}

> +#endif

> +


> +#define CONNTRACK_BUCKETS_SHIFT 8

> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)

> +

> +struct conntrack {

> +    /* Each lock guards a 'connections' bucket */

> +    struct ct_lock locks[CONNTRACK_BUCKETS];

> +    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;

> +    uint32_t hash_basis;

> +    unsigned purge_bucket;

> +    uint32_t purge_inner_bucket;

> +    uint32_t purge_inner_offset;

> +};

> +

> +#endif /* conntrack.h */

> --

> 2.1.4

> 

> _______________________________________________

> dev mailing list

> dev@openvswitch.org

> http://openvswitch.org/mailman/listinfo/dev
Flavio Leitner April 26, 2016, 11:35 p.m. UTC | #3
On Fri, Apr 15, 2016 at 05:02:36PM -0700, Daniele Di Proietto wrote:
> This commit adds the conntrack module.
> 
> It is a connection tracker that resides entirely in userspace.  Its
> primary user will be the dpif-netdev datapath.
> 
> The module main goal is to provide conntrack_execute(), which offers a
> convenient interface to implement the datapath ct() action.
> 
> The conntrack module uses two submodules to deal with the l4 protocol
> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).
> 
> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
> subsystem, therefore it's BSD licensed.  It has been slightly altered to
> match the OVS coding style and to allow the pickup of already
> established connections.
> 
> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
> ---
>  COPYING                 |   1 +
>  debian/copyright.in     |   4 +
>  lib/automake.mk         |   5 +
>  lib/conntrack-other.c   |  91 ++++++
>  lib/conntrack-private.h |  77 +++++
>  lib/conntrack-tcp.c     | 476 +++++++++++++++++++++++++++
>  lib/conntrack.c         | 851 ++++++++++++++++++++++++++++++++++++++++++++++++
>  lib/conntrack.h         | 144 ++++++++
>  8 files changed, 1649 insertions(+)
>  create mode 100644 lib/conntrack-other.c
>  create mode 100644 lib/conntrack-private.h
>  create mode 100644 lib/conntrack-tcp.c
>  create mode 100644 lib/conntrack.c
>  create mode 100644 lib/conntrack.h
> 
> diff --git a/COPYING b/COPYING
> index 308e3ea..afb98b9 100644
> --- a/COPYING
> +++ b/COPYING
> @@ -25,6 +25,7 @@ License, version 2.
>  The following files are licensed under the 2-clause BSD license.
>      include/windows/getopt.h
>      lib/getopt_long.c
> +    lib/conntrack-tcp.c
>  
>  The following files are licensed under the 3-clause BSD-license
>      include/windows/netinet/icmp6.h
> diff --git a/debian/copyright.in b/debian/copyright.in
> index 57d007a..a15f4dd 100644
> --- a/debian/copyright.in
> +++ b/debian/copyright.in
> @@ -21,6 +21,9 @@ Upstream Copyright Holders:
>  	Copyright (c) 2014 Michael Chapman
>  	Copyright (c) 2014 WindRiver, Inc.
>  	Copyright (c) 2014 Avaya, Inc.
> +	Copyright (c) 2001 Daniel Hartmeier
> +	Copyright (c) 2002 - 2008 Henning Brauer
> +	Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
>  
>  License:
>  
> @@ -90,6 +93,7 @@ License:
>  	lib/getopt_long.c
>  	include/windows/getopt.h
>  	datapath-windows/ovsext/Conntrack-tcp.c
> +	lib/conntrack-tcp.c
>  
>  * The following files are licensed under the 3-clause BSD-license
>  
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 1ec2115..ba30442 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/compiler.h \
>  	lib/connectivity.c \
>  	lib/connectivity.h \
> +	lib/conntrack-private.h \
> +	lib/conntrack-tcp.c \
> +	lib/conntrack-other.c \
> +	lib/conntrack.c \
> +	lib/conntrack.h \
>  	lib/coverage.c \
>  	lib/coverage.h \
>  	lib/crc32c.c \
> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
> new file mode 100644
> index 0000000..65d02a9
> --- /dev/null
> +++ b/lib/conntrack-other.c
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright (c) 2015, 2016 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +
> +#include "conntrack-private.h"
> +#include "dp-packet.h"
> +
> +enum other_state {
> +    OTHERS_FIRST,
> +    OTHERS_MULTIPLE,
> +    OTHERS_BIDIR,
> +};
> +
> +struct conn_other {
> +    struct conn up;
> +    enum other_state state;
> +};
> +
> +static const long long other_timeouts[] = {
> +    [OTHERS_FIRST] = 60 * 1000,
> +    [OTHERS_MULTIPLE] = 60 * 1000,
> +    [OTHERS_BIDIR] = 30 * 1000,
> +};

A descriptive comment is missing here, e.g. noting the unit (milliseconds).

> +
> +static struct conn_other *
> +conn_other_cast(const struct conn *conn)
> +{
> +    return CONTAINER_OF(conn, struct conn_other, up);
> +}
> +
> +static void
> +update_expiration(struct conn_other *conn, long long now)
> +{
> +    conn->up.expiration = now + other_timeouts[conn->state];
> +}
> +
> +static enum ct_update_res
> +other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
> +                bool reply, long long now)
> +{
> +    struct conn_other *conn = conn_other_cast(conn_);
> +
> +    if (reply && conn->state != OTHERS_BIDIR) {
> +        conn->state = OTHERS_BIDIR;
> +    } else if (conn->state == OTHERS_FIRST) {
> +        conn->state = OTHERS_MULTIPLE;
> +    }
> +
> +    update_expiration(conn, now);
> +
> +    return CT_UPDATE_VALID;
> +}
> +
> +static bool
> +other_valid_new(struct dp_packet *pkt OVS_UNUSED)
> +{
> +    return true;
> +}
> +
> +static struct conn *
> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
> +{
> +    struct conn_other *conn;
> +
> +    conn = xzalloc(sizeof(struct conn_other));
> +    conn->state = OTHERS_FIRST;
> +
> +    update_expiration(conn, now);
> +
> +    return &conn->up;
> +}
> +
> +struct ct_l4_proto ct_proto_other = {
> +    .new_conn = other_new_conn,
> +    .valid_new = other_valid_new,
> +    .conn_update = other_conn_update,
> +};
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> new file mode 100644
> index 0000000..e668c44
> --- /dev/null
> +++ b/lib/conntrack-private.h
> @@ -0,0 +1,77 @@
> +/*
> + * Copyright (c) 2015, 2016 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef CONNTRACK_PRIVATE_H
> +#define CONNTRACK_PRIVATE_H 1
> +
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/ip6.h>
> +
> +#include "hmap.h"
> +#include "openvswitch/types.h"
> +#include "packets.h"
> +#include "unaligned.h"
> +
> +struct ct_addr {
> +    union {
> +        ovs_16aligned_be32 ipv4;
> +        union ovs_16aligned_in6_addr ipv6;
> +        ovs_be32 ipv4_aligned;
> +        struct in6_addr ipv6_aligned;
> +    };
> +};
> +
> +struct ct_endpoint {
> +    struct ct_addr addr;
> +    ovs_be16 port;
> +};
> +
> +struct conn_key {
> +    struct ct_endpoint src;
> +    struct ct_endpoint dst;
> +
> +    ovs_be16 dl_type;
> +    uint8_t nw_proto;
> +    uint16_t zone;
> +};

Based on the above I presume we consider all IPs in the same space
regardless of the vlan, correct?


> +
> +struct conn {
> +    struct conn_key key;
> +    struct conn_key rev_key;
> +    long long expiration;
> +    struct hmap_node node;
> +    uint32_t mark;
> +    ovs_u128 label;
> +};
> +
> +enum ct_update_res {
> +    CT_UPDATE_INVALID,
> +    CT_UPDATE_VALID,
> +    CT_UPDATE_NEW,
> +};
> +
> +struct ct_l4_proto {
> +    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
> +    bool (*valid_new)(struct dp_packet *pkt);
> +    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
> +                                      bool reply, long long now);
> +};
> +
> +extern struct ct_l4_proto ct_proto_tcp;
> +extern struct ct_l4_proto ct_proto_other;
> +
> +#endif /* conntrack-private.h */
> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
> new file mode 100644
> index 0000000..4d80038
> --- /dev/null
> +++ b/lib/conntrack-tcp.c
> @@ -0,0 +1,476 @@
> +/*-
> + * Copyright (c) 2001 Daniel Hartmeier
> + * Copyright (c) 2002 - 2008 Henning Brauer
> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
> + * Copyright (c) 2015, 2016 Nicira, Inc.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + *    - Redistributions of source code must retain the above copyright
> + *      notice, this list of conditions and the following disclaimer.
> + *    - Redistributions in binary form must reproduce the above
> + *      copyright notice, this list of conditions and the following
> + *      disclaimer in the documentation and/or other materials provided
> + *      with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + *
> + * Effort sponsored in part by the Defense Advanced Research Projects
> + * Agency (DARPA) and Air Force Research Laboratory, Air Force
> + * Materiel Command, USAF, under agreement number F30602-01-2-0537.
> + *
> + *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
> + */
> +
> +#include <config.h>
> +
> +#include "conntrack-private.h"
> +#include "ct-dpif.h"
> +#include "dp-packet.h"
> +#include "util.h"
> +
> +struct tcp_peer {
> +    enum ct_dpif_tcp_state state;
> +    uint32_t               seqlo;          /* Max sequence number sent     */
> +    uint32_t               seqhi;          /* Max the other end ACKd + win */
> +    uint16_t               max_win;        /* largest window (pre scaling) */
> +    uint8_t                wscale;         /* window scaling factor        */
> +};
> +
> +struct conn_tcp {
> +    struct conn up;
> +    struct tcp_peer peer[2];
> +};
> +
> +enum {
> +    TCPOPT_EOL,
> +    TCPOPT_NOP,
> +    TCPOPT_WINDOW = 3,
> +};
> +
> +/* TCP sequence numbers are 32 bit integers operated
> + * on with modular arithmetic.  These macros can be
> + * used to compare such integers. */
> +#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)
> +#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)
> +#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)
> +#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)
> +
> +#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))
> +#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))

I am okay with leaving these here, but since they are generic
operations, maybe we could define them somewhere else and just
redefine them here as SEQ_ operations.


> +
> +static struct conn_tcp*
> +conn_tcp_cast(const struct conn* conn)
> +{
> +    return CONTAINER_OF(conn, struct conn_tcp, up);
> +}
> +
> +/* pf does this in pf_normalize_tcp(), and it is called only if scrub
> + * is enabled.  We're not scrubbing, but this check seems reasonable.  */
> +static bool
> +tcp_invalid_flags(uint16_t flags)
> +{
> +
> +    if (flags & TCP_SYN) {
> +        if (flags & TCP_RST) {
> +            return true;
> +        }
> +        if (flags & TCP_FIN) {
> +            /* Here pf removes the fin flag.  We simply mark the packet as
> +             * invalid */

Maybe I missed something but it seems TH_FIN and TH_RST are
treated equally (drop).  I'd vote to remove the comment.

> +            return true;
> +        }
> +    } else {
> +        /* Illegal packet */
> +        if (!(flags & (TCP_ACK|TCP_RST))) {
> +            return true;
> +        }
> +    }
> +
> +    if (!(flags & TCP_ACK)) {
> +        /* These flags are only valid if ACK is set */
> +        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +#define TCP_MAX_WSCALE 14
> +#define CT_WSCALE_FLAG 0x80
> +#define CT_WSCALE_UNKNOWN 0x40
> +#define CT_WSCALE_MASK 0xf
> +
> +static uint8_t
> +tcp_get_wscale(const struct tcp_header *tcp)
> +{
> +    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
> +    const uint8_t *opt = (const uint8_t *)(tcp + 1);
> +    uint8_t wscale = 0;
> +    uint8_t optlen;
> +
> +    while (len >= 3) {
> +        if (*opt == TCPOPT_EOL) {
> +            break;
> +        }
> +        switch (*opt) {

Maybe we could do:

           case TCPOPT_EOL:
               break;
> +        case TCPOPT_NOP:
> +            opt++;
> +            len--;
> +            break;
> +        case TCPOPT_WINDOW:
> +            wscale = MIN(opt[2], TCP_MAX_WSCALE);
> +            wscale |= CT_WSCALE_FLAG;
> +            /* fall through */
> +        default:
> +            optlen = opt[1];
> +            if (optlen < 2) {
> +                optlen = 2;
> +            }
> +            len -= optlen;
> +            opt += optlen;
> +        }
> +    }
> +
> +    return wscale;
> +}
> +
> +static uint32_t
> +tcp_payload_length(struct dp_packet *pkt)
> +{
> +    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
> +           - (char *) dp_packet_get_tcp_payload(pkt);
> +}
> +
> +static void
> +update_expiration(struct conn_tcp *conn, long long now, long long interval)
> +{
> +    conn->up.expiration = now + interval;
> +}
> +
> +
> +static enum ct_update_res
> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
> +                long long now)
> +{
> +    struct conn_tcp *conn = conn_tcp_cast(conn_);
> +    struct tcp_header *tcp = dp_packet_l4(pkt);
> +    /* The peer that sent 'pkt' */
> +    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
> +    /* The peer that should receive 'pkt' */
> +    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
> +    uint8_t sws = 0, dws = 0;
> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
> +
> +    uint16_t win = ntohs(tcp->tcp_winsz);
> +    uint32_t ack, end, seq, orig_seq;
> +    uint32_t p_len = tcp_payload_length(pkt);
> +    int ackskew;
> +
> +    if (tcp_invalid_flags(tcp_flags)) {
> +        return CT_UPDATE_INVALID;
> +    }
> +
> +    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&
> +            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&
> +            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
> +        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
> +        return CT_UPDATE_NEW;
> +    }
> +
> +    if (src->wscale & CT_WSCALE_FLAG
> +        && dst->wscale & CT_WSCALE_FLAG
> +        && !(tcp_flags & TCP_SYN)) {
> +
> +        sws = src->wscale & CT_WSCALE_MASK;
> +        dws = dst->wscale & CT_WSCALE_MASK;
> +
> +    } else if (src->wscale & CT_WSCALE_UNKNOWN
> +        && dst->wscale & CT_WSCALE_UNKNOWN
> +        && !(tcp_flags & TCP_SYN)) {
> +
> +        sws = TCP_MAX_WSCALE;
> +        dws = TCP_MAX_WSCALE;
> +    }
> +
> +    /*
> +     * Sequence tracking algorithm from Guido van Rooij's paper:
> +     *   http://www.madison-gurkha.com/publications/tcp_filtering/
> +     *      tcp_filtering.ps
> +     */
> +
> +    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
> +    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
> +        /* First packet from this end. Set its state */
> +
> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
> +
> +        end = seq + p_len;
> +        if (tcp_flags & TCP_SYN) {
> +            end++;
> +            if (dst->wscale & CT_WSCALE_FLAG) {
> +                src->wscale = tcp_get_wscale(tcp);
> +                if (src->wscale & CT_WSCALE_FLAG) {
> +                    /* Remove scale factor from initial window */
> +                    sws = src->wscale & CT_WSCALE_MASK;
> +                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
> +                    dws = dst->wscale & CT_WSCALE_MASK;
> +                } else {
> +                    /* fixup other window */
> +                    dst->max_win <<= dst->wscale &
> +                        CT_WSCALE_MASK;
> +                    /* in case of a retrans SYN|ACK */
> +                    dst->wscale = 0;
> +                }
> +            }
> +        }
> +        if (tcp_flags & TCP_FIN) {
> +            end++;
> +        }
> +
> +        src->seqlo = seq;
> +        src->state = CT_DPIF_TCPS_SYN_SENT;
> +        /*
> +         * May need to slide the window (seqhi may have been set by
> +         * the crappy stack check or if we picked up the connection
> +         * after establishment)
> +         */
> +        if (src->seqhi == 1 ||
> +                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
> +            src->seqhi = end + MAX(1, dst->max_win << dws);
> +        }
> +        if (win > src->max_win) {
> +            src->max_win = win;
> +        }
> +
> +    } else {
> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
> +        end = seq + p_len;
> +        if (tcp_flags & TCP_SYN) {
> +            end++;
> +        }
> +        if (tcp_flags & TCP_FIN) {
> +            end++;
> +        }
> +    }
> +
> +    if ((tcp_flags & TCP_ACK) == 0) {
> +        /* Let it pass through the ack skew check */
> +        ack = dst->seqlo;
> +    } else if ((ack == 0
> +                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
> +               /* broken tcp stacks do not set ack */) {
> +        /* Many stacks (ours included) will set the ACK number in an
> +         * FIN|ACK if the SYN times out -- no sequence to ACK. */
> +        ack = dst->seqlo;
> +    }
> +
> +    if (seq == end) {
> +        /* Ease sequencing restrictions on no data packets */
> +        seq = src->seqlo;
> +        end = seq;
> +    }
> +
> +    ackskew = dst->seqlo - ack;
> +#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
> +    if (SEQ_GEQ(src->seqhi, end)
> +        /* Last octet inside other's window space */
> +        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
> +        /* Retrans: not more than one window back */
> +        && (ackskew >= -MAXACKWINDOW)
> +        /* Acking not more than one reassembled fragment backwards */
> +        && (ackskew <= (MAXACKWINDOW << sws))
> +        /* Acking not more than one window forward */
> +        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
> +            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
> +        /* Require an exact/+1 sequence match on resets when possible */
> +
> +        /* update max window */
> +        if (src->max_win < win) {
> +            src->max_win = win;
> +        }
> +        /* synchronize sequencing */
> +        if (SEQ_GT(end, src->seqlo)) {
> +            src->seqlo = end;
> +        }
> +        /* slide the window of what the other end can send */
> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
> +            dst->seqhi = ack + MAX((win << sws), 1);
> +        }
> +
> +        /* update states */
> +        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
> +                src->state = CT_DPIF_TCPS_SYN_SENT;
> +        }
> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
> +                src->state = CT_DPIF_TCPS_CLOSING;
> +        }
> +        if (tcp_flags & TCP_ACK) {
> +            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
> +                dst->state = CT_DPIF_TCPS_ESTABLISHED;
> +            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
> +                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
> +            }
> +        }
> +        if (tcp_flags & TCP_RST) {
> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
> +        }
> +
> +        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
> +            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
> +            update_expiration(conn, now, 30 * 1000);
> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING
> +                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
> +            update_expiration(conn, now, 45 * 1000);
> +        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
> +                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
> +            update_expiration(conn, now, 30 * 1000);
> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING
> +                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
> +            update_expiration(conn, now, 15 * 60 * 1000);
> +        } else {
> +            update_expiration(conn, now, 24 * 60 * 60 * 1000);
> +        }

It would be nice to have defines for those timeout values.


> +    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
> +                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
> +                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
> +               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
> +               /* Within a window forward of the originating packet */
> +               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
> +               /* Within a window backward of the originating packet */
> +
> +        /*
> +         * This currently handles three situations:
> +         *  1) Stupid stacks will shotgun SYNs before their peer
> +         *     replies.
> +         *  2) When PF catches an already established stream (the
> +         *     firewall rebooted, the state table was flushed, routes
> +         *     changed...)
> +         *  3) Packets get funky immediately after the connection
> +         *     closes (this should catch Solaris spurious ACK|FINs
> +         *     that web servers like to spew after a close)
> +         *
> +         * This must be a little more careful than the above code
> +         * since packet floods will also be caught here. We don't
> +         * update the TTL here to mitigate the damage of a packet
> +         * flood and so the same code can handle awkward establishment
> +         * and a loosened connection close.
> +         * In the establishment case, a correct peer response will
> +         * validate the connection, go through the normal state code
> +         * and keep updating the state TTL.
> +         */
> +
> +        /* update max window */
> +        if (src->max_win < win) {
> +            src->max_win = win;
> +        }
> +        /* synchronize sequencing */
> +        if (SEQ_GT(end, src->seqlo)) {
> +            src->seqlo = end;
> +        }
> +        /* slide the window of what the other end can send */
> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
> +            dst->seqhi = ack + MAX((win << sws), 1);
> +        }
> +
> +        /*
> +         * Cannot set dst->seqhi here since this could be a shotgunned
> +         * SYN and not an already established connection.
> +         */
> +
> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
> +            src->state = CT_DPIF_TCPS_CLOSING;
> +        }
> +
> +        if (tcp_flags & TCP_RST) {
> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
> +        }
> +    } else {
> +        return CT_UPDATE_INVALID;
> +    }
> +
> +    return CT_UPDATE_VALID;
> +}
> +
> +static bool
> +tcp_valid_new(struct dp_packet *pkt)
> +{
> +    struct tcp_header *tcp = dp_packet_l4(pkt);
> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
> +
> +    if (tcp_invalid_flags(tcp_flags)) {
> +        return false;
> +    }
> +
> +    /* A syn+ack is not allowed to create a connection.  We want to allow
> +     * totally new connections (syn) or already established, not partially
> +     * open (syn+ack). */
> +    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +static struct conn *
> +tcp_new_conn(struct dp_packet *pkt, long long now)
> +{
> +    struct conn_tcp* newconn = NULL;
> +    struct tcp_header *tcp = dp_packet_l4(pkt);
> +    struct tcp_peer *src, *dst;
> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
> +
> +    newconn = xzalloc(sizeof(struct conn_tcp));
> +
> +    src = &newconn->peer[0];
> +    dst = &newconn->peer[1];
> +
> +    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
> +    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
> +
> +    if (tcp_flags & TCP_SYN) {
> +        src->seqhi++;
> +        src->wscale = tcp_get_wscale(tcp);
> +    } else {
> +        src->wscale = CT_WSCALE_UNKNOWN;
> +        dst->wscale = CT_WSCALE_UNKNOWN;
> +    }
> +    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
> +    if (src->wscale & CT_WSCALE_MASK) {
> +        /* Remove scale factor from initial window */
> +        uint8_t sws = src->wscale & CT_WSCALE_MASK;
> +        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
> +    }
> +    if (tcp_flags & TCP_FIN) {
> +        src->seqhi++;
> +    }
> +    dst->seqhi = 1;
> +    dst->max_win = 1;
> +    src->state = CT_DPIF_TCPS_SYN_SENT;
> +    dst->state = CT_DPIF_TCPS_CLOSED;
> +
> +    update_expiration(newconn, now, 30 * 1000);
> +
> +    return &newconn->up;
> +}
> +
> +struct ct_l4_proto ct_proto_tcp = {
> +    .new_conn = tcp_new_conn,
> +    .valid_new = tcp_valid_new,
> +    .conn_update = tcp_conn_update,
> +};
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> new file mode 100644
> index 0000000..840335b
> --- /dev/null
> +++ b/lib/conntrack.c
> @@ -0,0 +1,851 @@
> +/*
> + * Copyright (c) 2015, 2016 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include "conntrack.h"
> +
> +#include <errno.h>
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/icmp6.h>
> +
> +#include "bitmap.h"
> +#include "conntrack-private.h"
> +#include "dp-packet.h"
> +#include "flow.h"
> +#include "hmap.h"
> +#include "netdev.h"
> +#include "odp-netlink.h"
> +#include "openvswitch/vlog.h"
> +#include "ovs-rcu.h"
> +#include "random.h"
> +#include "timeval.h"
> +
> +VLOG_DEFINE_THIS_MODULE(conntrack);
> +
> +struct conn_lookup_ctx {
> +    struct conn_key key;
> +    struct conn *conn;
> +    uint32_t hash;
> +    bool reply;
> +    bool related;
> +};
> +
> +static bool conn_key_extract(struct conntrack *, struct dp_packet *,
> +                             struct conn_lookup_ctx *, uint16_t zone);
> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
> +static void conn_key_reverse(struct conn_key *);
> +static void conn_key_lookup(struct conntrack *ct,
> +                            struct conn_lookup_ctx *ctx,
> +                            unsigned bucket,
> +                            long long now);
> +static bool valid_new(struct dp_packet *pkt, struct conn_key *);
> +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
> +                             long long now);
> +static void delete_conn(struct conn *);
> +static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
> +                                      bool reply, long long now);
> +static bool conn_expired(struct conn *, long long now);
> +static void set_mark(struct dp_packet *, struct conn *,
> +                     uint32_t val, uint32_t mask);
> +static void set_label(struct dp_packet *, struct conn *,
> +                      const struct ovs_key_ct_labels *val,
> +                      const struct ovs_key_ct_labels *mask);
> +
> +static struct ct_l4_proto *l4_protos[] = {
> +    [IPPROTO_TCP] = &ct_proto_tcp,
> +    [IPPROTO_UDP] = &ct_proto_other,
> +    [IPPROTO_ICMP] = &ct_proto_other,
> +};
> +
> +/* Initializes the connection tracker 'ct'.  The caller is responsible for
> + * calling 'conntrack_destroy()', when the instance is not needed anymore */
> +void
> +conntrack_init(struct conntrack *ct)
> +{
> +    unsigned i;
> +
> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
> +        ct_lock_init(&ct->locks[i]);
> +        ct_lock_lock(&ct->locks[i]);
> +        hmap_init(&ct->connections[i]);
> +        ct_lock_unlock(&ct->locks[i]);
> +    }
> +    ct->hash_basis = random_uint32();
> +    ct->purge_bucket = 0;
> +    ct->purge_inner_bucket = 0;
> +    ct->purge_inner_offset = 0;
> +}
> +
> +/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
> +void
> +conntrack_destroy(struct conntrack *ct)
> +{
> +    unsigned i;
> +
> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
> +        struct conn *conn, *next;
> +
> +        ct_lock_lock(&ct->locks[i]);
> +        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) {
> +            hmap_remove(&ct->connections[i], &conn->node);
> +            delete_conn(conn);
> +        }
> +        hmap_destroy(&ct->connections[i]);
> +        ct_lock_unlock(&ct->locks[i]);
> +        ct_lock_destroy(&ct->locks[i]);
> +    }
> +}
> +
> +static unsigned hash_to_bucket(uint32_t hash)
> +{
> +    /* Extracts the most significant bits in hash. The least significant bits
> +     * are already used internally by the hmap implementation. */
> +    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
> +
> +    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
> +}
> +
> +static void
> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,
> +            uint32_t mark, ovs_u128 label)
> +{
> +    pkt->md.ct_state = state | CS_TRACKED;
> +    pkt->md.ct_zone = zone;
> +    pkt->md.ct_mark = mark;
> +    pkt->md.ct_label = label;
> +}
> +
> +static struct conn *
> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,
> +               long long now)
> +{
> +    unsigned bucket = hash_to_bucket(ctx->hash);
> +    struct conn *nc = NULL;
> +
> +    if (!valid_new(pkt, &ctx->key)) {
> +        *state |= CS_INVALID;
> +        return nc;
> +    }
> +
> +    *state |= CS_NEW;
> +
> +    if (commit) {
> +        nc = new_conn(pkt, &ctx->key, now);
> +
> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
> +
> +        conn_key_reverse(&nc->rev_key);
> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);
> +    }
> +
> +    return nc;
> +}
> +
> +static struct conn *
> +process_one(struct conntrack *ct, struct dp_packet *pkt,
> +            struct conn_lookup_ctx *ctx, uint16_t zone,
> +            bool commit, long long now)
> +{
> +    unsigned bucket = hash_to_bucket(ctx->hash);
> +    struct conn *conn = ctx->conn;
> +    uint8_t state = 0;
> +
> +    if (conn) {
> +        if (ctx->related) {
> +            state |= CS_RELATED;
> +            if (ctx->reply) {
> +                state |= CS_REPLY_DIR;
> +            }
> +        } else {
> +            enum ct_update_res res;
> +
> +            res = conn_update(conn, pkt, ctx->reply, now);
> +
> +            switch (res) {
> +            case CT_UPDATE_VALID:
> +                state |= CS_ESTABLISHED;
> +                if (ctx->reply) {
> +                    state |= CS_REPLY_DIR;
> +                }
> +                break;
> +            case CT_UPDATE_INVALID:
> +                state |= CS_INVALID;
> +                break;
> +            case CT_UPDATE_NEW:
> +                hmap_remove(&ct->connections[bucket], &conn->node);
> +                delete_conn(conn);
> +                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
> +                break;
> +            }
> +        }
> +
> +        pkt->md.ct_label = conn->label;
> +        pkt->md.ct_mark = conn->mark;
> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);
> +    } else {
> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});
> +    }
> +
> +    return conn;
> +}
> +
> +/* Sends a group of 'cnt' packets ('pkts') through the connection tracker
> + * 'ct'.  If 'commit' is true, the packets are allowed to create new entries
> + * in the connection tables.  'setmark', if not NULL, should point to a two
> + * elements array containing a value and a mask to set the connection mark.
> + * 'setlabel' behaves similarly for the connection label. */
> +int
> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt,
> +                  bool commit, uint16_t zone, const uint32_t *setmark,
> +                  const struct ovs_key_ct_labels *setlabel,
> +                  const char *helper)
> +{
> +#if !defined(__CHECKER__) && !defined(_WIN32)
> +    const size_t KEY_ARRAY_SIZE = cnt;
> +#else
> +    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
> +#endif
> +    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
> +    int8_t bucket_list[CONNTRACK_BUCKETS];
> +    struct {
> +        unsigned bucket;
> +        unsigned long maps;
> +    } arr[KEY_ARRAY_SIZE];
> +    long long now = time_msec();
> +    size_t i = 0;
> +    uint8_t arrcnt = 0;
> +
> +    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
> +
> +    if (helper) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
> +
> +        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
> +        /* Continue without the helper */
> +    }
> +
> +    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
> +    for (i = 0; i < cnt; i++) {
> +        unsigned bucket;
> +
> +        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
> +            write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}});
> +            continue;
> +        }
> +
> +        bucket = hash_to_bucket(ctxs[i].hash);
> +        if (bucket_list[bucket] == INT8_C(-1)) {
> +            bucket_list[bucket] = arrcnt;
> +
> +            arr[arrcnt].maps = 0;
> +            ULLONG_SET1(arr[arrcnt].maps, i);
> +            arr[arrcnt++].bucket = bucket;
> +        } else {
> +            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
> +            arr[bucket_list[bucket]].maps |= 1UL << i;
> +        }
> +    }
> +
> +    for (i = 0; i < arrcnt; i++) {
> +        size_t j;
> +
> +        ct_lock_lock(&ct->locks[arr[i].bucket]);
> +
> +        ULLONG_FOR_EACH_1(j, arr[i].maps) {
> +            struct conn *conn;
> +
> +            conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now);
> +
> +            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
> +
> +            if (conn && setmark) {
> +                set_mark(pkts[j], conn, setmark[0], setmark[1]);
> +            }
> +
> +            if (conn && setlabel) {
> +                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
> +            }
> +        }
> +        ct_lock_unlock(&ct->locks[arr[i].bucket]);
> +    }
> +
> +    return 0;
> +}
> +
> +static void
> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
> +{
> +    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
> +    conn->mark = pkt->md.ct_mark;
> +}
> +
> +static void
> +set_label(struct dp_packet *pkt, struct conn *conn,
> +          const struct ovs_key_ct_labels *val,
> +          const struct ovs_key_ct_labels *mask)
> +{
> +    ovs_u128 v, m;
> +
> +    memcpy(&v, val, sizeof v);
> +    memcpy(&m, mask, sizeof m);
> +
> +    pkt->md.ct_label.u64.lo = v.u64.lo
> +                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
> +    pkt->md.ct_label.u64.hi = v.u64.hi
> +                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
> +    conn->label = pkt->md.ct_label;
> +}
> +
> +#define CONNTRACK_PURGE_NUM 256
> +
> +static void
> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,
> +             uint32_t *inner_offset, unsigned *left, long long now)
> +{
> +    while (*left != 0) {
> +        struct hmap_node *node;
> +        struct conn *conn;
> +
> +        node = hmap_at_position(bucket, inner_bucket, inner_offset);
> +
> +        if (!node) {
> +            hmap_shrink(bucket);
> +            break;
> +        }
> +
> +        INIT_CONTAINER(conn, node, node);
> +        if (conn_expired(conn, now)) {
> +            hmap_remove(bucket, &conn->node);
> +            delete_conn(conn);
> +            (*left)--;
> +        }
> +    }
> +}
> +
> +/* Cleans up old connection entries.  Should be called periodically. */
> +void
> +conntrack_run(struct conntrack *ct)
> +{
> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);
> +    uint32_t inner_bucket = ct->purge_inner_bucket,
> +             inner_offset = ct->purge_inner_offset;
> +    unsigned left = CONNTRACK_PURGE_NUM;
> +    long long now = time_msec();
> +
> +    while (bucket < CONNTRACK_BUCKETS) {
> +        ct_lock_lock(&ct->locks[bucket]);
> +        sweep_bucket(&ct->connections[bucket],
> +                     &inner_bucket, &inner_offset,
> +                     &left, now);
> +        ct_lock_unlock(&ct->locks[bucket]);
> +
> +        if (left == 0) {
> +            break;
> +        } else {
> +            bucket++;
> +        }
> +    }
> +
> +    ct->purge_bucket = bucket;
> +    ct->purge_inner_bucket = inner_bucket;
> +    ct->purge_inner_offset = inner_offset;
> +}
> +
> +/* Key extraction */
> +
> +/* The function stores a pointer to the first byte after the header in
> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
> + * not interested in the header's tail,  meaning that the header has
> + * already been parsed (e.g. by flow_extract): we take this as a hint to
> + * save a few checks. */
> +static inline bool
> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
> +                const char **new_data)
> +{
> +    const struct ip_header *ip = data;
> +
> +    if (new_data) {
> +        size_t ip_len;
> +
> +        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
> +            return false;
> +        }
> +        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
> +
> +        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
> +            return false;
> +        }
> +        if (OVS_UNLIKELY(size < ip_len)) {
> +            return false;
> +        }
> +
> +        *new_data = (char *) data + ip_len;
> +    }
> +
> +    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
> +        return false;
> +    }
> +
> +    key->src.addr.ipv4 = ip->ip_src;
> +    key->dst.addr.ipv4 = ip->ip_dst;
> +    key->nw_proto = ip->ip_proto;
> +
> +    return true;
> +}
> +
> +/* The function stores a pointer to the first byte after the header in
> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
> + * not interested in the header's tail,  meaning that the header has
> + * already been parsed (e.g. by flow_extract): we take this as a hint to
> + * save a few checks. */
> +static inline bool
> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
> +                const char **new_data)
> +{
> +    const struct ovs_16aligned_ip6_hdr *ip6 = data;
> +    uint8_t nw_proto = ip6->ip6_nxt;
> +    uint8_t nw_frag = 0;
> +
> +    if (new_data) {
> +        if (OVS_UNLIKELY(size < sizeof *ip6)) {
> +            return false;
> +        }
> +    }
> +
> +    data = ip6 + 1;
> +    size -=  sizeof *ip6;
> +
> +    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
> +        return false;
> +    }
> +
> +    if (new_data) {
> +        *new_data = data;
> +    }
> +
> +    if (nw_frag) {
> +        return false;
> +    }
> +
> +    key->src.addr.ipv6 = ip6->ip6_src;
> +    key->dst.addr.ipv6 = ip6->ip6_dst;
> +    key->nw_proto = nw_proto;
> +
> +    return true;
> +}
> +
> +static inline bool
> +check_l4_tcp(const void *data, size_t size)
> +{
> +    const struct tcp_header *tcp = data;
> +    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
> +
> +    if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) {
> +        return true;
> +    }
> +
> +    return false;
> +}
> +
> +static inline bool
> +extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
> +{
> +    const struct tcp_header *tcp = data;
> +
> +    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
> +        return false;
> +    }
> +
> +    key->src.port = tcp->tcp_src;
> +    key->dst.port = tcp->tcp_dst;
> +
> +    return true;
> +}
> +
> +static inline bool
> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)
> +{
> +    const struct udp_header *udp = data;
> +
> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
> +        return false;
> +    }
> +
> +    key->src.port = udp->udp_src;
> +    key->dst.port = udp->udp_dst;
> +
> +    return true;
> +}
> +
> +static inline bool extract_l4(struct conn_key *key, const void *data,
> +                              size_t size, bool *related);
> +
> +/* If 'related' is not NULL and the function is processing an ICMP
> + * error packet, extract the l3 and l4 fields from the nested header
> + * instead and set *related to true.  If 'related' is NULL we're
> + * already processing a nested header and no such recursion is
> + * possible */
> +static inline int
> +extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
> +                bool *related)
> +{
> +    const struct icmp_header *icmp = data;
> +
> +    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
> +        return false;
> +    }
> +
> +    switch (icmp->icmp_type) {
> +    case ICMP4_ECHO_REQUEST:
> +    case ICMP4_ECHO_REPLY:
> +    case ICMP4_TIMESTAMP:
> +    case ICMP4_TIMESTAMPREPLY:
> +    case ICMP4_INFOREQUEST:
> +    case ICMP4_INFOREPLY:
> +        /* Separate ICMP connection: identified using id */
> +        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
> +        break;
> +    case ICMP4_DST_UNREACH:
> +    case ICMP4_TIME_EXCEEDED:
> +    case ICMP4_PARAM_PROB:
> +    case ICMP4_SOURCEQUENCH:
> +    case ICMP4_REDIRECT: {
> +        /* ICMP packet part of another connection. We should
> +         * extract the key from embedded packet header */
> +        struct conn_key inner_key;
> +        const char *l3 = (const char *) (icmp + 1);
> +        const char *tail = (const char *) data + size;
> +        const char *l4;
> +        bool ok;
> +
> +        if (!related) {
> +            return false;
> +        }
> +        *related = true;
> +
> +        memset(&inner_key, 0, sizeof inner_key);
> +        inner_key.dl_type = htons(ETH_TYPE_IP);
> +        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);
> +        if (!ok) {
> +            return false;
> +        }
> +
> +        /* pf doesn't do this, but it seems a good idea */
> +        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
> +            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
> +            return false;
> +        }
> +
> +        key->src = inner_key.src;
> +        key->dst = inner_key.dst;
> +        key->nw_proto = inner_key.nw_proto;
> +
> +        ok = extract_l4(key, l4, tail - l4, NULL);
> +        if (ok) {
> +            conn_key_reverse(key);
> +        }
> +        return ok;
> +    }
> +    default:
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +/* If 'related' is not NULL and the function is processing an ICMP
> + * error packet, extract the l3 and l4 fields from the nested header
> + * instead and set *related to true.  If 'related' is NULL we're
> + * already processing a nested header and no such recursion is
> + * possible */
> +static inline bool
> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
> +                 bool *related)
> +{
> +    const struct icmp6_header *icmp6 = data;
> +
> +    /* All the messages that we support need at least 4 bytes after
> +     * the header */
> +    if (size < sizeof *icmp6 + 4) {
> +        return false;
> +    }
> +
> +    switch (icmp6->icmp6_type) {
> +    case ICMP6_ECHO_REQUEST:
> +    case ICMP6_ECHO_REPLY:
> +        /* Separate ICMP connection: identified using id */
> +        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
> +        break;
> +    case ICMP6_DST_UNREACH:
> +    case ICMP6_PACKET_TOO_BIG:
> +    case ICMP6_TIME_EXCEEDED:
> +    case ICMP6_PARAM_PROB:{
> +        /* ICMP packet part of another connection. We should
> +         * extract the key from embedded packet header */
> +        struct conn_key inner_key;
> +        const char *l3 = (const char *) icmp6 + 8;
> +        const char *tail = (const char *) data + size;
> +        const char *l4 = NULL;
> +        bool ok;
> +
> +        if (!related) {
> +            return false;
> +        }
> +        *related = true;
> +
> +        memset(&inner_key, 0, sizeof inner_key);
> +        inner_key.dl_type = htons(ETH_TYPE_IPV6);
> +        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
> +        if (!ok) {
> +            return false;
> +        }
> +
> +        /* pf doesn't do this, but it seems a good idea */
> +        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
> +                              &key->dst.addr.ipv6_aligned)
> +            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
> +                                 &key->src.addr.ipv6_aligned)) {
> +            return false;
> +        }
> +
> +        key->src = inner_key.src;
> +        key->dst = inner_key.dst;
> +        key->nw_proto = inner_key.nw_proto;
> +
> +        ok = extract_l4(key, l4, tail - l4, NULL);
> +        if (ok) {
> +            conn_key_reverse(key);
> +        }
> +        return ok;
> +    }
> +    default:
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +/* Extract l4 fields into 'key', which must already contain valid l3
> + * members.  If 'related' is not NULL and an ICMP error packet is being
> + * processed, the function will extract the key from the packet nested
> + * in the ICMP payload and set '*related' to true.  If 'related' is NULL,
> + * nested parsing isn't allowed.  This is necessary to limit the
> + * recursion level. */
> +static inline bool
> +extract_l4(struct conn_key *key, const void *data, size_t size, bool *related)
> +{
> +    if (key->nw_proto == IPPROTO_TCP) {
> +        return extract_l4_tcp(key, data, size)
> +               && (!related || check_l4_tcp(data, size));
> +    } else if (key->nw_proto == IPPROTO_UDP) {
> +        return extract_l4_udp(key, data, size);
> +    } else if (key->dl_type == htons(ETH_TYPE_IP)
> +               && key->nw_proto == IPPROTO_ICMP) {
> +        return extract_l4_icmp(key, data, size, related);
> +    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
> +               && key->nw_proto == IPPROTO_ICMPV6) {
> +        return extract_l4_icmp6(key, data, size, related);
> +    } else {
> +        return false;
> +    }
> +}
> +
> +static bool
> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
> +                 struct conn_lookup_ctx *ctx, uint16_t zone)
> +{
> +    const struct eth_header *l2 = dp_packet_l2(pkt);
> +    const struct ip_header *l3 = dp_packet_l3(pkt);
> +    const char *l4 = dp_packet_l4(pkt);
> +    const char *tail = dp_packet_tail(pkt);
> +    bool ok;
> +
> +    memset(ctx, 0, sizeof *ctx);
> +
> +    if (!l2 || !l3 || !l4) {
> +        return false;
> +    }
> +
> +    ctx->key.zone = zone;
> +
> +    /* XXX In this function we parse the packet (again, it has already
> +     * gone through miniflow_extract()) for two reasons:
> +     *
> +     * 1) To extract the l3 addresses and l4 ports.
> +     *    We already have the l3 and l4 headers' pointers.  Extracting
> +     *    the l3 addresses and the l4 ports is really cheap, since they
> +     *    can be found at fixed locations.
> +     * 2) To extract the l3 and l4 types.
> +     *    Extracting the l3 and l4 types (especially the l3[1]) on the
> +     *    other hand is quite expensive, because they're not at a
> +     *    fixed location.
> +     *
> +     * Here's a way to avoid (2) with the help of the datapath.
> +     * The datapath doesn't keep the packet's extracted flow[2], so
> +     * using that is not an option.  We could use the packet's matching
> +     * megaflow, but we have to make sure that the l3 and l4 types
> +     * are unwildcarded.  This means either:
> +     *
> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow
> +     *    is installed if the actions contains ct().  This is what the
> +     *    kernel datapath does.  It is not so straightforward, though.
> +     *
> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when
> +     *    translating a ct() action.  This is already done in different
> +     *    actions and since both the userspace and the kernel datapath
> +     *    would benefit from it, it seems an appropriate place to do
> +     *    it.
> +     *
> +     * ---
> +     * [1] A simple benchmark (running only the connection tracker
> +     *     over and over on the same packets) shows that if the
> +     *     l3 type is already provided we are 15% faster (running the
> +     *     connection tracker over a couple of DPDK devices with a
> +     *     stream of UDP 64-bytes packets shows that we are 4% faster).
> +     *
> +     * [2] The reasons for this are that keeping the flow increases
> +     *     (slightly) the cache footprint and increases computation
> +     *     time as we move the packet around. Most importantly, the flow
> +     *     should be updated by the actions and this can be slow, as
> +     *     we use a sparse representation (miniflow).
> +     *
> +     */
> +    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
> +    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
> +        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL);
> +    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
> +        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
> +    } else {
> +        ok = false;
> +    }
> +
> +    if (ok) {
> +        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) {
> +            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +/* Symmetric */
> +static uint32_t
> +conn_key_hash(const struct conn_key *key, uint32_t basis)
> +{
> +    uint32_t hsrc, hdst, hash;
> +    int i;
> +
> +    hsrc = hdst = basis;
> +
> +    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
> +        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
> +        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
> +    }
> +
> +    hash = hsrc ^ hdst;
> +
> +    hash = hash_words((uint32_t *) &key->dst + 1,
> +                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
> +                      hash);
> +
> +    return hash;
> +}
> +
> +static void
> +conn_key_reverse(struct conn_key *key)
> +{
> +    struct ct_endpoint tmp;
> +    tmp = key->src;
> +    key->src = key->dst;
> +    key->dst = tmp;
> +}
> +
> +static void
> +conn_key_lookup(struct conntrack *ct,
> +                struct conn_lookup_ctx *ctx,
> +                unsigned bucket,
> +                long long now)
> +{
> +    struct conn *conn, *found = NULL;
> +    uint32_t hash = ctx->hash;
> +    bool reply;
> +
> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct->connections[bucket]) {
> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {
> +            found = conn;
> +            reply = false;
> +            break;
> +        }
> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) {
> +            found = conn;
> +            reply = true;
> +            break;
> +        }
> +    }
> +
> +    if (found) {
> +        if (conn_expired(found, now)) {
> +            found = NULL;
> +        } else {
> +            ctx->reply = reply;
> +        }
> +    }
> +
> +    ctx->conn = found;
> +}
> +
> +static enum ct_update_res
> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
> +            long long now)
> +{
> +    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);
> +}
> +
> +static bool
> +conn_expired(struct conn *conn, long long now)
> +{
> +    return now >= conn->expiration;
> +}
> +
> +static bool
> +valid_new(struct dp_packet *pkt, struct conn_key *key)
> +{
> +    return l4_protos[key->nw_proto]->valid_new(pkt);
> +}
> +
> +static struct conn *
> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
> +{
> +    struct conn *newconn;
> +
> +    newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);
> +
> +    if (newconn) {
> +        newconn->key = *key;
> +    }
> +
> +    return newconn;
> +}
> +
> +static void
> +delete_conn(struct conn *conn)
> +{
> +    free(conn);
> +}
> diff --git a/lib/conntrack.h b/lib/conntrack.h
> new file mode 100644
> index 0000000..8561273
> --- /dev/null
> +++ b/lib/conntrack.h
> @@ -0,0 +1,144 @@
> +/*
> + * Copyright (c) 2015, 2016 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef CONNTRACK_H
> +#define CONNTRACK_H 1
> +
> +#include <stdbool.h>
> +
> +#include "hmap.h"
> +#include "netdev-dpdk.h"
> +#include "odp-netlink.h"
> +#include "openvswitch/thread.h"
> +#include "openvswitch/types.h"
> +
> +
> +struct dp_packet;
> +
> +/* Userspace connection tracker
> + * ============================
> + *
> + * This is a connection tracking module that keeps all the state in userspace.
> + *
> + * Usage
> + * =====
> + *
> + *     struct conntrack ct;
> + *
> + * Initialization:
> + *
> + *     conntrack_init(&ct);
> + *
> + * It is necessary to periodically issue a call to
> + *
> + *     conntrack_run(&ct);
> + *
> + * to allow the module to clean up expired connections.
> + *
> + * To send a group of packets through the connection tracker:
> + *
> + *     conntrack_execute(&ct, pkts, n_pkts, ...);
> + *
> + * Thread-safety
> + * =============
> + *
> + * conntrack_execute() can be called by multiple threads simultaneously.
> + */
> +
> +struct conntrack;
> +
> +void conntrack_init(struct conntrack *);
> +void conntrack_run(struct conntrack *);
> +void conntrack_destroy(struct conntrack *);
> +
> +int conntrack_execute(struct conntrack *, struct dp_packet **, size_t,
> +                      bool commit, uint16_t zone, const uint32_t *setmark,
> +                      const struct ovs_key_ct_labels *setlabel,
> +                      const char *helper);
> +
> +/* struct ct_lock is a standard mutex or a spinlock when using DPDK */
> +
> +#ifdef DPDK_NETDEV
> +struct OVS_LOCKABLE ct_lock {
> +    rte_spinlock_t lock;
> +};
> +
> +static inline void ct_lock_init(struct ct_lock *lock)
> +{
> +    rte_spinlock_init(&lock->lock);
> +}
> +
> +static inline void ct_lock_lock(struct ct_lock *lock) 

There is an extra trailing space at the end of the line above.

> +    OVS_ACQUIRES(lock)
> +    OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> +    rte_spinlock_lock(&lock->lock);
> +}
> +
> +static inline void ct_lock_unlock(struct ct_lock *lock)
> +    OVS_RELEASES(lock)
> +    OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> +    rte_spinlock_unlock(&lock->lock);
> +}
> +
> +static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED)
> +{
> +}
> +#else
> +struct OVS_LOCKABLE ct_lock {
> +    struct ovs_mutex lock;
> +};
> +
> +static inline void ct_lock_init(struct ct_lock *lock)
> +{
> +    ovs_mutex_init(&lock->lock);
> +}
> +
> +static inline void ct_lock_lock(struct ct_lock *lock)
> +    OVS_ACQUIRES(lock)
> +    OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> +    ovs_mutex_lock(&lock->lock);
> +}
> +
> +static inline void ct_lock_unlock(struct ct_lock *lock)
> +    OVS_RELEASES(lock)
> +    OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> +    ovs_mutex_unlock(&lock->lock);
> +}
> +
> +static inline void ct_lock_destroy(struct ct_lock *lock)
> +{
> +    ovs_mutex_destroy(&lock->lock);
> +}
> +#endif
> +
> +#define CONNTRACK_BUCKETS_SHIFT 8
> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
> +
> +struct conntrack {
> +    /* Each lock guards a 'connections' bucket */
> +    struct ct_lock locks[CONNTRACK_BUCKETS];
> +    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;
> +    uint32_t hash_basis;
> +    unsigned purge_bucket;
> +    uint32_t purge_inner_bucket;
> +    uint32_t purge_inner_offset;
> +};
> +
> +#endif /* conntrack.h */
> -- 
> 2.1.4
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
Daniele Di Proietto April 27, 2016, 6:36 a.m. UTC | #4
Thanks for the detailed review Joe, replies inline



On 19/04/2016 14:36, "Joe Stringer" <joe@ovn.org> wrote:

>On 15 April 2016 at 17:02, Daniele Di Proietto <diproiettod@vmware.com> wrote:

>> This commit adds the conntrack module.

>>

>> It is a connection tracker that resides entirely in userspace.  Its

>> primary user will be the dpif-netdev datapath.

>>

>> The module main goal is to provide conntrack_execute(), which offers a

>> convenient interface to implement the datapath ct() action.

>>

>> The conntrack module uses two submodules to deal with the l4 protocol

>> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

>>

>> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf

>> subsystem, therefore it's BSD licensed.  It has been slightly altered to

>> match the OVS coding style and to allow the pickup of already

>> established connections.

>>

>> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

>

>Thanks for submitting this, I know there's a few people excited about

>this feature.

>

>I notice that there is no NEWS item about this, were you intending to

>hold off on announcing it until there is feature parity with the

>kernel, eg support for IPv[46] fragments; ALGs; NAT?


Actually I didn't think about this :-)

I'll add a line in NEWS

>

>

>> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c

>> new file mode 100644

>> index 0000000..65d02a9

>> --- /dev/null

>> +++ b/lib/conntrack-other.c

>> @@ -0,0 +1,91 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#include <config.h>

>> +

>> +#include "conntrack-private.h"

>> +#include "dp-packet.h"

>> +

>> +enum other_state {

>> +    OTHERS_FIRST,

>> +    OTHERS_MULTIPLE,

>> +    OTHERS_BIDIR,

>> +};

>> +

>> +struct conn_other {

>> +    struct conn up;

>> +    enum other_state state;

>> +};

>> +

>> +static const long long other_timeouts[] = {

>> +    [OTHERS_FIRST] = 60 * 1000,

>> +    [OTHERS_MULTIPLE] = 60 * 1000,

>> +    [OTHERS_BIDIR] = 30 * 1000,

>> +};

>

>Are these timeouts in milliseconds? (a comment or #define might help with that)


Sure, I'll add a comment.

>

><snip>

>

>> diff --git a/lib/conntrack.c b/lib/conntrack.c

>> new file mode 100644

>> index 0000000..840335b

>> --- /dev/null

>> +++ b/lib/conntrack.c

>...

>> +static struct ct_l4_proto *l4_protos[] = {

>> +    [IPPROTO_TCP] = &ct_proto_tcp,

>> +    [IPPROTO_UDP] = &ct_proto_other,

>> +    [IPPROTO_ICMP] = &ct_proto_other,

>> +};

>

>I see that conntrack-other is shared by UDP and ICMP. In the Linux

>datapath, the UDP handler also checks UDP length and UDP checksum - I

>wonder if we can share most of the code here for these protocols but

>add these additional checks for the UDP case?


Thanks for pointing this out!

I think it's a good idea to check the UDP length, I've added a check later.

This didn't handle at all checksum verification.  I thought it was out of
scope for the connection tracker, but since the kernel connection tracker
does this, I've added the validation.

None of this need (yet) a separate conntrack-icmp module, I'm implementing
it when the key is being extracted (as you point out later).

We would need a separate conntrack-icmp module if we wanted to track icmp
type and code.

>

>> +static struct conn *

>> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

>> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,

>> +               long long now)

>> +{

>> +    unsigned bucket = hash_to_bucket(ctx->hash);

>> +    struct conn *nc = NULL;

>> +

>> +    if (!valid_new(pkt, &ctx->key)) {

>> +        *state |= CS_INVALID;

>> +        return nc;

>> +    }

>> +

>> +    *state |= CS_NEW;

>> +

>> +    if (commit) {

>> +        nc = new_conn(pkt, &ctx->key, now);

>> +

>> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

>> +

>> +        conn_key_reverse(&nc->rev_key);

>> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);

>> +    }

>> +

>> +    return nc;

>> +}

>

>This function can return NULL....

>

>> +static struct conn *

>> +process_one(struct conntrack *ct, struct dp_packet *pkt,

>> +            struct conn_lookup_ctx *ctx, uint16_t zone,

>> +            bool commit, long long now)

>> +{

>> +    unsigned bucket = hash_to_bucket(ctx->hash);

>> +    struct conn *conn = ctx->conn;

>> +    uint8_t state = 0;

>> +

>> +    if (conn) {

>> +        if (ctx->related) {

>> +            state |= CS_RELATED;

>> +            if (ctx->reply) {

>> +                state |= CS_REPLY_DIR;

>> +            }

>> +        } else {

>> +            enum ct_update_res res;

>> +

>> +            res = conn_update(conn, pkt, ctx->reply, now);

>> +

>> +            switch (res) {

>> +            case CT_UPDATE_VALID:

>> +                state |= CS_ESTABLISHED;

>> +                if (ctx->reply) {

>> +                    state |= CS_REPLY_DIR;

>> +                }

>> +                break;

>> +            case CT_UPDATE_INVALID:

>> +                state |= CS_INVALID;

>> +                break;

>> +            case CT_UPDATE_NEW:

>> +                hmap_remove(&ct->connections[bucket], &conn->node);

>> +                delete_conn(conn);

>> +                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

>

>...which is then stored here...

>

>> +                break;

>> +            }

>> +        }

>> +

>> +        pkt->md.ct_label = conn->label;

>> +        pkt->md.ct_mark = conn->mark;

>> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);

>

>...and dereferenced here. Is it possible to send invalid packets for

>an existing connection and cause a NULL pointer dereference?


Yes, thanks for spotting this. I've added a NULL pointer check and
moved this out of the if branch (it's needed also in the else branch)

>

>

>> +    } else {

>> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

>> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});

>

>Maybe we can define OVS_U128_MIN (ovs_u128) {{0}} in a header somewhere?


Ok.  I've put OVS_U128_MIN in util.h (like OVS_U128_MAX) and defined an
OVS_U128_ZERO macro as an alias.

>

>

>> +/* Cleans up old connection entries.  Should be called periodically. */

>> +void

>> +conntrack_run(struct conntrack *ct)

>> +{

>> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);

>> +    uint32_t inner_bucket = ct->purge_inner_bucket,

>> +             inner_offset = ct->purge_inner_offset;

>

>Style seems a little unusual for the above 'inner_offset'.


Right, I put the variable in two separate statements.

>

>

>> +static inline bool

>> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)

>> +{

>> +    const struct udp_header *udp = data;

>> +

>> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {

>> +        return false;

>> +    }

>

>Maybe my comment earlier about UDP len and checksum could be addressed here?


Yes, thank you!

>

>...

>

>

>> +static bool

>> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,

>> +                 struct conn_lookup_ctx *ctx, uint16_t zone)

>> +{

>> +    const struct eth_header *l2 = dp_packet_l2(pkt);

>> +    const struct ip_header *l3 = dp_packet_l3(pkt);

>> +    const char *l4 = dp_packet_l4(pkt);

>> +    const char *tail = dp_packet_tail(pkt);

>> +    bool ok;

>> +

>> +    memset(ctx, 0, sizeof *ctx);

>> +

>> +    if (!l2 || !l3 || !l4) {

>> +        return false;

>> +    }

>> +

>> +    ctx->key.zone = zone;

>> +

>> +    /* XXX In this function we parse the packet (again, it has already

>> +     * gone through miniflow_extract()) for two reasons:

>> +     *

>> +     * 1) To extract the l3 addresses and l4 ports.

>> +     *    We already have the l3 and l4 headers' pointers.  Extracting

>> +     *    the l3 addresses and the l4 ports is really cheap, since they

>> +     *    can be found at fixed locations.

>> +     * 2) To extract the l3 and l4 types.

>> +     *    Extracting the l3 and l4 types (especially the l3[1]) on the

>> +     *    other hand is quite expensive, because they're not at a

>> +     *    fixed location.

>> +     *

>> +     * Here's a way to avoid (2) with the help of the datapath.

>> +     * The datapath doesn't keep the packet's extracted flow[2], so

>> +     * using that is not an option.  We could use the packet's matching

>> +     * megaflow, but we have to make sure that the l3 and l4 types

>> +     * are unwildcarded.  This means either:

>> +     *

>> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow

>> +     *    is installed if the actions contains ct().  This is what the

>> +     *    kernel datapath does.  It is not so straightforward, though.

>> +     *

>> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when

>> +     *    translating a ct() action.  This is already done in different

>> +     *    actions and since both the userspace and the kernel datapath

>> +     *    would benefit from it, it seems an appropriate place to do

>> +     *    it.

>> +     *

>> +     * ---

>> +     * [1] A simple benchmark (running only the connection tracker

>> +     *     over and over on the same packets) shows that if the

>> +     *     l3 type is already provided we are 15% faster (running the

>> +     *     connection tracker over a couple of DPDK devices with a

>> +     *     stream of UDP 64-bytes packets shows that we are 4% faster).

>> +     *

>> +     * [2] The reasons for this are that keeping the flow increases

>> +     *     (slightly) the cache footprint and increases computation

>> +     *     time as we move the packet around. Most importantly, the flow

>> +     *     should be updated by the actions and this can be slow, as

>> +     *     we use a sparse representation (miniflow).

>> +     *

>> +     */

>

>I discussed the above offline with Jarno; we will always unwildcard

>the l3 -- see xlate_wc_init().


Oh, you're right, I forgot about that, I guess I need at least to update
the comment.

>

>Do we have a benchmark on how costly the options are for figuring out

>the l4 type?


It's a problem only for IPv6, as we need to parse the extension headers.

None of the solutions I thought about seems particularly clean.  I'd be inclined
to keep this here and revisit it later.

>

>

>> +static void

>> +conn_key_lookup(struct conntrack *ct,

>> +                struct conn_lookup_ctx *ctx,

>> +                unsigned bucket,

>> +                long long now)

>> +{

>> +    struct conn *conn, *found = NULL;

>> +    uint32_t hash = ctx->hash;

>> +    bool reply;

>> +

>> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct->connections[bucket]) {

>> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {

>> +            found = conn;

>> +            reply = false;

>> +            break;

>> +        }

>> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) {

>> +            found = conn;

>> +            reply = true;

>> +            break;

>> +        }

>> +    }

>> +

>> +    if (found) {

>> +        if (conn_expired(found, now)) {

>> +            found = NULL;

>> +        } else {

>> +            ctx->reply = reply;

>> +        }

>> +    }

>

>Should this check be within the HMAP_FOR_EACH_WITH_HASH?


Yes, that also simplify the code a little bit, thank you.

>

>

>Separate question, and maybe something to be followed up on separately

>from this series, today with the current userspace + kernel

>implementation we can install a flow like:

>

>arp,actions=ct(table=1)

>

>At OpenFlow layer we'll accept this (although maybe we shouldn't)...

>When processing an upcall, we'll translate this (though we have enough

>information to at least complain if not more)

>Then when installing/executing into the kernel datapath, we'll reject

>the command while parsing the action.

>

>I'm considering that maybe we should either a) Restrict this all the

>way up at OpenFlow, or b) loosen the restriction in kernel datapath

>and allow such flows (but mark the traffic as invalid early in

>conntrack processing and skip to the next action).

>

>So, to my question: How does the userspace handle a case like this?


The userspace datapath does what you're suggesting as b): in
conntrack_execute() if the packet is not ipv4 or ipv6 the packet is marked
as invalid (the key extraction fails).

Restricting this at the OpenFlow level means that it wouldn't be possible
to have a _single_ flow to send both ipv4 and ipv6 traffic to the connection
tracker.  Do we care about that?
Daniele Di Proietto April 27, 2016, 6:36 a.m. UTC | #5
Hi Antonio,

Thank you for your comments, replies inline




On 25/04/2016 09:14, "Fischetti, Antonio" <antonio.fischetti@intel.com> wrote:

>Hi Daniele, 

>some comments inline prefixed with [Antonio F].

>

>Regards,

>Antonio

>

>> -----Original Message-----

>> From: dev [mailto:dev-bounces@openvswitch.org] On Behalf Of Daniele Di

>> Proietto

>> Sent: Saturday, April 16, 2016 1:03 AM

>> To: dev@openvswitch.org

>> Subject: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace connection

>> tracker.

>> 

>> This commit adds the conntrack module.

>> 

>> It is a connection tracker that resides entirely in userspace.  Its

>> primary user will be the dpif-netdev datapath.

>> 

>> The module main goal is to provide conntrack_execute(), which offers a

>> convenient interface to implement the datapath ct() action.

>> 

>> The conntrack module uses two submodules to deal with the l4 protocol

>> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

>> 

>> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf

>> subsystem, therefore it's BSD licensed.  It has been slightly altered to

>> match the OVS coding style and to allow the pickup of already

>> established connections.

>> 

>> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

>> ---

>>  COPYING                 |   1 +

>>  debian/copyright.in     |   4 +

>>  lib/automake.mk         |   5 +

>>  lib/conntrack-other.c   |  91 ++++++

>>  lib/conntrack-private.h |  77 +++++

>>  lib/conntrack-tcp.c     | 476 +++++++++++++++++++++++++++

>>  lib/conntrack.c         | 851

>> ++++++++++++++++++++++++++++++++++++++++++++++++

>>  lib/conntrack.h         | 144 ++++++++

>>  8 files changed, 1649 insertions(+)

>>  create mode 100644 lib/conntrack-other.c

>>  create mode 100644 lib/conntrack-private.h

>>  create mode 100644 lib/conntrack-tcp.c

>>  create mode 100644 lib/conntrack.c

>>  create mode 100644 lib/conntrack.h

>> 

>> diff --git a/COPYING b/COPYING

>> index 308e3ea..afb98b9 100644

>> --- a/COPYING

>> +++ b/COPYING

>> @@ -25,6 +25,7 @@ License, version 2.

>>  The following files are licensed under the 2-clause BSD license.

>>      include/windows/getopt.h

>>      lib/getopt_long.c

>> +    lib/conntrack-tcp.c

>> 

>>  The following files are licensed under the 3-clause BSD-license

>>      include/windows/netinet/icmp6.h

>> diff --git a/debian/copyright.in b/debian/copyright.in

>> index 57d007a..a15f4dd 100644

>> --- a/debian/copyright.in

>> +++ b/debian/copyright.in

>> @@ -21,6 +21,9 @@ Upstream Copyright Holders:

>>  	Copyright (c) 2014 Michael Chapman

>>  	Copyright (c) 2014 WindRiver, Inc.

>>  	Copyright (c) 2014 Avaya, Inc.

>> +	Copyright (c) 2001 Daniel Hartmeier

>> +	Copyright (c) 2002 - 2008 Henning Brauer

>> +	Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

>> 

>>  License:

>> 

>> @@ -90,6 +93,7 @@ License:

>>  	lib/getopt_long.c

>>  	include/windows/getopt.h

>>  	datapath-windows/ovsext/Conntrack-tcp.c

>> +	lib/conntrack-tcp.c

>> 

>>  * The following files are licensed under the 3-clause BSD-license

>> 

>> diff --git a/lib/automake.mk b/lib/automake.mk

>> index 1ec2115..ba30442 100644

>> --- a/lib/automake.mk

>> +++ b/lib/automake.mk

>> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \

>>  	lib/compiler.h \

>>  	lib/connectivity.c \

>>  	lib/connectivity.h \

>> +	lib/conntrack-private.h \

>> +	lib/conntrack-tcp.c \

>> +	lib/conntrack-other.c \

>> +	lib/conntrack.c \

>> +	lib/conntrack.h \

>>  	lib/coverage.c \

>>  	lib/coverage.h \

>>  	lib/crc32c.c \

>> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c

>> new file mode 100644

>> index 0000000..65d02a9

>> --- /dev/null

>> +++ b/lib/conntrack-other.c

>> @@ -0,0 +1,91 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

>> implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#include <config.h>

>> +

>> +#include "conntrack-private.h"

>> +#include "dp-packet.h"

>> +

>> +enum other_state {

>> +    OTHERS_FIRST,

>> +    OTHERS_MULTIPLE,

>> +    OTHERS_BIDIR,

>> +};

>> +

>> +struct conn_other {

>> +    struct conn up;

>> +    enum other_state state;

>> +};

>> +

>> +static const long long other_timeouts[] = {

>> +    [OTHERS_FIRST] = 60 * 1000,

>> +    [OTHERS_MULTIPLE] = 60 * 1000,

>> +    [OTHERS_BIDIR] = 30 * 1000,

>> +};

>> +

>> +static struct conn_other *

>> +conn_other_cast(const struct conn *conn)

>> +{

>> +    return CONTAINER_OF(conn, struct conn_other, up);

>> +}

>> +

>> +static void

>> +update_expiration(struct conn_other *conn, long long now)

>> +{

>> +    conn->up.expiration = now + other_timeouts[conn->state];

>> +}

>> +

>> +static enum ct_update_res

>> +other_conn_update(struct conn *conn_, struct dp_packet *pkt

>> OVS_UNUSED,

>> +                bool reply, long long now)

>> +{

>> +    struct conn_other *conn = conn_other_cast(conn_);

>> +

>> +    if (reply && conn->state != OTHERS_BIDIR) {

>> +        conn->state = OTHERS_BIDIR;

>> +    } else if (conn->state == OTHERS_FIRST) {

>> +        conn->state = OTHERS_MULTIPLE;

>> +    }

>> +

>> +    update_expiration(conn, now);

>> +

>> +    return CT_UPDATE_VALID;

>> +}

>> +

>> +static bool

>> +other_valid_new(struct dp_packet *pkt OVS_UNUSED)

>> +{

>> +    return true;

>> +}

>> +

>> +static struct conn *

>> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)

>> +{

>> +    struct conn_other *conn;

>> +

>> +    conn = xzalloc(sizeof(struct conn_other));

>> +    conn->state = OTHERS_FIRST;

>> +

>> +    update_expiration(conn, now);

>> +

>> +    return &conn->up;

>> +}

>> +

>> +struct ct_l4_proto ct_proto_other = {

>> +    .new_conn = other_new_conn,

>> +    .valid_new = other_valid_new,

>> +    .conn_update = other_conn_update,

>> +};

>> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h

>> new file mode 100644

>> index 0000000..e668c44

>> --- /dev/null

>> +++ b/lib/conntrack-private.h

>> @@ -0,0 +1,77 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

>> implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#ifndef CONNTRACK_PRIVATE_H

>> +#define CONNTRACK_PRIVATE_H 1

>> +

>> +#include <sys/types.h>

>> +#include <netinet/in.h>

>> +#include <netinet/ip6.h>

>> +

>> +#include "hmap.h"

>> +#include "openvswitch/types.h"

>> +#include "packets.h"

>> +#include "unaligned.h"

>> +

>> +struct ct_addr {

>> +    union {

>> +        ovs_16aligned_be32 ipv4;

>> +        union ovs_16aligned_in6_addr ipv6;

>> +        ovs_be32 ipv4_aligned;

>> +        struct in6_addr ipv6_aligned;

>> +    };

>> +};

>> +

>> +struct ct_endpoint {

>> +    struct ct_addr addr;

>> +    ovs_be16 port;

>> +};

>> +

>> +struct conn_key {

>> +    struct ct_endpoint src;

>> +    struct ct_endpoint dst;

>> +

>> +    ovs_be16 dl_type;

>> +    uint8_t nw_proto;

>> +    uint16_t zone;

>> +};

>> +

>> +struct conn {

>> +    struct conn_key key;

>> +    struct conn_key rev_key;

>> +    long long expiration;

>> +    struct hmap_node node;

>> +    uint32_t mark;

>> +    ovs_u128 label;

>> +};

>> +

>> +enum ct_update_res {

>> +    CT_UPDATE_INVALID,

>> +    CT_UPDATE_VALID,

>> +    CT_UPDATE_NEW,

>> +};

>> +

>> +struct ct_l4_proto {

>> +    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);

>> +    bool (*valid_new)(struct dp_packet *pkt);

>> +    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet

>> *pkt,

>> +                                      bool reply, long long now);

>> +};

>> +

>> +extern struct ct_l4_proto ct_proto_tcp;

>> +extern struct ct_l4_proto ct_proto_other;

>> +

>> +#endif /* conntrack-private.h */

>> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c

>> new file mode 100644

>> index 0000000..4d80038

>> --- /dev/null

>> +++ b/lib/conntrack-tcp.c

>> @@ -0,0 +1,476 @@

>> +/*-

>> + * Copyright (c) 2001 Daniel Hartmeier

>> + * Copyright (c) 2002 - 2008 Henning Brauer

>> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + * All rights reserved.

>> + *

>> + * Redistribution and use in source and binary forms, with or without

>> + * modification, are permitted provided that the following conditions

>> + * are met:

>> + *

>> + *    - Redistributions of source code must retain the above copyright

>> + *      notice, this list of conditions and the following disclaimer.

>> + *    - Redistributions in binary form must reproduce the above

>> + *      copyright notice, this list of conditions and the following

>> + *      disclaimer in the documentation and/or other materials provided

>> + *      with the distribution.

>> + *

>> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND

>> CONTRIBUTORS

>> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

>> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

>> FITNESS

>> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

>> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,

>> INDIRECT,

>> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

>> (INCLUDING,

>> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR

>> SERVICES;

>> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

>> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

>> STRICT

>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

>> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

>> + * POSSIBILITY OF SUCH DAMAGE.

>> + *

>> + * Effort sponsored in part by the Defense Advanced Research Projects

>> + * Agency (DARPA) and Air Force Research Laboratory, Air Force

>> + * Materiel Command, USAF, under agreement number F30602-01-2-0537.

>> + *

>> + *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $

>> + */

>> +

>> +#include <config.h>

>> +

>> +#include "conntrack-private.h"

>> +#include "ct-dpif.h"

>> +#include "dp-packet.h"

>> +#include "util.h"

>> +

>> +struct tcp_peer {

>> +    enum ct_dpif_tcp_state state;

>> +    uint32_t               seqlo;          /* Max sequence number sent     */

>> +    uint32_t               seqhi;          /* Max the other end ACKd + win */

>> +    uint16_t               max_win;        /* largest window (pre scaling) */

>> +    uint8_t                wscale;         /* window scaling factor        */

>> +};

>> +

>> +struct conn_tcp {

>> +    struct conn up;

>> +    struct tcp_peer peer[2];

>> +};

>> +

>> +enum {

>> +    TCPOPT_EOL,

>> +    TCPOPT_NOP,

>> +    TCPOPT_WINDOW = 3,

>> +};

>> +

>> +/* TCP sequence numbers are 32 bit integers operated

>> + * on with modular arithmetic.  These macros can be

>> + * used to compare such integers. */

>> +#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)

>> +#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)

>> +#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)

>> +#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)

>> +

>> +#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))

>> +#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))

>> +

>> +static struct conn_tcp*

>> +conn_tcp_cast(const struct conn* conn)

>> +{

>> +    return CONTAINER_OF(conn, struct conn_tcp, up);

>> +}

>> +

>> +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub

>> + * is enabled.  We're not scrubbing, but this check seems reasonable.  */

>> +static bool

>> +tcp_invalid_flags(uint16_t flags)

>> +{

>> +

>> +    if (flags & TCP_SYN) {

>> +        if (flags & TCP_RST) {

>> +            return true;

>> +        }

>> +        if (flags & TCP_FIN) {

>> +            /* Here pf removes the fin flag.  We simply mark the packet as

>> +             * invalid */

>> +            return true;

>> +        }

>> +    } else {

>> +        /* Illegal packet */

>> +        if (!(flags & (TCP_ACK|TCP_RST))) {

>> +            return true;

>> +        }

>> +    }

>> +

>> +    if (!(flags & TCP_ACK)) {

>> +        /* These flags are only valid if ACK is set */

>> +        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {

>> +            return true;

>> +        }

>> +    }

>> +

>> +    return false;

>> +}

>> +

>> +#define TCP_MAX_WSCALE 14

>> +#define CT_WSCALE_FLAG 0x80

>> +#define CT_WSCALE_UNKNOWN 0x40

>> +#define CT_WSCALE_MASK 0xf

>> +

>> +static uint8_t

>> +tcp_get_wscale(const struct tcp_header *tcp)

>> +{

>> +    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;

>> +    const uint8_t *opt = (const uint8_t *)(tcp + 1);

>> +    uint8_t wscale = 0;

>> +    uint8_t optlen;

>> +

>> +    while (len >= 3) {

>> +        if (*opt == TCPOPT_EOL) {

>> +            break;

>> +        }

>> +        switch (*opt) {

>> +        case TCPOPT_NOP:

>> +            opt++;

>> +            len--;

>> +            break;

>> +        case TCPOPT_WINDOW:

>> +            wscale = MIN(opt[2], TCP_MAX_WSCALE);

>> +            wscale |= CT_WSCALE_FLAG;

>> +            /* fall through */

>> +        default:

>> +            optlen = opt[1];

>> +            if (optlen < 2) {

>> +                optlen = 2;

>> +            }

>> +            len -= optlen;

>> +            opt += optlen;

>> +        }

>> +    }

>> +

>> +    return wscale;

>> +}

>> +

>> +static uint32_t

>> +tcp_payload_length(struct dp_packet *pkt)

>> +{

>> +    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)

>> +           - (char *) dp_packet_get_tcp_payload(pkt);

>> +}

>> +

>> +static void

>> +update_expiration(struct conn_tcp *conn, long long now, long long interval)

>> +{

>> +    conn->up.expiration = now + interval;

>> +}

>> +

>> +

>> +static enum ct_update_res

>> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,

>> +                long long now)

>> +{

>> +    struct conn_tcp *conn = conn_tcp_cast(conn_);

>> +    struct tcp_header *tcp = dp_packet_l4(pkt);

>> +    /* The peer that sent 'pkt' */

>> +    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];

>> +    /* The peer that should receive 'pkt' */

>> +    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];

>> +    uint8_t sws = 0, dws = 0;

>> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

>> +

>> +    uint16_t win = ntohs(tcp->tcp_winsz);

>> +    uint32_t ack, end, seq, orig_seq;

>> +    uint32_t p_len = tcp_payload_length(pkt);

>> +    int ackskew;

>> +

>> +    if (tcp_invalid_flags(tcp_flags)) {

>> +        return CT_UPDATE_INVALID;

>> +    }

>> +

>> +    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&

>> +            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&

>> +            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

>> +        src->state = dst->state = CT_DPIF_TCPS_CLOSED;

>> +        return CT_UPDATE_NEW;

>> +    }

>> +

>> +    if (src->wscale & CT_WSCALE_FLAG

>> +        && dst->wscale & CT_WSCALE_FLAG

>> +        && !(tcp_flags & TCP_SYN)) {

>> +

>> +        sws = src->wscale & CT_WSCALE_MASK;

>> +        dws = dst->wscale & CT_WSCALE_MASK;

>> +

>> +    } else if (src->wscale & CT_WSCALE_UNKNOWN

>> +        && dst->wscale & CT_WSCALE_UNKNOWN

>> +        && !(tcp_flags & TCP_SYN)) {

>> +

>> +        sws = TCP_MAX_WSCALE;

>> +        dws = TCP_MAX_WSCALE;

>> +    }

>> +

>> +    /*

>> +     * Sequence tracking algorithm from Guido van Rooij's paper:

>> +     *   http://www.madison-gurkha.com/publications/tcp_filtering/

>> +     *      tcp_filtering.ps

>> +     */

>> +

>> +    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));

>> +    if (src->state < CT_DPIF_TCPS_SYN_SENT) {

>> +        /* First packet from this end. Set its state */

>> +

>> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

>> +

>> +        end = seq + p_len;

>> +        if (tcp_flags & TCP_SYN) {

>> +            end++;

>> +            if (dst->wscale & CT_WSCALE_FLAG) {

>> +                src->wscale = tcp_get_wscale(tcp);

>> +                if (src->wscale & CT_WSCALE_FLAG) {

>> +                    /* Remove scale factor from initial window */

>> +                    sws = src->wscale & CT_WSCALE_MASK;

>> +                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);

>> +                    dws = dst->wscale & CT_WSCALE_MASK;

>> +                } else {

>> +                    /* fixup other window */

>> +                    dst->max_win <<= dst->wscale &

>> +                        CT_WSCALE_MASK;

>> +                    /* in case of a retrans SYN|ACK */

>> +                    dst->wscale = 0;

>> +                }

>> +            }

>> +        }

>> +        if (tcp_flags & TCP_FIN) {

>> +            end++;

>> +        }

>> +

>> +        src->seqlo = seq;

>> +        src->state = CT_DPIF_TCPS_SYN_SENT;

>> +        /*

>> +         * May need to slide the window (seqhi may have been set by

>> +         * the crappy stack check or if we picked up the connection

>> +         * after establishment)

>> +         */

>> +        if (src->seqhi == 1 ||

>> +                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {

>> +            src->seqhi = end + MAX(1, dst->max_win << dws);

>> +        }

>> +        if (win > src->max_win) {

>> +            src->max_win = win;

>> +        }

>> +

>> +    } else {

>> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

>> +        end = seq + p_len;

>> +        if (tcp_flags & TCP_SYN) {

>> +            end++;

>> +        }

>> +        if (tcp_flags & TCP_FIN) {

>> +            end++;

>> +        }

>> +    }

>> +

>> +    if ((tcp_flags & TCP_ACK) == 0) {

>> +        /* Let it pass through the ack skew check */

>> +        ack = dst->seqlo;

>> +    } else if ((ack == 0

>> +                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))

>> +               /* broken tcp stacks do not set ack */) {

>> +        /* Many stacks (ours included) will set the ACK number in an

>> +         * FIN|ACK if the SYN times out -- no sequence to ACK. */

>> +        ack = dst->seqlo;

>> +    }

>> +

>> +    if (seq == end) {

>> +        /* Ease sequencing restrictions on no data packets */

>> +        seq = src->seqlo;

>> +        end = seq;

>> +    }

>> +

>> +    ackskew = dst->seqlo - ack;

>> +#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge

>> factor */

>> +    if (SEQ_GEQ(src->seqhi, end)

>> +        /* Last octet inside other's window space */

>> +        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))

>> +        /* Retrans: not more than one window back */

>> +        && (ackskew >= -MAXACKWINDOW)

>> +        /* Acking not more than one reassembled fragment backwards */

>> +        && (ackskew <= (MAXACKWINDOW << sws))

>> +        /* Acking not more than one window forward */

>> +        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo

>> +            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {

>> +        /* Require an exact/+1 sequence match on resets when possible */

>> +

>> +        /* update max window */

>> +        if (src->max_win < win) {

>> +            src->max_win = win;

>> +        }

>> +        /* synchronize sequencing */

>> +        if (SEQ_GT(end, src->seqlo)) {

>> +            src->seqlo = end;

>> +        }

>> +        /* slide the window of what the other end can send */

>> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

>> +            dst->seqhi = ack + MAX((win << sws), 1);

>> +        }

>> +

>> +        /* update states */

>> +        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {

>> +                src->state = CT_DPIF_TCPS_SYN_SENT;

>> +        }

>> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {

>> +                src->state = CT_DPIF_TCPS_CLOSING;

>> +        }

>> +        if (tcp_flags & TCP_ACK) {

>> +            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {

>> +                dst->state = CT_DPIF_TCPS_ESTABLISHED;

>> +            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {

>> +                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;

>> +            }

>> +        }

>> +        if (tcp_flags & TCP_RST) {

>> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

>> +        }

>> +

>> +        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2

>> +            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

>> +            update_expiration(conn, now, 30 * 1000);

>> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

>> +                   && dst->state >= CT_DPIF_TCPS_CLOSING) {

>> +            update_expiration(conn, now, 45 * 1000);

>> +        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED

>> +                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {

>> +            update_expiration(conn, now, 30 * 1000);

>> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

>> +                   || dst->state >= CT_DPIF_TCPS_CLOSING) {

>> +            update_expiration(conn, now, 15 * 60 * 1000);

>> +        } else {

>> +            update_expiration(conn, now, 24 * 60 * 60 * 1000);

>> +        }

>> +    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT

>> +                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2

>> +                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)

>> +               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)

>> +               /* Within a window forward of the originating packet */

>> +               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {

>> +               /* Within a window backward of the originating packet */

>> +

>> +        /*

>> +         * This currently handles three situations:

>> +         *  1) Stupid stacks will shotgun SYNs before their peer

>> +         *     replies.

>> +         *  2) When PF catches an already established stream (the

>> +         *     firewall rebooted, the state table was flushed, routes

>> +         *     changed...)

>> +         *  3) Packets get funky immediately after the connection

>> +         *     closes (this should catch Solaris spurious ACK|FINs

>> +         *     that web servers like to spew after a close)

>> +         *

>> +         * This must be a little more careful than the above code

>> +         * since packet floods will also be caught here. We don't

>> +         * update the TTL here to mitigate the damage of a packet

>> +         * flood and so the same code can handle awkward establishment

>> +         * and a loosened connection close.

>> +         * In the establishment case, a correct peer response will

>> +         * validate the connection, go through the normal state code

>> +         * and keep updating the state TTL.

>> +         */

>> +

>> +        /* update max window */

>> +        if (src->max_win < win) {

>> +            src->max_win = win;

>> +        }

>> +        /* synchronize sequencing */

>> +        if (SEQ_GT(end, src->seqlo)) {

>> +            src->seqlo = end;

>> +        }

>> +        /* slide the window of what the other end can send */

>> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

>> +            dst->seqhi = ack + MAX((win << sws), 1);

>> +        }

>> +

>> +        /*

>> +         * Cannot set dst->seqhi here since this could be a shotgunned

>> +         * SYN and not an already established connection.

>> +         */

>> +

>> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {

>> +            src->state = CT_DPIF_TCPS_CLOSING;

>> +        }

>> +

>> +        if (tcp_flags & TCP_RST) {

>> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

>> +        }

>> +    } else {

>> +        return CT_UPDATE_INVALID;

>> +    }

>> +

>> +    return CT_UPDATE_VALID;

>> +}

>> +

>> +static bool

>> +tcp_valid_new(struct dp_packet *pkt)

>> +{

>> +    struct tcp_header *tcp = dp_packet_l4(pkt);

>> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

>> +

>> +    if (tcp_invalid_flags(tcp_flags)) {

>> +        return false;

>> +    }

>> +

>> +    /* A syn+ack is not allowed to create a connection.  We want to allow

>> +     * totally new connections (syn) or already established, not partially

>> +     * open (syn+ack). */

>> +    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {

>> +        return false;

>> +    }

>> +

>> +    return true;

>> +}

>> +

>> +static struct conn *

>> +tcp_new_conn(struct dp_packet *pkt, long long now)

>> +{

>> +    struct conn_tcp* newconn = NULL;

>> +    struct tcp_header *tcp = dp_packet_l4(pkt);

>> +    struct tcp_peer *src, *dst;

>> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

>> +

>> +    newconn = xzalloc(sizeof(struct conn_tcp));

>> +

>> +    src = &newconn->peer[0];

>> +    dst = &newconn->peer[1];

>> +

>> +    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));

>> +    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;

>> +

>> +    if (tcp_flags & TCP_SYN) {

>> +        src->seqhi++;

>> +        src->wscale = tcp_get_wscale(tcp);

>> +    } else {

>> +        src->wscale = CT_WSCALE_UNKNOWN;

>> +        dst->wscale = CT_WSCALE_UNKNOWN;

>> +    }

>> +    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);

>> +    if (src->wscale & CT_WSCALE_MASK) {

>> +        /* Remove scale factor from initial window */

>> +        uint8_t sws = src->wscale & CT_WSCALE_MASK;

>> +        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);

>> +    }

>> +    if (tcp_flags & TCP_FIN) {

>> +        src->seqhi++;

>> +    }

>> +    dst->seqhi = 1;

>> +    dst->max_win = 1;

>> +    src->state = CT_DPIF_TCPS_SYN_SENT;

>> +    dst->state = CT_DPIF_TCPS_CLOSED;

>> +

>> +    update_expiration(newconn, now, 30 * 1000);

>> +

>> +    return &newconn->up;

>> +}

>> +

>> +struct ct_l4_proto ct_proto_tcp = {

>> +    .new_conn = tcp_new_conn,

>> +    .valid_new = tcp_valid_new,

>> +    .conn_update = tcp_conn_update,

>> +};

>> diff --git a/lib/conntrack.c b/lib/conntrack.c

>> new file mode 100644

>> index 0000000..840335b

>> --- /dev/null

>> +++ b/lib/conntrack.c

>> @@ -0,0 +1,851 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

>> implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#include <config.h>

>> +#include "conntrack.h"

>> +

>> +#include <errno.h>

>> +#include <sys/types.h>

>> +#include <netinet/in.h>

>> +#include <netinet/icmp6.h>

>> +

>> +#include "bitmap.h"

>> +#include "conntrack-private.h"

>> +#include "dp-packet.h"

>> +#include "flow.h"

>> +#include "hmap.h"

>> +#include "netdev.h"

>> +#include "odp-netlink.h"

>> +#include "openvswitch/vlog.h"

>> +#include "ovs-rcu.h"

>> +#include "random.h"

>> +#include "timeval.h"

>> +

>> +VLOG_DEFINE_THIS_MODULE(conntrack);

>> +

>> +struct conn_lookup_ctx {

>> +    struct conn_key key;

>> +    struct conn *conn;

>> +    uint32_t hash;

>> +    bool reply;

>> +    bool related;

>> +};

>> +

>> +static bool conn_key_extract(struct conntrack *, struct dp_packet *,

>> +                             struct conn_lookup_ctx *, uint16_t zone);

>> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);

>> +static void conn_key_reverse(struct conn_key *);

>> +static void conn_key_lookup(struct conntrack *ct,

>> +                            struct conn_lookup_ctx *ctx,

>> +                            unsigned bucket,

>> +                            long long now);

>> +static bool valid_new(struct dp_packet *pkt, struct conn_key *);

>> +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,

>> +                             long long now);

>> +static void delete_conn(struct conn *);

>> +static enum ct_update_res conn_update(struct conn *, struct dp_packet*,

>> +                                      bool reply, long long now);

>> +static bool conn_expired(struct conn *, long long now);

>> +static void set_mark(struct dp_packet *, struct conn *,

>> +                     uint32_t val, uint32_t mask);

>> +static void set_label(struct dp_packet *, struct conn *,

>> +                      const struct ovs_key_ct_labels *val,

>> +                      const struct ovs_key_ct_labels *mask);

>> +

>> +static struct ct_l4_proto *l4_protos[] = {

>> +    [IPPROTO_TCP] = &ct_proto_tcp,

>> +    [IPPROTO_UDP] = &ct_proto_other,

>> +    [IPPROTO_ICMP] = &ct_proto_other,

>> +};

>> +

>> +/* Initializes the connection tracker 'ct'.  The caller is responsible for

>> + * calling 'conntrack_destroy()', when the instance is not needed anymore */

>> +void

>> +conntrack_init(struct conntrack *ct)

>> +{

>> +    unsigned i;

>> +

>> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

>> +        ct_lock_init(&ct->locks[i]);

>> +        ct_lock_lock(&ct->locks[i]);

>> +        hmap_init(&ct->connections[i]);

>> +        ct_lock_unlock(&ct->locks[i]);

>> +    }

>> +    ct->hash_basis = random_uint32();

>> +    ct->purge_bucket = 0;

>> +    ct->purge_inner_bucket = 0;

>> +    ct->purge_inner_offset = 0;

>> +}

>> +

>> +/* Destroys the connection tracker 'ct' and frees all the allocated memory. */

>> +void

>> +conntrack_destroy(struct conntrack *ct)

>> +{

>> +    unsigned i;

>> +

>> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

>> +        struct conn *conn, *next;

>> +

>> +        ct_lock_lock(&ct->locks[i]);

>> +        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) {

>> +            hmap_remove(&ct->connections[i], &conn->node);

>> +            delete_conn(conn);

>> +        }

>> +        hmap_destroy(&ct->connections[i]);

>> +        ct_lock_unlock(&ct->locks[i]);

>> +        ct_lock_destroy(&ct->locks[i]);

>> +    }

>> +}

>> +

>

>> +static unsigned hash_to_bucket(uint32_t hash)

>> +{

>> +    /* Extracts the most significant bits in hash. The least significant bits

>> +     * are already used internally by the hmap implementation. */

>> +    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 &&

>> CONNTRACK_BUCKETS_SHIFT >= 1);

>> +

>> +    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) %

>> CONNTRACK_BUCKETS;

>> +}

>> +

>> +static void

>> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,

>> +            uint32_t mark, ovs_u128 label)

>> +{

>> +    pkt->md.ct_state = state | CS_TRACKED;

>> +    pkt->md.ct_zone = zone;

>> +    pkt->md.ct_mark = mark;

>> +    pkt->md.ct_label = label;

>> +}

>> +

>> +static struct conn *

>> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

>> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,

>> +               long long now)

>> +{

>> +    unsigned bucket = hash_to_bucket(ctx->hash);

>> +    struct conn *nc = NULL;

>> +

>> +    if (!valid_new(pkt, &ctx->key)) {

>> +        *state |= CS_INVALID;

>> +        return nc;

>> +    }

>> +

>> +    *state |= CS_NEW;

>> +

>> +    if (commit) {

>> +        nc = new_conn(pkt, &ctx->key, now);

>> +

>> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

>> +

>> +        conn_key_reverse(&nc->rev_key);

>> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);

>> +    }

>> +

>> +    return nc;

>> +}

>> +

>> +static struct conn *

>> +process_one(struct conntrack *ct, struct dp_packet *pkt,

>> +            struct conn_lookup_ctx *ctx, uint16_t zone,

>> +            bool commit, long long now)

>> +{

>> +    unsigned bucket = hash_to_bucket(ctx->hash);

>> +    struct conn *conn = ctx->conn;

>> +    uint8_t state = 0;

>> +

>> +    if (conn) {

>> +        if (ctx->related) {

>> +            state |= CS_RELATED;

>> +            if (ctx->reply) {

>> +                state |= CS_REPLY_DIR;

>> +            }

>> +        } else {

>> +            enum ct_update_res res;

>> +

>> +            res = conn_update(conn, pkt, ctx->reply, now);

>> +

>> +            switch (res) {

>> +            case CT_UPDATE_VALID:

>> +                state |= CS_ESTABLISHED;

>> +                if (ctx->reply) {

>> +                    state |= CS_REPLY_DIR;

>> +                }

>> +                break;

>> +            case CT_UPDATE_INVALID:

>> +                state |= CS_INVALID;

>> +                break;

>> +            case CT_UPDATE_NEW:

>> +                hmap_remove(&ct->connections[bucket], &conn->node);

>> +                delete_conn(conn);

>> +                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

>> +                break;

>

>[Antonio F]

>I see that conn_update can return just CT_UPDATE_VALID/INVALID/NEW

>but should we consider a 'default' case here, eg

>default:

>    state |= CS_INVALID;

>    break;

>?


I'm not too worried about that because if we add another value to the
enum without explicitly handling it here we will get a warning.

Is there a particular case you're worried about?

>

>

>> +            }

>> +        }

>> +

>> +        pkt->md.ct_label = conn->label;

>> +        pkt->md.ct_mark = conn->mark;

>

>[Antonio F]

>Can we skip the previous 2 lines, since write_ct_md already sets these fields?


Yes we can, thank you for spotting this!

>

>

>> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);

>> +    } else {

>> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

>> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});

>> +    }

>> +

>> +    return conn;

>> +}

>> +

>> +/* Sends a group of 'cnt' packets ('pkts') through the connection tracker

>> + * 'ct'.  If 'commit' is true, the packets are allowed to create new entries

>> + * in the connection tables.  'setmark', if not NULL, should point to a two

>> + * elements array containing a value and a mask to set the connection mark.

>> + * 'setlabel' behaves similarly for the connection label.*/

>> +int

>> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt,

>> +                  bool commit, uint16_t zone, const uint32_t *setmark,

>> +                  const struct ovs_key_ct_labels *setlabel,

>> +                  const char *helper)

>> +{

>> +#if !defined(__CHECKER__) && !defined(_WIN32)

>> +    const size_t KEY_ARRAY_SIZE = cnt;

>> +#else

>> +    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };

>> +#endif

>> +    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];

>> +    int8_t bucket_list[CONNTRACK_BUCKETS];

>> +    struct {

>> +        unsigned bucket;

>> +        unsigned long maps;

>> +    } arr[KEY_ARRAY_SIZE];

>> +    long long now = time_msec();

>> +    size_t i = 0;

>> +    uint8_t arrcnt = 0;

>> +

>> +    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >=

>> NETDEV_MAX_BURST);

>> +

>> +    if (helper) {

>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

>> +

>> +        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);

>> +        /* Continue without the helper */

>> +    }

>> +

>> +    memset(bucket_list, INT8_C(-1), sizeof bucket_list);

>> +    for (i = 0; i < cnt; i++) {

>> +        unsigned bucket;

>> +

>> +        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {

>> +            write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}});

>> +            continue;

>> +        }

>> +

>> +        bucket = hash_to_bucket(ctxs[i].hash);

>> +        if (bucket_list[bucket] == INT8_C(-1)) {

>> +            bucket_list[bucket] = arrcnt;

>> +

>> +            arr[arrcnt].maps = 0;

>> +            ULLONG_SET1(arr[arrcnt].maps, i);

>> +            arr[arrcnt++].bucket = bucket;

>> +        } else {

>> +            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);

>> +            arr[bucket_list[bucket]].maps |= 1UL << i;

>> +        }

>> +    }

>> +

>> +    for (i = 0; i < arrcnt; i++) {

>> +        size_t j;

>> +

>> +        ct_lock_lock(&ct->locks[arr[i].bucket]);

>> +

>> +        ULLONG_FOR_EACH_1(j, arr[i].maps) {

>> +            struct conn *conn;

>> +

>> +            conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now);

>> +

>> +            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);

>> +

>> +            if (conn && setmark) {

>> +                set_mark(pkts[j], conn, setmark[0], setmark[1]);

>> +            }

>> +

>> +            if (conn && setlabel) {

>> +                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);

>> +            }

>> +        }

>> +        ct_lock_unlock(&ct->locks[arr[i].bucket]);

>> +    }

>> +

>> +    return 0;

>> +}

>> +

>> +static void

>> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t

>> mask)

>> +{

>> +    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));

>> +    conn->mark = pkt->md.ct_mark;

>> +}

>> +

>> +static void

>> +set_label(struct dp_packet *pkt, struct conn *conn,

>> +          const struct ovs_key_ct_labels *val,

>> +          const struct ovs_key_ct_labels *mask)

>> +{

>> +    ovs_u128 v, m;

>> +

>> +    memcpy(&v, val, sizeof v);

>> +    memcpy(&m, mask, sizeof m);

>> +

>> +    pkt->md.ct_label.u64.lo = v.u64.lo

>> +                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));

>> +    pkt->md.ct_label.u64.hi = v.u64.hi

>> +                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));

>> +    conn->label = pkt->md.ct_label;

>> +}

>> +

>

>> +#define CONNTRACK_PURGE_NUM 256

>> +

>> +static void

>> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,

>> +             uint32_t *inner_offset, unsigned *left, long long now)

>> +{

>> +    while (*left != 0) {

>> +        struct hmap_node *node;

>> +        struct conn *conn;

>> +

>> +        node = hmap_at_position(bucket, inner_bucket, inner_offset);

>> +

>> +        if (!node) {

>> +            hmap_shrink(bucket);

>> +            break;

>> +        }

>> +

>> +        INIT_CONTAINER(conn, node, node);

>> +        if (conn_expired(conn, now)) {

>> +            hmap_remove(bucket, &conn->node);

>> +            delete_conn(conn);

>> +            (*left)--;

>> +        }

>> +    }

>> +}

>> +

>> +/* Cleans up old connection entries.  Should be called periodically. */

>> +void

>> +conntrack_run(struct conntrack *ct)

>> +{

>> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);

>> +    uint32_t inner_bucket = ct->purge_inner_bucket,

>> +             inner_offset = ct->purge_inner_offset;

>> +    unsigned left = CONNTRACK_PURGE_NUM;

>> +    long long now = time_msec();

>> +

>> +    while (bucket < CONNTRACK_BUCKETS) {

>> +        ct_lock_lock(&ct->locks[bucket]);

>> +        sweep_bucket(&ct->connections[bucket],

>> +                     &inner_bucket, &inner_offset,

>> +                     &left, now);

>> +        ct_lock_unlock(&ct->locks[bucket]);

>> +

>> +        if (left == 0) {

>> +            break;

>> +        } else {

>> +            bucket++;

>> +        }

>> +    }

>> +

>> +    ct->purge_bucket = bucket;

>> +    ct->purge_inner_bucket = inner_bucket;

>> +    ct->purge_inner_offset = inner_offset;

>> +}

>> +

>

>> +/* Key extraction */

>> +

>> +/* The function stores a pointer to the first byte after the header in

>> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is

>> + * not interested in the header's tail,  meaning that the header has

>> + * already been parsed (e.g. by flow_extract): we take this as a hint to

>> + * save a few checks. */

>> +static inline bool

>> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,

>> +                const char **new_data)

>> +{

>> +    const struct ip_header *ip = data;

>> +

>> +    if (new_data) {

>> +        size_t ip_len;

>> +

>> +        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {

>> +            return false;

>> +        }

>> +        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

>> +

>> +        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {

>> +            return false;

>> +        }

>> +        if (OVS_UNLIKELY(size < ip_len)) {

>> +            return false;

>> +        }

>> +

>> +        *new_data = (char *) data + ip_len;

>> +    }

>> +

>> +    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {

>> +        return false;

>> +    }

>> +

>> +    key->src.addr.ipv4 = ip->ip_src;

>> +    key->dst.addr.ipv4 = ip->ip_dst;

>> +    key->nw_proto = ip->ip_proto;

>> +

>> +    return true;

>> +}

>> +

>> +/* The function stores a pointer to the first byte after the header in

>> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is

>> + * not interested in the header's tail,  meaning that the header has

>> + * already been parsed (e.g. by flow_extract): we take this as a hint to

>> + * save a few checks. */

>> +static inline bool

>> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,

>> +                const char **new_data)

>> +{

>> +    const struct ovs_16aligned_ip6_hdr *ip6 = data;

>> +    uint8_t nw_proto = ip6->ip6_nxt;

>> +    uint8_t nw_frag = 0;

>> +

>> +    if (new_data) {

>> +        if (OVS_UNLIKELY(size < sizeof *ip6)) {

>> +            return false;

>> +        }

>> +    }

>> +

>> +    data = ip6 + 1;

>> +    size -=  sizeof *ip6;

>> +

>> +    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {

>> +        return false;

>> +    }

>> +

>> +    if (new_data) {

>> +        *new_data = data;

>> +    }

>> +

>> +    if (nw_frag) {

>> +        return false;

>> +    }

>> +

>> +    key->src.addr.ipv6 = ip6->ip6_src;

>> +    key->dst.addr.ipv6 = ip6->ip6_dst;

>> +    key->nw_proto = nw_proto;

>> +

>> +    return true;

>> +}

>> +

>> +static inline bool

>> +check_l4_tcp(const void *data, size_t size)

>> +{

>> +    const struct tcp_header *tcp = data;

>> +    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

>> +

>> +    if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) {

>> +        return true;

>> +    }

>> +

>> +    return false;

>> +}

>> +

>> +static inline bool

>> +extract_l4_tcp(struct conn_key *key, const void *data, size_t size)

>> +{

>> +    const struct tcp_header *tcp = data;

>> +

>> +    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {

>> +        return false;

>> +    }

>> +

>> +    key->src.port = tcp->tcp_src;

>> +    key->dst.port = tcp->tcp_dst;

>> +

>> +    return true;

>> +}

>> +

>> +static inline bool

>> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)

>> +{

>> +    const struct udp_header *udp = data;

>> +

>> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {

>> +        return false;

>> +    }

>> +

>> +    key->src.port = udp->udp_src;

>> +    key->dst.port = udp->udp_dst;

>> +

>> +    return true;

>> +}

>> +

>> +static inline bool extract_l4(struct conn_key *key, const void *data,

>> +                              size_t size, bool *related);

>> +

>> +/* If 'related' is not NULL and the function is processing an ICMP

>> + * error packet, extract the l3 and l4 fields from the nested header

>> + * instead and set *related to true.  If 'related' is NULL we're

>> + * already processing a nested header and no such recursion is

>> + * possible */

>> +static inline int

>> +extract_l4_icmp(struct conn_key *key, const void *data, size_t size,

>> +                bool *related)

>> +{

>> +    const struct icmp_header *icmp = data;

>> +

>> +    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {

>> +        return false;

>> +    }

>> +

>> +    switch (icmp->icmp_type) {

>> +    case ICMP4_ECHO_REQUEST:

>> +    case ICMP4_ECHO_REPLY:

>> +    case ICMP4_TIMESTAMP:

>> +    case ICMP4_TIMESTAMPREPLY:

>> +    case ICMP4_INFOREQUEST:

>> +    case ICMP4_INFOREPLY:

>> +        /* Separate ICMP connection: identified using id */

>> +        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;

>> +        break;

>> +    case ICMP4_DST_UNREACH:

>> +    case ICMP4_TIME_EXCEEDED:

>> +    case ICMP4_PARAM_PROB:

>> +    case ICMP4_SOURCEQUENCH:

>> +    case ICMP4_REDIRECT: {

>> +        /* ICMP packet part of another connection. We should

>> +         * extract the key from embedded packet header */

>> +        struct conn_key inner_key;

>> +        const char *l3 = (const char *) (icmp + 1);

>> +        const char *tail = (const char *) data + size;

>> +        const char *l4;

>> +        bool ok;

>> +

>> +        if (!related) {

>> +            return false;

>> +        }

>> +        *related = true;

>> +

>> +        memset(&inner_key, 0, sizeof inner_key);

>> +        inner_key.dl_type = htons(ETH_TYPE_IP);

>> +        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);

>> +        if (!ok) {

>> +            return false;

>> +        }

>> +

>> +        /* pf doesn't do this, but it seems a good idea */

>> +        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned

>> +            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {

>> +            return false;

>> +        }

>> +

>> +        key->src = inner_key.src;

>> +        key->dst = inner_key.dst;

>> +        key->nw_proto = inner_key.nw_proto;

>> +

>> +        ok = extract_l4(key, l4, tail - l4, NULL);

>> +        if (ok) {

>> +            conn_key_reverse(key);

>> +        }

>> +        return ok;

>> +    }

>> +    default:

>> +        return false;

>> +    }

>> +

>> +    return true;

>> +}

>> +

>> +/* If 'related' is not NULL and the function is processing an ICMP

>> + * error packet, extract the l3 and l4 fields from the nested header

>> + * instead and set *related to true.  If 'related' is NULL we're

>> + * already processing a nested header and no such recursion is

>> + * possible */

>> +static inline bool

>> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

>> +                 bool *related)

>> +{

>> +    const struct icmp6_header *icmp6 = data;

>> +

>> +    /* All the messages that we support need at least 4 bytes after

>> +     * the header */

>> +    if (size < sizeof *icmp6 + 4) {

>> +        return false;

>> +    }

>> +

>> +    switch (icmp6->icmp6_type) {

>> +    case ICMP6_ECHO_REQUEST:

>> +    case ICMP6_ECHO_REPLY:

>> +        /* Separate ICMP connection: identified using id */

>> +        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);

>> +        break;

>> +    case ICMP6_DST_UNREACH:

>> +    case ICMP6_PACKET_TOO_BIG:

>> +    case ICMP6_TIME_EXCEEDED:

>> +    case ICMP6_PARAM_PROB:{

>> +        /* ICMP packet part of another connection. We should

>> +         * extract the key from embedded packet header */

>> +        struct conn_key inner_key;

>> +        const char *l3 = (const char *) icmp6 + 8;

>> +        const char *tail = (const char *) data + size;

>> +        const char *l4 = NULL;

>> +        bool ok;

>> +

>> +        if (!related) {

>> +            return false;

>> +        }

>> +        *related = true;

>> +

>> +        memset(&inner_key, 0, sizeof inner_key);

>> +        inner_key.dl_type = htons(ETH_TYPE_IPV6);

>> +        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);

>> +        if (!ok) {

>> +            return false;

>> +        }

>> +

>> +        /* pf doesn't do this, but it seems a good idea */

>> +        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,

>> +                              &key->dst.addr.ipv6_aligned)

>> +            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,

>> +                                 &key->src.addr.ipv6_aligned)) {

>> +            return false;

>> +        }

>> +

>> +        key->src = inner_key.src;

>> +        key->dst = inner_key.dst;

>> +        key->nw_proto = inner_key.nw_proto;

>> +

>> +        ok = extract_l4(key, l4, tail - l4, NULL);

>> +        if (ok) {

>> +            conn_key_reverse(key);

>> +        }

>> +        return ok;

>> +    }

>> +    default:

>> +        return false;

>> +    }

>> +

>> +    return true;

>> +}

>> +

>> +/* Extract l4 fields into 'key', which must already contain valid l3

>> + * members.  If 'related' is not NULL and an ICMP error packet is being

>> + * processed, the function will extract the key from the packet nested

>> + * in the ICMP payload and set '*related' to true.  If 'related' is NULL,

>> + * nested parsing isn't allowed.  This is necessary to limit the

>> + * recursion level. */

>> +static inline bool

>> +extract_l4(struct conn_key *key, const void *data, size_t size, bool *related)

>> +{

>> +    if (key->nw_proto == IPPROTO_TCP) {

>> +        return extract_l4_tcp(key, data, size)

>> +               && (!related || check_l4_tcp(data, size));

>> +    } else if (key->nw_proto == IPPROTO_UDP) {

>> +        return extract_l4_udp(key, data, size);

>> +    } else if (key->dl_type == htons(ETH_TYPE_IP)

>> +               && key->nw_proto == IPPROTO_ICMP) {

>> +        return extract_l4_icmp(key, data, size, related);

>> +    } else if (key->dl_type == htons(ETH_TYPE_IPV6)

>> +               && key->nw_proto == IPPROTO_ICMPV6) {

>> +        return extract_l4_icmp6(key, data, size, related);

>> +    } else {

>> +        return false;

>> +    }

>> +}

>> +

>> +static bool

>> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,

>> +                 struct conn_lookup_ctx *ctx, uint16_t zone)

>> +{

>> +    const struct eth_header *l2 = dp_packet_l2(pkt);

>> +    const struct ip_header *l3 = dp_packet_l3(pkt);

>> +    const char *l4 = dp_packet_l4(pkt);

>> +    const char *tail = dp_packet_tail(pkt);

>> +    bool ok;

>> +

>> +    memset(ctx, 0, sizeof *ctx);

>> +

>> +    if (!l2 || !l3 || !l4) {

>> +        return false;

>> +    }

>> +

>> +    ctx->key.zone = zone;

>> +

>> +    /* XXX In this function we parse the packet (again, it has already

>> +     * gone through miniflow_extract()) for two reasons:

>> +     *

>> +     * 1) To extract the l3 addresses and l4 ports.

>> +     *    We already have the l3 and l4 headers' pointers.  Extracting

>> +     *    the l3 addresses and the l4 ports is really cheap, since they

>> +     *    can be found at fixed locations.

>> +     * 2) To extract the l3 and l4 types.

>> +     *    Extracting the l3 and l4 types (especially the l3[1]) on the

>> +     *    other hand is quite expensive, because they're not at a

>> +     *    fixed location.

>> +     *

>> +     * Here's a way to avoid (2) with the help of the datapath.

>> +     * The datapath doesn't keep the packet's extracted flow[2], so

>> +     * using that is not an option.  We could use the packet's matching

>> +     * megaflow, but we have to make sure that the l3 and l4 types

>> +     * are unwildcarded.  This means either:

>> +     *

>> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow

>> +     *    is installed if the actions contains ct().  This is what the

>> +     *    kernel datapath does.  It is not so straightforward, though.

>> +     *

>> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when

>> +     *    translating a ct() action.  This is already done in different

>> +     *    actions and since both the userspace and the kernel datapath

>> +     *    would benefit from it, it seems an appropriate place to do

>> +     *    it.

>> +     *

>> +     * ---

>> +     * [1] A simple benchmark (running only the connection tracker

>> +     *     over and over on the same packets) shows that if the

>> +     *     l3 type is already provided we are 15% faster (running the

>> +     *     connection tracker over a couple of DPDK devices with a

>> +     *     stream of UDP 64-bytes packets shows that we are 4% faster).

>> +     *

>> +     * [2] The reasons for this are that keeping the flow increases

>> +     *     (slightly) the cache footprint and increases computation

>> +     *     time as we move the packet around. Most importantly, the flow

>> +     *     should be updated by the actions and this can be slow, as

>> +     *     we use a sparse representation (miniflow).

>> +     *

>> +     */

>> +    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);

>> +    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {

>

>[Antonio F] In case we specify a flow with VLAN + CT like

>add_flow brX table=0,in_port=1,vlan_tci=0xf123,tcp,ct_state=-trk,action=ct(commit,zone=9),2

>should we also consider ETH_TYPE_VLAN in the previous conditional statement?


parse_dl_type() should skip vlans and return the L3 type. The connection tracker
is interested in what's beyond the vlan tag.

>

>

>> +        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL);

>> +    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {

>> +        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);

>> +    } else {

>> +        ok = false;

>> +    }

>> +

>> +    if (ok) {

>> +        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) {

>> +            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);

>> +            return true;

>> +        }

>> +    }

>> +

>> +    return false;

>> +}

>> +

>

>> +/* Symmetric */

>> +static uint32_t

>> +conn_key_hash(const struct conn_key *key, uint32_t basis)

>> +{

>> +    uint32_t hsrc, hdst, hash;

>> +    int i;

>> +

>> +    hsrc = hdst = basis;

>> +

>> +    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {

>> +        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);

>> +        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);

>> +    }

>> +

>> +    hash = hsrc ^ hdst;

>> +

>> +    hash = hash_words((uint32_t *) &key->dst + 1,

>> +                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),

>> +                      hash);

>

>[Antonio F]

>Is that to involve dl_type, nw_proto and zone in the hash computation?


Yes, I'll add a comment to make that clear

>

>

>> +

>> +    return hash;

>> +}

>> +

>> +static void

>> +conn_key_reverse(struct conn_key *key)

>> +{

>> +    struct ct_endpoint tmp;

>> +    tmp = key->src;

>> +    key->src = key->dst;

>> +    key->dst = tmp;

>> +}

>> +

>> +static void

>> +conn_key_lookup(struct conntrack *ct,

>> +                struct conn_lookup_ctx *ctx,

>> +                unsigned bucket,

>> +                long long now)

>> +{

>> +    struct conn *conn, *found = NULL;

>> +    uint32_t hash = ctx->hash;

>> +    bool reply;

>> +

>> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct-

>> >connections[bucket]) {

>> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {

>> +            found = conn;

>> +            reply = false;

>> +            break;

>> +        }

>> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) {

>> +            found = conn;

>> +            reply = true;

>> +            break;

>> +        }

>> +    }

>> +

>> +    if (found) {

>> +        if (conn_expired(found, now)) {

>> +            found = NULL;

>> +        } else {

>> +            ctx->reply = reply;

>> +        }

>> +    }

>> +

>> +    ctx->conn = found;

>> +}

>> +

>> +static enum ct_update_res

>> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,

>> +            long long now)

>> +{

>> +    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply,

>> now);

>> +}

>> +

>> +static bool

>> +conn_expired(struct conn *conn, long long now)

>> +{

>> +    return now >= conn->expiration;

>> +}

>> +

>> +static bool

>> +valid_new(struct dp_packet *pkt, struct conn_key *key)

>> +{

>> +    return l4_protos[key->nw_proto]->valid_new(pkt);

>> +}

>> +

>> +static struct conn *

>> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)

>> +{

>> +    struct conn *newconn;

>> +

>> +    newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);

>> +

>> +    if (newconn) {

>> +        newconn->key = *key;

>> +    }

>> +

>> +    return newconn;

>> +}

>> +

>> +static void

>> +delete_conn(struct conn *conn)

>> +{

>> +    free(conn);

>> +}

>> diff --git a/lib/conntrack.h b/lib/conntrack.h

>> new file mode 100644

>> index 0000000..8561273

>> --- /dev/null

>> +++ b/lib/conntrack.h

>> @@ -0,0 +1,144 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or

>> implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#ifndef CONNTRACK_H

>> +#define CONNTRACK_H 1

>> +

>> +#include <stdbool.h>

>> +

>> +#include "hmap.h"

>> +#include "netdev-dpdk.h"

>> +#include "odp-netlink.h"

>> +#include "openvswitch/thread.h"

>> +#include "openvswitch/types.h"

>> +

>> +

>> +struct dp_packet;

>> +

>> +/* Userspace connection tracker

>> + * ============================

>> + *

>> + * This is a connection tracking module that keeps all the state in userspace.

>> + *

>> + * Usage

>> + * =====

>> + *

>> + *     struct conntrack ct;

>> + *

>> + * Initialization:

>> + *

>> + *     conntrack_init(&ct);

>> + *

>> + * It is necessary to periodically issue a call to

>> + *

>> + *     conntrack_run(&ct);

>> + *

>> + * to allow the module to clean up expired connections.

>> + *

>> + * To send a group of packets through the connection tracker:

>> + *

>> + *     conntrack_execute(&ct, pkts, n_pkts, ...);

>> + *

>> + * Thread-safety

>> + * =============

>> + *

>> + * conntrack_execute() can be called by multiple threads simultaneously.

>> + */

>> +

>> +struct conntrack;

>> +

>> +void conntrack_init(struct conntrack *);

>> +void conntrack_run(struct conntrack *);

>> +void conntrack_destroy(struct conntrack *);

>> +

>> +int conntrack_execute(struct conntrack *, struct dp_packet **, size_t,

>> +                      bool commit, uint16_t zone, const uint32_t *setmark,

>> +                      const struct ovs_key_ct_labels *setlabel,

>> +                      const char *helper);

>> +

>

>> +/* struct ct_lock is a standard mutex or a spinlock when using DPDK */

>> +

>> +#ifdef DPDK_NETDEV

>> +struct OVS_LOCKABLE ct_lock {

>> +    rte_spinlock_t lock;

>> +};

>> +

>> +static inline void ct_lock_init(struct ct_lock *lock)

>> +{

>> +    rte_spinlock_init(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_lock(struct ct_lock *lock)

>> +    OVS_ACQUIRES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    rte_spinlock_lock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_unlock(struct ct_lock *lock)

>> +    OVS_RELEASES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    rte_spinlock_unlock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED)

>> +{

>> +}

>> +#else

>> +struct OVS_LOCKABLE ct_lock {

>> +    struct ovs_mutex lock;

>> +};

>> +

>> +static inline void ct_lock_init(struct ct_lock *lock)

>> +{

>> +    ovs_mutex_init(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_lock(struct ct_lock *lock)

>> +    OVS_ACQUIRES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    ovs_mutex_lock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_unlock(struct ct_lock *lock)

>> +    OVS_RELEASES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    ovs_mutex_unlock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_destroy(struct ct_lock *lock)

>> +{

>> +    ovs_mutex_destroy(&lock->lock);

>> +}

>> +#endif

>> +

>

>> +#define CONNTRACK_BUCKETS_SHIFT 8

>> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)

>> +

>> +struct conntrack {

>> +    /* Each lock guards a 'connections' bucket */

>> +    struct ct_lock locks[CONNTRACK_BUCKETS];

>> +    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;

>> +    uint32_t hash_basis;

>> +    unsigned purge_bucket;

>> +    uint32_t purge_inner_bucket;

>> +    uint32_t purge_inner_offset;

>> +};

>> +

>> +#endif /* conntrack.h */

>> --

>> 2.1.4

>> 

>> _______________________________________________

>> dev mailing list

>> dev@openvswitch.org

>> http://openvswitch.org/mailman/listinfo/dev
Daniele Di Proietto April 27, 2016, 6:36 a.m. UTC | #6
Thank you for your comments, Flavio!



On 26/04/2016 16:35, "Flavio Leitner" <fbl@sysclose.org> wrote:

>On Fri, Apr 15, 2016 at 05:02:36PM -0700, Daniele Di Proietto wrote:

>> This commit adds the conntrack module.

>> 

>> It is a connection tracker that resides entirely in userspace.  Its

>> primary user will be the dpif-netdev datapath.

>> 

>> The module main goal is to provide conntrack_execute(), which offers a

>> convenient interface to implement the datapath ct() action.

>> 

>> The conntrack module uses two submodules to deal with the l4 protocol

>> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

>> 

>> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf

>> subsystem, therefore it's BSD licensed.  It has been slightly altered to

>> match the OVS coding style and to allow the pickup of already

>> established connections.

>> 

>> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

>> ---

>>  COPYING                 |   1 +

>>  debian/copyright.in     |   4 +

>>  lib/automake.mk         |   5 +

>>  lib/conntrack-other.c   |  91 ++++++

>>  lib/conntrack-private.h |  77 +++++

>>  lib/conntrack-tcp.c     | 476 +++++++++++++++++++++++++++

>>  lib/conntrack.c         | 851 ++++++++++++++++++++++++++++++++++++++++++++++++

>>  lib/conntrack.h         | 144 ++++++++

>>  8 files changed, 1649 insertions(+)

>>  create mode 100644 lib/conntrack-other.c

>>  create mode 100644 lib/conntrack-private.h

>>  create mode 100644 lib/conntrack-tcp.c

>>  create mode 100644 lib/conntrack.c

>>  create mode 100644 lib/conntrack.h

>> 

>> diff --git a/COPYING b/COPYING

>> index 308e3ea..afb98b9 100644

>> --- a/COPYING

>> +++ b/COPYING

>> @@ -25,6 +25,7 @@ License, version 2.

>>  The following files are licensed under the 2-clause BSD license.

>>      include/windows/getopt.h

>>      lib/getopt_long.c

>> +    lib/conntrack-tcp.c

>>  

>>  The following files are licensed under the 3-clause BSD-license

>>      include/windows/netinet/icmp6.h

>> diff --git a/debian/copyright.in b/debian/copyright.in

>> index 57d007a..a15f4dd 100644

>> --- a/debian/copyright.in

>> +++ b/debian/copyright.in

>> @@ -21,6 +21,9 @@ Upstream Copyright Holders:

>>  	Copyright (c) 2014 Michael Chapman

>>  	Copyright (c) 2014 WindRiver, Inc.

>>  	Copyright (c) 2014 Avaya, Inc.

>> +	Copyright (c) 2001 Daniel Hartmeier

>> +	Copyright (c) 2002 - 2008 Henning Brauer

>> +	Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

>>  

>>  License:

>>  

>> @@ -90,6 +93,7 @@ License:

>>  	lib/getopt_long.c

>>  	include/windows/getopt.h

>>  	datapath-windows/ovsext/Conntrack-tcp.c

>> +	lib/conntrack-tcp.c

>>  

>>  * The following files are licensed under the 3-clause BSD-license

>>  

>> diff --git a/lib/automake.mk b/lib/automake.mk

>> index 1ec2115..ba30442 100644

>> --- a/lib/automake.mk

>> +++ b/lib/automake.mk

>> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \

>>  	lib/compiler.h \

>>  	lib/connectivity.c \

>>  	lib/connectivity.h \

>> +	lib/conntrack-private.h \

>> +	lib/conntrack-tcp.c \

>> +	lib/conntrack-other.c \

>> +	lib/conntrack.c \

>> +	lib/conntrack.h \

>>  	lib/coverage.c \

>>  	lib/coverage.h \

>>  	lib/crc32c.c \

>> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c

>> new file mode 100644

>> index 0000000..65d02a9

>> --- /dev/null

>> +++ b/lib/conntrack-other.c

>> @@ -0,0 +1,91 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#include <config.h>

>> +

>> +#include "conntrack-private.h"

>> +#include "dp-packet.h"

>> +

>> +enum other_state {

>> +    OTHERS_FIRST,

>> +    OTHERS_MULTIPLE,

>> +    OTHERS_BIDIR,

>> +};

>> +

>> +struct conn_other {

>> +    struct conn up;

>> +    enum other_state state;

>> +};

>> +

>> +static const long long other_timeouts[] = {

>> +    [OTHERS_FIRST] = 60 * 1000,

>> +    [OTHERS_MULTIPLE] = 60 * 1000,

>> +    [OTHERS_BIDIR] = 30 * 1000,

>> +};

>

>I missed a description here, like the unit.


Right, I added a comment.

>

>> +

>> +static struct conn_other *

>> +conn_other_cast(const struct conn *conn)

>> +{

>> +    return CONTAINER_OF(conn, struct conn_other, up);

>> +}

>> +

>> +static void

>> +update_expiration(struct conn_other *conn, long long now)

>> +{

>> +    conn->up.expiration = now + other_timeouts[conn->state];

>> +}

>> +

>> +static enum ct_update_res

>> +other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,

>> +                bool reply, long long now)

>> +{

>> +    struct conn_other *conn = conn_other_cast(conn_);

>> +

>> +    if (reply && conn->state != OTHERS_BIDIR) {

>> +        conn->state = OTHERS_BIDIR;

>> +    } else if (conn->state == OTHERS_FIRST) {

>> +        conn->state = OTHERS_MULTIPLE;

>> +    }

>> +

>> +    update_expiration(conn, now);

>> +

>> +    return CT_UPDATE_VALID;

>> +}

>> +

>> +static bool

>> +other_valid_new(struct dp_packet *pkt OVS_UNUSED)

>> +{

>> +    return true;

>> +}

>> +

>> +static struct conn *

>> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)

>> +{

>> +    struct conn_other *conn;

>> +

>> +    conn = xzalloc(sizeof(struct conn_other));

>> +    conn->state = OTHERS_FIRST;

>> +

>> +    update_expiration(conn, now);

>> +

>> +    return &conn->up;

>> +}

>> +

>> +struct ct_l4_proto ct_proto_other = {

>> +    .new_conn = other_new_conn,

>> +    .valid_new = other_valid_new,

>> +    .conn_update = other_conn_update,

>> +};

>> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h

>> new file mode 100644

>> index 0000000..e668c44

>> --- /dev/null

>> +++ b/lib/conntrack-private.h

>> @@ -0,0 +1,77 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#ifndef CONNTRACK_PRIVATE_H

>> +#define CONNTRACK_PRIVATE_H 1

>> +

>> +#include <sys/types.h>

>> +#include <netinet/in.h>

>> +#include <netinet/ip6.h>

>> +

>> +#include "hmap.h"

>> +#include "openvswitch/types.h"

>> +#include "packets.h"

>> +#include "unaligned.h"

>> +

>> +struct ct_addr {

>> +    union {

>> +        ovs_16aligned_be32 ipv4;

>> +        union ovs_16aligned_in6_addr ipv6;

>> +        ovs_be32 ipv4_aligned;

>> +        struct in6_addr ipv6_aligned;

>> +    };

>> +};

>> +

>> +struct ct_endpoint {

>> +    struct ct_addr addr;

>> +    ovs_be16 port;

>> +};

>> +

>> +struct conn_key {

>> +    struct ct_endpoint src;

>> +    struct ct_endpoint dst;

>> +

>> +    ovs_be16 dl_type;

>> +    uint8_t nw_proto;

>> +    uint16_t zone;

>> +};

>

>Based on the above I presume we consider all IPs in the same space

>regardless of the vlan, correct?


Yes, I think this is what the kernel connection tracker does.

>

>

>> +

>> +struct conn {

>> +    struct conn_key key;

>> +    struct conn_key rev_key;

>> +    long long expiration;

>> +    struct hmap_node node;

>> +    uint32_t mark;

>> +    ovs_u128 label;

>> +};

>> +

>> +enum ct_update_res {

>> +    CT_UPDATE_INVALID,

>> +    CT_UPDATE_VALID,

>> +    CT_UPDATE_NEW,

>> +};

>> +

>> +struct ct_l4_proto {

>> +    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);

>> +    bool (*valid_new)(struct dp_packet *pkt);

>> +    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,

>> +                                      bool reply, long long now);

>> +};

>> +

>> +extern struct ct_l4_proto ct_proto_tcp;

>> +extern struct ct_l4_proto ct_proto_other;

>> +

>> +#endif /* conntrack-private.h */

>> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c

>> new file mode 100644

>> index 0000000..4d80038

>> --- /dev/null

>> +++ b/lib/conntrack-tcp.c

>> @@ -0,0 +1,476 @@

>> +/*-

>> + * Copyright (c) 2001 Daniel Hartmeier

>> + * Copyright (c) 2002 - 2008 Henning Brauer

>> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + * All rights reserved.

>> + *

>> + * Redistribution and use in source and binary forms, with or without

>> + * modification, are permitted provided that the following conditions

>> + * are met:

>> + *

>> + *    - Redistributions of source code must retain the above copyright

>> + *      notice, this list of conditions and the following disclaimer.

>> + *    - Redistributions in binary form must reproduce the above

>> + *      copyright notice, this list of conditions and the following

>> + *      disclaimer in the documentation and/or other materials provided

>> + *      with the distribution.

>> + *

>> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

>> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

>> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

>> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

>> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

>> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

>> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

>> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

>> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

>> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

>> + * POSSIBILITY OF SUCH DAMAGE.

>> + *

>> + * Effort sponsored in part by the Defense Advanced Research Projects

>> + * Agency (DARPA) and Air Force Research Laboratory, Air Force

>> + * Materiel Command, USAF, under agreement number F30602-01-2-0537.

>> + *

>> + *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $

>> + */

>> +

>> +#include <config.h>

>> +

>> +#include "conntrack-private.h"

>> +#include "ct-dpif.h"

>> +#include "dp-packet.h"

>> +#include "util.h"

>> +

>> +struct tcp_peer {

>> +    enum ct_dpif_tcp_state state;

>> +    uint32_t               seqlo;          /* Max sequence number sent     */

>> +    uint32_t               seqhi;          /* Max the other end ACKd + win */

>> +    uint16_t               max_win;        /* largest window (pre scaling) */

>> +    uint8_t                wscale;         /* window scaling factor        */

>> +};

>> +

>> +struct conn_tcp {

>> +    struct conn up;

>> +    struct tcp_peer peer[2];

>> +};

>> +

>> +enum {

>> +    TCPOPT_EOL,

>> +    TCPOPT_NOP,

>> +    TCPOPT_WINDOW = 3,

>> +};

>> +

>> +/* TCP sequence numbers are 32 bit integers operated

>> + * on with modular arithmetic.  These macros can be

>> + * used to compare such integers. */

>> +#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)

>> +#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)

>> +#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)

>> +#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)

>> +

>> +#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))

>> +#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))

>

>I am okay to leave those here, but since they are generic

>operations, maybe we could define somewhere else and just

>redefine here as SEQ_ operations.


I am fine both ways. Do you have a suggestion for the name
of the generic macro?

>

>

>> +

>> +static struct conn_tcp*

>> +conn_tcp_cast(const struct conn* conn)

>> +{

>> +    return CONTAINER_OF(conn, struct conn_tcp, up);

>> +}

>> +

>> +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub

>> + * is enabled.  We're not scrubbing, but this check seems reasonable.  */

>> +static bool

>> +tcp_invalid_flags(uint16_t flags)

>> +{

>> +

>> +    if (flags & TCP_SYN) {

>> +        if (flags & TCP_RST) {

>> +            return true;

>> +        }

>> +        if (flags & TCP_FIN) {

>> +            /* Here pf removes the fin flag.  We simply mark the packet as

>> +             * invalid */

>

>Maybe I missed something but it seems TH_FIN and TH_RST are

>treated equally (drop).  I'd vote to remove the comment.


Agreed, branches merged!

>

>> +            return true;

>> +        }

>> +    } else {

>> +        /* Illegal packet */

>> +        if (!(flags & (TCP_ACK|TCP_RST))) {

>> +            return true;

>> +        }

>> +    }

>> +

>> +    if (!(flags & TCP_ACK)) {

>> +        /* These flags are only valid if ACK is set */

>> +        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {

>> +            return true;

>> +        }

>> +    }

>> +

>> +    return false;

>> +}

>> +

>> +#define TCP_MAX_WSCALE 14

>> +#define CT_WSCALE_FLAG 0x80

>> +#define CT_WSCALE_UNKNOWN 0x40

>> +#define CT_WSCALE_MASK 0xf

>> +

>> +static uint8_t

>> +tcp_get_wscale(const struct tcp_header *tcp)

>> +{

>> +    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;

>> +    const uint8_t *opt = (const uint8_t *)(tcp + 1);

>> +    uint8_t wscale = 0;

>> +    uint8_t optlen;

>> +

>> +    while (len >= 3) {

>> +        if (*opt == TCPOPT_EOL) {

>> +            break;

>> +        }

>> +        switch (*opt) {

>

>Maybe we could do:

>

>           case TCPOPT_EOL:

>               break;


I think the goal was to exit the while loop. I guess we could do

             case TCPOPT_EOL:
                 return wscale;     


>> +        case TCPOPT_NOP:

>> +            opt++;

>> +            len--;

>> +            break;

>> +        case TCPOPT_WINDOW:

>> +            wscale = MIN(opt[2], TCP_MAX_WSCALE);

>> +            wscale |= CT_WSCALE_FLAG;

>> +            /* fall through */

>> +        default:

>> +            optlen = opt[1];

>> +            if (optlen < 2) {

>> +                optlen = 2;

>> +            }

>> +            len -= optlen;

>> +            opt += optlen;

>> +        }

>> +    }

>> +

>> +    return wscale;

>> +}

>> +

>> +static uint32_t

>> +tcp_payload_length(struct dp_packet *pkt)

>> +{

>> +    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)

>> +           - (char *) dp_packet_get_tcp_payload(pkt);

>> +}

>> +

>> +static void

>> +update_expiration(struct conn_tcp *conn, long long now, long long interval)

>> +{

>> +    conn->up.expiration = now + interval;

>> +}

>> +

>> +

>> +static enum ct_update_res

>> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,

>> +                long long now)

>> +{

>> +    struct conn_tcp *conn = conn_tcp_cast(conn_);

>> +    struct tcp_header *tcp = dp_packet_l4(pkt);

>> +    /* The peer that sent 'pkt' */

>> +    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];

>> +    /* The peer that should receive 'pkt' */

>> +    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];

>> +    uint8_t sws = 0, dws = 0;

>> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

>> +

>> +    uint16_t win = ntohs(tcp->tcp_winsz);

>> +    uint32_t ack, end, seq, orig_seq;

>> +    uint32_t p_len = tcp_payload_length(pkt);

>> +    int ackskew;

>> +

>> +    if (tcp_invalid_flags(tcp_flags)) {

>> +        return CT_UPDATE_INVALID;

>> +    }

>> +

>> +    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&

>> +            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&

>> +            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

>> +        src->state = dst->state = CT_DPIF_TCPS_CLOSED;

>> +        return CT_UPDATE_NEW;

>> +    }

>> +

>> +    if (src->wscale & CT_WSCALE_FLAG

>> +        && dst->wscale & CT_WSCALE_FLAG

>> +        && !(tcp_flags & TCP_SYN)) {

>> +

>> +        sws = src->wscale & CT_WSCALE_MASK;

>> +        dws = dst->wscale & CT_WSCALE_MASK;

>> +

>> +    } else if (src->wscale & CT_WSCALE_UNKNOWN

>> +        && dst->wscale & CT_WSCALE_UNKNOWN

>> +        && !(tcp_flags & TCP_SYN)) {

>> +

>> +        sws = TCP_MAX_WSCALE;

>> +        dws = TCP_MAX_WSCALE;

>> +    }

>> +

>> +    /*

>> +     * Sequence tracking algorithm from Guido van Rooij's paper:

>> +     *   http://www.madison-gurkha.com/publications/tcp_filtering/

>> +     *      tcp_filtering.ps

>> +     */

>> +

>> +    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));

>> +    if (src->state < CT_DPIF_TCPS_SYN_SENT) {

>> +        /* First packet from this end. Set its state */

>> +

>> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

>> +

>> +        end = seq + p_len;

>> +        if (tcp_flags & TCP_SYN) {

>> +            end++;

>> +            if (dst->wscale & CT_WSCALE_FLAG) {

>> +                src->wscale = tcp_get_wscale(tcp);

>> +                if (src->wscale & CT_WSCALE_FLAG) {

>> +                    /* Remove scale factor from initial window */

>> +                    sws = src->wscale & CT_WSCALE_MASK;

>> +                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);

>> +                    dws = dst->wscale & CT_WSCALE_MASK;

>> +                } else {

>> +                    /* fixup other window */

>> +                    dst->max_win <<= dst->wscale &

>> +                        CT_WSCALE_MASK;

>> +                    /* in case of a retrans SYN|ACK */

>> +                    dst->wscale = 0;

>> +                }

>> +            }

>> +        }

>> +        if (tcp_flags & TCP_FIN) {

>> +            end++;

>> +        }

>> +

>> +        src->seqlo = seq;

>> +        src->state = CT_DPIF_TCPS_SYN_SENT;

>> +        /*

>> +         * May need to slide the window (seqhi may have been set by

>> +         * the crappy stack check or if we picked up the connection

>> +         * after establishment)

>> +         */

>> +        if (src->seqhi == 1 ||

>> +                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {

>> +            src->seqhi = end + MAX(1, dst->max_win << dws);

>> +        }

>> +        if (win > src->max_win) {

>> +            src->max_win = win;

>> +        }

>> +

>> +    } else {

>> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

>> +        end = seq + p_len;

>> +        if (tcp_flags & TCP_SYN) {

>> +            end++;

>> +        }

>> +        if (tcp_flags & TCP_FIN) {

>> +            end++;

>> +        }

>> +    }

>> +

>> +    if ((tcp_flags & TCP_ACK) == 0) {

>> +        /* Let it pass through the ack skew check */

>> +        ack = dst->seqlo;

>> +    } else if ((ack == 0

>> +                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))

>> +               /* broken tcp stacks do not set ack */) {

>> +        /* Many stacks (ours included) will set the ACK number in an

>> +         * FIN|ACK if the SYN times out -- no sequence to ACK. */

>> +        ack = dst->seqlo;

>> +    }

>> +

>> +    if (seq == end) {

>> +        /* Ease sequencing restrictions on no data packets */

>> +        seq = src->seqlo;

>> +        end = seq;

>> +    }

>> +

>> +    ackskew = dst->seqlo - ack;

>> +#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */

>> +    if (SEQ_GEQ(src->seqhi, end)

>> +        /* Last octet inside other's window space */

>> +        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))

>> +        /* Retrans: not more than one window back */

>> +        && (ackskew >= -MAXACKWINDOW)

>> +        /* Acking not more than one reassembled fragment backwards */

>> +        && (ackskew <= (MAXACKWINDOW << sws))

>> +        /* Acking not more than one window forward */

>> +        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo

>> +            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {

>> +        /* Require an exact/+1 sequence match on resets when possible */

>> +

>> +        /* update max window */

>> +        if (src->max_win < win) {

>> +            src->max_win = win;

>> +        }

>> +        /* synchronize sequencing */

>> +        if (SEQ_GT(end, src->seqlo)) {

>> +            src->seqlo = end;

>> +        }

>> +        /* slide the window of what the other end can send */

>> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

>> +            dst->seqhi = ack + MAX((win << sws), 1);

>> +        }

>> +

>> +        /* update states */

>> +        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {

>> +                src->state = CT_DPIF_TCPS_SYN_SENT;

>> +        }

>> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {

>> +                src->state = CT_DPIF_TCPS_CLOSING;

>> +        }

>> +        if (tcp_flags & TCP_ACK) {

>> +            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {

>> +                dst->state = CT_DPIF_TCPS_ESTABLISHED;

>> +            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {

>> +                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;

>> +            }

>> +        }

>> +        if (tcp_flags & TCP_RST) {

>> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

>> +        }

>> +

>> +        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2

>> +            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

>> +            update_expiration(conn, now, 30 * 1000);

>> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

>> +                   && dst->state >= CT_DPIF_TCPS_CLOSING) {

>> +            update_expiration(conn, now, 45 * 1000);

>> +        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED

>> +                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {

>> +            update_expiration(conn, now, 30 * 1000);

>> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

>> +                   || dst->state >= CT_DPIF_TCPS_CLOSING) {

>> +            update_expiration(conn, now, 15 * 60 * 1000);

>> +        } else {

>> +            update_expiration(conn, now, 24 * 60 * 60 * 1000);

>> +        }

>

>It would be nice to have defines for those timeout values.


You're right, I've added some defines for these.

>

>

>> +    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT

>> +                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2

>> +                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)

>> +               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)

>> +               /* Within a window forward of the originating packet */

>> +               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {

>> +               /* Within a window backward of the originating packet */

>> +

>> +        /*

>> +         * This currently handles three situations:

>> +         *  1) Stupid stacks will shotgun SYNs before their peer

>> +         *     replies.

>> +         *  2) When PF catches an already established stream (the

>> +         *     firewall rebooted, the state table was flushed, routes

>> +         *     changed...)

>> +         *  3) Packets get funky immediately after the connection

>> +         *     closes (this should catch Solaris spurious ACK|FINs

>> +         *     that web servers like to spew after a close)

>> +         *

>> +         * This must be a little more careful than the above code

>> +         * since packet floods will also be caught here. We don't

>> +         * update the TTL here to mitigate the damage of a packet

>> +         * flood and so the same code can handle awkward establishment

>> +         * and a loosened connection close.

>> +         * In the establishment case, a correct peer response will

>> +         * validate the connection, go through the normal state code

>> +         * and keep updating the state TTL.

>> +         */

>> +

>> +        /* update max window */

>> +        if (src->max_win < win) {

>> +            src->max_win = win;

>> +        }

>> +        /* synchronize sequencing */

>> +        if (SEQ_GT(end, src->seqlo)) {

>> +            src->seqlo = end;

>> +        }

>> +        /* slide the window of what the other end can send */

>> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

>> +            dst->seqhi = ack + MAX((win << sws), 1);

>> +        }

>> +

>> +        /*

>> +         * Cannot set dst->seqhi here since this could be a shotgunned

>> +         * SYN and not an already established connection.

>> +         */

>> +

>> +        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {

>> +            src->state = CT_DPIF_TCPS_CLOSING;

>> +        }

>> +

>> +        if (tcp_flags & TCP_RST) {

>> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

>> +        }

>> +    } else {

>> +        return CT_UPDATE_INVALID;

>> +    }

>> +

>> +    return CT_UPDATE_VALID;

>> +}

>> +

>> +static bool

>> +tcp_valid_new(struct dp_packet *pkt)

>> +{

>> +    struct tcp_header *tcp = dp_packet_l4(pkt);

>> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

>> +

>> +    if (tcp_invalid_flags(tcp_flags)) {

>> +        return false;

>> +    }

>> +

>> +    /* A syn+ack is not allowed to create a connection.  We want to allow

>> +     * totally new connections (syn) or already established, not partially

>> +     * open (syn+ack). */

>> +    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {

>> +        return false;

>> +    }

>> +

>> +    return true;

>> +}

>> +

>> +static struct conn *

>> +tcp_new_conn(struct dp_packet *pkt, long long now)

>> +{

>> +    struct conn_tcp* newconn = NULL;

>> +    struct tcp_header *tcp = dp_packet_l4(pkt);

>> +    struct tcp_peer *src, *dst;

>> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

>> +

>> +    newconn = xzalloc(sizeof(struct conn_tcp));

>> +

>> +    src = &newconn->peer[0];

>> +    dst = &newconn->peer[1];

>> +

>> +    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));

>> +    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;

>> +

>> +    if (tcp_flags & TCP_SYN) {

>> +        src->seqhi++;

>> +        src->wscale = tcp_get_wscale(tcp);

>> +    } else {

>> +        src->wscale = CT_WSCALE_UNKNOWN;

>> +        dst->wscale = CT_WSCALE_UNKNOWN;

>> +    }

>> +    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);

>> +    if (src->wscale & CT_WSCALE_MASK) {

>> +        /* Remove scale factor from initial window */

>> +        uint8_t sws = src->wscale & CT_WSCALE_MASK;

>> +        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);

>> +    }

>> +    if (tcp_flags & TCP_FIN) {

>> +        src->seqhi++;

>> +    }

>> +    dst->seqhi = 1;

>> +    dst->max_win = 1;

>> +    src->state = CT_DPIF_TCPS_SYN_SENT;

>> +    dst->state = CT_DPIF_TCPS_CLOSED;

>> +

>> +    update_expiration(newconn, now, 30 * 1000);

>> +

>> +    return &newconn->up;

>> +}

>> +

>> +struct ct_l4_proto ct_proto_tcp = {

>> +    .new_conn = tcp_new_conn,

>> +    .valid_new = tcp_valid_new,

>> +    .conn_update = tcp_conn_update,

>> +};

>> diff --git a/lib/conntrack.c b/lib/conntrack.c

>> new file mode 100644

>> index 0000000..840335b

>> --- /dev/null

>> +++ b/lib/conntrack.c

>> @@ -0,0 +1,851 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#include <config.h>

>> +#include "conntrack.h"

>> +

>> +#include <errno.h>

>> +#include <sys/types.h>

>> +#include <netinet/in.h>

>> +#include <netinet/icmp6.h>

>> +

>> +#include "bitmap.h"

>> +#include "conntrack-private.h"

>> +#include "dp-packet.h"

>> +#include "flow.h"

>> +#include "hmap.h"

>> +#include "netdev.h"

>> +#include "odp-netlink.h"

>> +#include "openvswitch/vlog.h"

>> +#include "ovs-rcu.h"

>> +#include "random.h"

>> +#include "timeval.h"

>> +

>> +VLOG_DEFINE_THIS_MODULE(conntrack);

>> +

>> +struct conn_lookup_ctx {

>> +    struct conn_key key;

>> +    struct conn *conn;

>> +    uint32_t hash;

>> +    bool reply;

>> +    bool related;

>> +};

>> +

>> +static bool conn_key_extract(struct conntrack *, struct dp_packet *,

>> +                             struct conn_lookup_ctx *, uint16_t zone);

>> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);

>> +static void conn_key_reverse(struct conn_key *);

>> +static void conn_key_lookup(struct conntrack *ct,

>> +                            struct conn_lookup_ctx *ctx,

>> +                            unsigned bucket,

>> +                            long long now);

>> +static bool valid_new(struct dp_packet *pkt, struct conn_key *);

>> +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,

>> +                             long long now);

>> +static void delete_conn(struct conn *);

>> +static enum ct_update_res conn_update(struct conn *, struct dp_packet*,

>> +                                      bool reply, long long now);

>> +static bool conn_expired(struct conn *, long long now);

>> +static void set_mark(struct dp_packet *, struct conn *,

>> +                     uint32_t val, uint32_t mask);

>> +static void set_label(struct dp_packet *, struct conn *,

>> +                      const struct ovs_key_ct_labels *val,

>> +                      const struct ovs_key_ct_labels *mask);

>> +

>> +static struct ct_l4_proto *l4_protos[] = {

>> +    [IPPROTO_TCP] = &ct_proto_tcp,

>> +    [IPPROTO_UDP] = &ct_proto_other,

>> +    [IPPROTO_ICMP] = &ct_proto_other,

>> +};

>> +

>> +/* Initializes the connection tracker 'ct'.  The caller is responsible for

>> + * calling 'conntrack_destroy()', when the instance is not needed anymore */

>> +void

>> +conntrack_init(struct conntrack *ct)

>> +{

>> +    unsigned i;

>> +

>> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

>> +        ct_lock_init(&ct->locks[i]);

>> +        ct_lock_lock(&ct->locks[i]);

>> +        hmap_init(&ct->connections[i]);

>> +        ct_lock_unlock(&ct->locks[i]);

>> +    }

>> +    ct->hash_basis = random_uint32();

>> +    ct->purge_bucket = 0;

>> +    ct->purge_inner_bucket = 0;

>> +    ct->purge_inner_offset = 0;

>> +}

>> +

>> +/* Destroys the connection tracker 'ct' and frees all the allocated memory. */

>> +void

>> +conntrack_destroy(struct conntrack *ct)

>> +{

>> +    unsigned i;

>> +

>> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

>> +        struct conn *conn, *next;

>> +

>> +        ct_lock_lock(&ct->locks[i]);

>> +        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) {

>> +            hmap_remove(&ct->connections[i], &conn->node);

>> +            delete_conn(conn);

>> +        }

>> +        hmap_destroy(&ct->connections[i]);

>> +        ct_lock_unlock(&ct->locks[i]);

>> +        ct_lock_destroy(&ct->locks[i]);

>> +    }

>> +}

>> +?

>> +static unsigned hash_to_bucket(uint32_t hash)

>> +{

>> +    /* Extracts the most significant bits in hash. The least significant bits

>> +     * are already used internally by the hmap implementation. */

>> +    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

>> +

>> +    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;

>> +}

>> +

>> +static void

>> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,

>> +            uint32_t mark, ovs_u128 label)

>> +{

>> +    pkt->md.ct_state = state | CS_TRACKED;

>> +    pkt->md.ct_zone = zone;

>> +    pkt->md.ct_mark = mark;

>> +    pkt->md.ct_label = label;

>> +}

>> +

>> +static struct conn *

>> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

>> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,

>> +               long long now)

>> +{

>> +    unsigned bucket = hash_to_bucket(ctx->hash);

>> +    struct conn *nc = NULL;

>> +

>> +    if (!valid_new(pkt, &ctx->key)) {

>> +        *state |= CS_INVALID;

>> +        return nc;

>> +    }

>> +

>> +    *state |= CS_NEW;

>> +

>> +    if (commit) {

>> +        nc = new_conn(pkt, &ctx->key, now);

>> +

>> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

>> +

>> +        conn_key_reverse(&nc->rev_key);

>> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);

>> +    }

>> +

>> +    return nc;

>> +}

>> +

>> +static struct conn *

>> +process_one(struct conntrack *ct, struct dp_packet *pkt,

>> +            struct conn_lookup_ctx *ctx, uint16_t zone,

>> +            bool commit, long long now)

>> +{

>> +    unsigned bucket = hash_to_bucket(ctx->hash);

>> +    struct conn *conn = ctx->conn;

>> +    uint8_t state = 0;

>> +

>> +    if (conn) {

>> +        if (ctx->related) {

>> +            state |= CS_RELATED;

>> +            if (ctx->reply) {

>> +                state |= CS_REPLY_DIR;

>> +            }

>> +        } else {

>> +            enum ct_update_res res;

>> +

>> +            res = conn_update(conn, pkt, ctx->reply, now);

>> +

>> +            switch (res) {

>> +            case CT_UPDATE_VALID:

>> +                state |= CS_ESTABLISHED;

>> +                if (ctx->reply) {

>> +                    state |= CS_REPLY_DIR;

>> +                }

>> +                break;

>> +            case CT_UPDATE_INVALID:

>> +                state |= CS_INVALID;

>> +                break;

>> +            case CT_UPDATE_NEW:

>> +                hmap_remove(&ct->connections[bucket], &conn->node);

>> +                delete_conn(conn);

>> +                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

>> +                break;

>> +            }

>> +        }

>> +

>> +        pkt->md.ct_label = conn->label;

>> +        pkt->md.ct_mark = conn->mark;

>> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);

>> +    } else {

>> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

>> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});

>> +    }

>> +

>> +    return conn;

>> +}

>> +

>> +/* Sends a group of 'cnt' packets ('pkts') through the connection tracker

>> + * 'ct'.  If 'commit' is true, the packets are allowed to create new entries

>> + * in the connection tables.  'setmark', if not NULL, should point to a two

>> + * elements array containing a value and a mask to set the connection mark.

>> + * 'setlabel' behaves similarly for the connection label.*/

>> +int

>> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt,

>> +                  bool commit, uint16_t zone, const uint32_t *setmark,

>> +                  const struct ovs_key_ct_labels *setlabel,

>> +                  const char *helper)

>> +{

>> +#if !defined(__CHECKER__) && !defined(_WIN32)

>> +    const size_t KEY_ARRAY_SIZE = cnt;

>> +#else

>> +    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };

>> +#endif

>> +    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];

>> +    int8_t bucket_list[CONNTRACK_BUCKETS];

>> +    struct {

>> +        unsigned bucket;

>> +        unsigned long maps;

>> +    } arr[KEY_ARRAY_SIZE];

>> +    long long now = time_msec();

>> +    size_t i = 0;

>> +    uint8_t arrcnt = 0;

>> +

>> +    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);

>> +

>> +    if (helper) {

>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

>> +

>> +        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);

>> +        /* Continue without the helper */

>> +    }

>> +

>> +    memset(bucket_list, INT8_C(-1), sizeof bucket_list);

>> +    for (i = 0; i < cnt; i++) {

>> +        unsigned bucket;

>> +

>> +        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {

>> +            write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}});

>> +            continue;

>> +        }

>> +

>> +        bucket = hash_to_bucket(ctxs[i].hash);

>> +        if (bucket_list[bucket] == INT8_C(-1)) {

>> +            bucket_list[bucket] = arrcnt;

>> +

>> +            arr[arrcnt].maps = 0;

>> +            ULLONG_SET1(arr[arrcnt].maps, i);

>> +            arr[arrcnt++].bucket = bucket;

>> +        } else {

>> +            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);

>> +            arr[bucket_list[bucket]].maps |= 1UL << i;

>> +        }

>> +    }

>> +

>> +    for (i = 0; i < arrcnt; i++) {

>> +        size_t j;

>> +

>> +        ct_lock_lock(&ct->locks[arr[i].bucket]);

>> +

>> +        ULLONG_FOR_EACH_1(j, arr[i].maps) {

>> +            struct conn *conn;

>> +

>> +            conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now);

>> +

>> +            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);

>> +

>> +            if (conn && setmark) {

>> +                set_mark(pkts[j], conn, setmark[0], setmark[1]);

>> +            }

>> +

>> +            if (conn && setlabel) {

>> +                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);

>> +            }

>> +        }

>> +        ct_lock_unlock(&ct->locks[arr[i].bucket]);

>> +    }

>> +

>> +    return 0;

>> +}

>> +

>> +static void

>> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)

>> +{

>> +    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));

>> +    conn->mark = pkt->md.ct_mark;

>> +}

>> +

>> +static void

>> +set_label(struct dp_packet *pkt, struct conn *conn,

>> +          const struct ovs_key_ct_labels *val,

>> +          const struct ovs_key_ct_labels *mask)

>> +{

>> +    ovs_u128 v, m;

>> +

>> +    memcpy(&v, val, sizeof v);

>> +    memcpy(&m, mask, sizeof m);

>> +

>> +    pkt->md.ct_label.u64.lo = v.u64.lo

>> +                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));

>> +    pkt->md.ct_label.u64.hi = v.u64.hi

>> +                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));

>> +    conn->label = pkt->md.ct_label;

>> +}

>> +?

>> +#define CONNTRACK_PURGE_NUM 256

>> +

>> +static void

>> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,

>> +             uint32_t *inner_offset, unsigned *left, long long now)

>> +{

>> +    while (*left != 0) {

>> +        struct hmap_node *node;

>> +        struct conn *conn;

>> +

>> +        node = hmap_at_position(bucket, inner_bucket, inner_offset);

>> +

>> +        if (!node) {

>> +            hmap_shrink(bucket);

>> +            break;

>> +        }

>> +

>> +        INIT_CONTAINER(conn, node, node);

>> +        if (conn_expired(conn, now)) {

>> +            hmap_remove(bucket, &conn->node);

>> +            delete_conn(conn);

>> +            (*left)--;

>> +        }

>> +    }

>> +}

>> +

>> +/* Cleans up old connection entries.  Should be called periodically. */

>> +void

>> +conntrack_run(struct conntrack *ct)

>> +{

>> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);

>> +    uint32_t inner_bucket = ct->purge_inner_bucket,

>> +             inner_offset = ct->purge_inner_offset;

>> +    unsigned left = CONNTRACK_PURGE_NUM;

>> +    long long now = time_msec();

>> +

>> +    while (bucket < CONNTRACK_BUCKETS) {

>> +        ct_lock_lock(&ct->locks[bucket]);

>> +        sweep_bucket(&ct->connections[bucket],

>> +                     &inner_bucket, &inner_offset,

>> +                     &left, now);

>> +        ct_lock_unlock(&ct->locks[bucket]);

>> +

>> +        if (left == 0) {

>> +            break;

>> +        } else {

>> +            bucket++;

>> +        }

>> +    }

>> +

>> +    ct->purge_bucket = bucket;

>> +    ct->purge_inner_bucket = inner_bucket;

>> +    ct->purge_inner_offset = inner_offset;

>> +}

>> +?

>> +/* Key extraction */

>> +

>> +/* The function stores a pointer to the first byte after the header in

>> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is

>> + * not interested in the header's tail,  meaning that the header has

>> + * already been parsed (e.g. by flow_extract): we take this as a hint to

>> + * save a few checks. */

>> +static inline bool

>> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,

>> +                const char **new_data)

>> +{

>> +    const struct ip_header *ip = data;

>> +

>> +    if (new_data) {

>> +        size_t ip_len;

>> +

>> +        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {

>> +            return false;

>> +        }

>> +        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

>> +

>> +        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {

>> +            return false;

>> +        }

>> +        if (OVS_UNLIKELY(size < ip_len)) {

>> +            return false;

>> +        }

>> +

>> +        *new_data = (char *) data + ip_len;

>> +    }

>> +

>> +    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {

>> +        return false;

>> +    }

>> +

>> +    key->src.addr.ipv4 = ip->ip_src;

>> +    key->dst.addr.ipv4 = ip->ip_dst;

>> +    key->nw_proto = ip->ip_proto;

>> +

>> +    return true;

>> +}

>> +

>> +/* The function stores a pointer to the first byte after the header in

>> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is

>> + * not interested in the header's tail,  meaning that the header has

>> + * already been parsed (e.g. by flow_extract): we take this as a hint to

>> + * save a few checks. */

>> +static inline bool

>> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,

>> +                const char **new_data)

>> +{

>> +    const struct ovs_16aligned_ip6_hdr *ip6 = data;

>> +    uint8_t nw_proto = ip6->ip6_nxt;

>> +    uint8_t nw_frag = 0;

>> +

>> +    if (new_data) {

>> +        if (OVS_UNLIKELY(size < sizeof *ip6)) {

>> +            return false;

>> +        }

>> +    }

>> +

>> +    data = ip6 + 1;

>> +    size -=  sizeof *ip6;

>> +

>> +    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {

>> +        return false;

>> +    }

>> +

>> +    if (new_data) {

>> +        *new_data = data;

>> +    }

>> +

>> +    if (nw_frag) {

>> +        return false;

>> +    }

>> +

>> +    key->src.addr.ipv6 = ip6->ip6_src;

>> +    key->dst.addr.ipv6 = ip6->ip6_dst;

>> +    key->nw_proto = nw_proto;

>> +

>> +    return true;

>> +}

>> +

>> +static inline bool

>> +check_l4_tcp(const void *data, size_t size)

>> +{

>> +    const struct tcp_header *tcp = data;

>> +    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

>> +

>> +    if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) {

>> +        return true;

>> +    }

>> +

>> +    return false;

>> +}

>> +

>> +static inline bool

>> +extract_l4_tcp(struct conn_key *key, const void *data, size_t size)

>> +{

>> +    const struct tcp_header *tcp = data;

>> +

>> +    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {

>> +        return false;

>> +    }

>> +

>> +    key->src.port = tcp->tcp_src;

>> +    key->dst.port = tcp->tcp_dst;

>> +

>> +    return true;

>> +}

>> +

>> +static inline bool

>> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)

>> +{

>> +    const struct udp_header *udp = data;

>> +

>> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {

>> +        return false;

>> +    }

>> +

>> +    key->src.port = udp->udp_src;

>> +    key->dst.port = udp->udp_dst;

>> +

>> +    return true;

>> +}

>> +

>> +static inline bool extract_l4(struct conn_key *key, const void *data,

>> +                              size_t size, bool *related);

>> +

>> +/* If 'related' is not NULL and the function is processing an ICMP

>> + * error packet, extract the l3 and l4 fields from the nested header

>> + * instead and set *related to true.  If 'related' is NULL we're

>> + * already processing a nested header and no such recursion is

>> + * possible */

>> +static inline int

>> +extract_l4_icmp(struct conn_key *key, const void *data, size_t size,

>> +                bool *related)

>> +{

>> +    const struct icmp_header *icmp = data;

>> +

>> +    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {

>> +        return false;

>> +    }

>> +

>> +    switch (icmp->icmp_type) {

>> +    case ICMP4_ECHO_REQUEST:

>> +    case ICMP4_ECHO_REPLY:

>> +    case ICMP4_TIMESTAMP:

>> +    case ICMP4_TIMESTAMPREPLY:

>> +    case ICMP4_INFOREQUEST:

>> +    case ICMP4_INFOREPLY:

>> +        /* Separate ICMP connection: identified using id */

>> +        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;

>> +        break;

>> +    case ICMP4_DST_UNREACH:

>> +    case ICMP4_TIME_EXCEEDED:

>> +    case ICMP4_PARAM_PROB:

>> +    case ICMP4_SOURCEQUENCH:

>> +    case ICMP4_REDIRECT: {

>> +        /* ICMP packet part of another connection. We should

>> +         * extract the key from embedded packet header */

>> +        struct conn_key inner_key;

>> +        const char *l3 = (const char *) (icmp + 1);

>> +        const char *tail = (const char *) data + size;

>> +        const char *l4;

>> +        bool ok;

>> +

>> +        if (!related) {

>> +            return false;

>> +        }

>> +        *related = true;

>> +

>> +        memset(&inner_key, 0, sizeof inner_key);

>> +        inner_key.dl_type = htons(ETH_TYPE_IP);

>> +        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);

>> +        if (!ok) {

>> +            return false;

>> +        }

>> +

>> +        /* pf doesn't do this, but it seems a good idea */

>> +        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned

>> +            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {

>> +            return false;

>> +        }

>> +

>> +        key->src = inner_key.src;

>> +        key->dst = inner_key.dst;

>> +        key->nw_proto = inner_key.nw_proto;

>> +

>> +        ok = extract_l4(key, l4, tail - l4, NULL);

>> +        if (ok) {

>> +            conn_key_reverse(key);

>> +        }

>> +        return ok;

>> +    }

>> +    default:

>> +        return false;

>> +    }

>> +

>> +    return true;

>> +}

>> +

>> +/* If 'related' is not NULL and the function is processing an ICMP

>> + * error packet, extract the l3 and l4 fields from the nested header

>> + * instead and set *related to true.  If 'related' is NULL we're

>> + * already processing a nested header and no such recursion is

>> + * possible */

>> +static inline bool

>> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

>> +                 bool *related)

>> +{

>> +    const struct icmp6_header *icmp6 = data;

>> +

>> +    /* All the messages that we support need at least 4 bytes after

>> +     * the header */

>> +    if (size < sizeof *icmp6 + 4) {

>> +        return false;

>> +    }

>> +

>> +    switch (icmp6->icmp6_type) {

>> +    case ICMP6_ECHO_REQUEST:

>> +    case ICMP6_ECHO_REPLY:

>> +        /* Separate ICMP connection: identified using id */

>> +        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);

>> +        break;

>> +    case ICMP6_DST_UNREACH:

>> +    case ICMP6_PACKET_TOO_BIG:

>> +    case ICMP6_TIME_EXCEEDED:

>> +    case ICMP6_PARAM_PROB:{

>> +        /* ICMP packet part of another connection. We should

>> +         * extract the key from embedded packet header */

>> +        struct conn_key inner_key;

>> +        const char *l3 = (const char *) icmp6 + 8;

>> +        const char *tail = (const char *) data + size;

>> +        const char *l4 = NULL;

>> +        bool ok;

>> +

>> +        if (!related) {

>> +            return false;

>> +        }

>> +        *related = true;

>> +

>> +        memset(&inner_key, 0, sizeof inner_key);

>> +        inner_key.dl_type = htons(ETH_TYPE_IPV6);

>> +        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);

>> +        if (!ok) {

>> +            return false;

>> +        }

>> +

>> +        /* pf doesn't do this, but it seems a good idea */

>> +        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,

>> +                              &key->dst.addr.ipv6_aligned)

>> +            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,

>> +                                 &key->src.addr.ipv6_aligned)) {

>> +            return false;

>> +        }

>> +

>> +        key->src = inner_key.src;

>> +        key->dst = inner_key.dst;

>> +        key->nw_proto = inner_key.nw_proto;

>> +

>> +        ok = extract_l4(key, l4, tail - l4, NULL);

>> +        if (ok) {

>> +            conn_key_reverse(key);

>> +        }

>> +        return ok;

>> +    }

>> +    default:

>> +        return false;

>> +    }

>> +

>> +    return true;

>> +}

>> +

>> +/* Extract l4 fields into 'key', which must already contain valid l3

>> + * members.  If 'related' is not NULL and an ICMP error packet is being

>> + * processed, the function will extract the key from the packet nested

>> + * in the ICMP payload and set '*related' to true.  If 'related' is NULL,

>> + * nested parsing isn't allowed.  This is necessary to limit the

>> + * recursion level. */

>> +static inline bool

>> +extract_l4(struct conn_key *key, const void *data, size_t size, bool *related)

>> +{

>> +    if (key->nw_proto == IPPROTO_TCP) {

>> +        return extract_l4_tcp(key, data, size)

>> +               && (!related || check_l4_tcp(data, size));

>> +    } else if (key->nw_proto == IPPROTO_UDP) {

>> +        return extract_l4_udp(key, data, size);

>> +    } else if (key->dl_type == htons(ETH_TYPE_IP)

>> +               && key->nw_proto == IPPROTO_ICMP) {

>> +        return extract_l4_icmp(key, data, size, related);

>> +    } else if (key->dl_type == htons(ETH_TYPE_IPV6)

>> +               && key->nw_proto == IPPROTO_ICMPV6) {

>> +        return extract_l4_icmp6(key, data, size, related);

>> +    } else {

>> +        return false;

>> +    }

>> +}

>> +

>> +static bool

>> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,

>> +                 struct conn_lookup_ctx *ctx, uint16_t zone)

>> +{

>> +    const struct eth_header *l2 = dp_packet_l2(pkt);

>> +    const struct ip_header *l3 = dp_packet_l3(pkt);

>> +    const char *l4 = dp_packet_l4(pkt);

>> +    const char *tail = dp_packet_tail(pkt);

>> +    bool ok;

>> +

>> +    memset(ctx, 0, sizeof *ctx);

>> +

>> +    if (!l2 || !l3 || !l4) {

>> +        return false;

>> +    }

>> +

>> +    ctx->key.zone = zone;

>> +

>> +    /* XXX In this function we parse the packet (again, it has already

>> +     * gone through miniflow_extract()) for two reasons:

>> +     *

>> +     * 1) To extract the l3 addresses and l4 ports.

>> +     *    We already have the l3 and l4 headers' pointers.  Extracting

>> +     *    the l3 addresses and the l4 ports is really cheap, since they

>> +     *    can be found at fixed locations.

>> +     * 2) To extract the l3 and l4 types.

>> +     *    Extracting the l3 and l4 types (especially the l3[1]) on the

>> +     *    other hand is quite expensive, because they're not at a

>> +     *    fixed location.

>> +     *

>> +     * Here's a way to avoid (2) with the help of the datapath.

>> +     * The datapath doesn't keep the packet's extracted flow[2], so

>> +     * using that is not an option.  We could use the packet's matching

>> +     * megaflow, but we have to make sure that the l3 and l4 types

>> +     * are unwildcarded.  This means either:

>> +     *

>> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow

>> +     *    is installed if the actions contains ct().  This is what the

>> +     *    kernel datapath does.  It is not so straightforward, though.

>> +     *

>> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when

>> +     *    translating a ct() action.  This is already done in different

>> +     *    actions and since both the userspace and the kernel datapath

>> +     *    would benefit from it, it seems an appropriate place to do

>> +     *    it.

>> +     *

>> +     * ---

>> +     * [1] A simple benchmark (running only the connection tracker

>> +     *     over and over on the same packets) shows that if the

>> +     *     l3 type is already provided we are 15% faster (running the

>> +     *     connection tracker over a couple of DPDK devices with a

>> +     *     stream of UDP 64-bytes packets shows that we are 4% faster).

>> +     *

>> +     * [2] The reasons for this are that keeping the flow increases

>> +     *     (slightly) the cache footprint and increases computation

>> +     *     time as we move the packet around. Most importantly, the flow

>> +     *     should be updated by the actions and this can be slow, as

>> +     *     we use a sparse representation (miniflow).

>> +     *

>> +     */

>> +    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);

>> +    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {

>> +        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL);

>> +    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {

>> +        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);

>> +    } else {

>> +        ok = false;

>> +    }

>> +

>> +    if (ok) {

>> +        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) {

>> +            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);

>> +            return true;

>> +        }

>> +    }

>> +

>> +    return false;

>> +}

>> +?

>> +/* Symmetric */

>> +static uint32_t

>> +conn_key_hash(const struct conn_key *key, uint32_t basis)

>> +{

>> +    uint32_t hsrc, hdst, hash;

>> +    int i;

>> +

>> +    hsrc = hdst = basis;

>> +

>> +    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {

>> +        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);

>> +        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);

>> +    }

>> +

>> +    hash = hsrc ^ hdst;

>> +

>> +    hash = hash_words((uint32_t *) &key->dst + 1,

>> +                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),

>> +                      hash);

>> +

>> +    return hash;

>> +}

>> +

>> +static void

>> +conn_key_reverse(struct conn_key *key)

>> +{

>> +    struct ct_endpoint tmp;

>> +    tmp = key->src;

>> +    key->src = key->dst;

>> +    key->dst = tmp;

>> +}

>> +

>> +static void

>> +conn_key_lookup(struct conntrack *ct,

>> +                struct conn_lookup_ctx *ctx,

>> +                unsigned bucket,

>> +                long long now)

>> +{

>> +    struct conn *conn, *found = NULL;

>> +    uint32_t hash = ctx->hash;

>> +    bool reply;

>> +

>> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct->connections[bucket]) {

>> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {

>> +            found = conn;

>> +            reply = false;

>> +            break;

>> +        }

>> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) {

>> +            found = conn;

>> +            reply = true;

>> +            break;

>> +        }

>> +    }

>> +

>> +    if (found) {

>> +        if (conn_expired(found, now)) {

>> +            found = NULL;

>> +        } else {

>> +            ctx->reply = reply;

>> +        }

>> +    }

>> +

>> +    ctx->conn = found;

>> +}

>> +

>> +static enum ct_update_res

>> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,

>> +            long long now)

>> +{

>> +    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);

>> +}

>> +

>> +static bool

>> +conn_expired(struct conn *conn, long long now)

>> +{

>> +    return now >= conn->expiration;

>> +}

>> +

>> +static bool

>> +valid_new(struct dp_packet *pkt, struct conn_key *key)

>> +{

>> +    return l4_protos[key->nw_proto]->valid_new(pkt);

>> +}

>> +

>> +static struct conn *

>> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)

>> +{

>> +    struct conn *newconn;

>> +

>> +    newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);

>> +

>> +    if (newconn) {

>> +        newconn->key = *key;

>> +    }

>> +

>> +    return newconn;

>> +}

>> +

>> +static void

>> +delete_conn(struct conn *conn)

>> +{

>> +    free(conn);

>> +}

>> diff --git a/lib/conntrack.h b/lib/conntrack.h

>> new file mode 100644

>> index 0000000..8561273

>> --- /dev/null

>> +++ b/lib/conntrack.h

>> @@ -0,0 +1,144 @@

>> +/*

>> + * Copyright (c) 2015, 2016 Nicira, Inc.

>> + *

>> + * Licensed under the Apache License, Version 2.0 (the "License");

>> + * you may not use this file except in compliance with the License.

>> + * You may obtain a copy of the License at:

>> + *

>> + *     http://www.apache.org/licenses/LICENSE-2.0

>> + *

>> + * Unless required by applicable law or agreed to in writing, software

>> + * distributed under the License is distributed on an "AS IS" BASIS,

>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

>> + * See the License for the specific language governing permissions and

>> + * limitations under the License.

>> + */

>> +

>> +#ifndef CONNTRACK_H

>> +#define CONNTRACK_H 1

>> +

>> +#include <stdbool.h>

>> +

>> +#include "hmap.h"

>> +#include "netdev-dpdk.h"

>> +#include "odp-netlink.h"

>> +#include "openvswitch/thread.h"

>> +#include "openvswitch/types.h"

>> +

>> +

>> +struct dp_packet;

>> +

>> +/* Userspace connection tracker

>> + * ============================

>> + *

>> + * This is a connection tracking module that keeps all the state in userspace.

>> + *

>> + * Usage

>> + * =====

>> + *

>> + *     struct conntrack ct;

>> + *

>> + * Initialization:

>> + *

>> + *     conntrack_init(&ct);

>> + *

>> + * It is necessary to periodically issue a call to

>> + *

>> + *     conntrack_run(&ct);

>> + *

>> + * to allow the module to clean up expired connections.

>> + *

>> + * To send a group of packets through the connection tracker:

>> + *

>> + *     conntrack_execute(&ct, pkts, n_pkts, ...);

>> + *

>> + * Thread-safety

>> + * =============

>> + *

>> + * conntrack_execute() can be called by multiple threads simultaneously.

>> + */

>> +

>> +struct conntrack;

>> +

>> +void conntrack_init(struct conntrack *);

>> +void conntrack_run(struct conntrack *);

>> +void conntrack_destroy(struct conntrack *);

>> +

>> +int conntrack_execute(struct conntrack *, struct dp_packet **, size_t,

>> +                      bool commit, uint16_t zone, const uint32_t *setmark,

>> +                      const struct ovs_key_ct_labels *setlabel,

>> +                      const char *helper);

>> +?

>> +/* struct ct_lock is a standard mutex or a spinlock when using DPDK */

>> +

>> +#ifdef DPDK_NETDEV

>> +struct OVS_LOCKABLE ct_lock {

>> +    rte_spinlock_t lock;

>> +};

>> +

>> +static inline void ct_lock_init(struct ct_lock *lock)

>> +{

>> +    rte_spinlock_init(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_lock(struct ct_lock *lock) 

>

>There is an extra space above at the end.


Removed, thanks

>

>> +    OVS_ACQUIRES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    rte_spinlock_lock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_unlock(struct ct_lock *lock)

>> +    OVS_RELEASES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    rte_spinlock_unlock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED)

>> +{

>> +}

>> +#else

>> +struct OVS_LOCKABLE ct_lock {

>> +    struct ovs_mutex lock;

>> +};

>> +

>> +static inline void ct_lock_init(struct ct_lock *lock)

>> +{

>> +    ovs_mutex_init(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_lock(struct ct_lock *lock)

>> +    OVS_ACQUIRES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    ovs_mutex_lock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_unlock(struct ct_lock *lock)

>> +    OVS_RELEASES(lock)

>> +    OVS_NO_THREAD_SAFETY_ANALYSIS

>> +{

>> +    ovs_mutex_unlock(&lock->lock);

>> +}

>> +

>> +static inline void ct_lock_destroy(struct ct_lock *lock)

>> +{

>> +    ovs_mutex_destroy(&lock->lock);

>> +}

>> +#endif

>> +?

>> +#define CONNTRACK_BUCKETS_SHIFT 8

>> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)

>> +

>> +struct conntrack {

>> +    /* Each lock guards a 'connections' bucket */

>> +    struct ct_lock locks[CONNTRACK_BUCKETS];

>> +    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;

>> +    uint32_t hash_basis;

>> +    unsigned purge_bucket;

>> +    uint32_t purge_inner_bucket;

>> +    uint32_t purge_inner_offset;

>> +};

>> +

>> +#endif /* conntrack.h */

>> -- 

>> 2.1.4

>> 

>> _______________________________________________

>> dev mailing list

>> dev@openvswitch.org

>> http://openvswitch.org/mailman/listinfo/dev

>

>-- 

>fbl
Fischetti, Antonio April 27, 2016, 10:03 a.m. UTC | #7
Hi Daniele, few comments inline.

> -----Original Message-----

> From: Daniele Di Proietto [mailto:diproiettod@vmware.com]

> Sent: Wednesday, April 27, 2016 7:36 AM

> To: Fischetti, Antonio <antonio.fischetti@intel.com>

> Cc: dev@openvswitch.org

> Subject: Re: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace

> connection tracker.

> 

> Hi Antonio,

> 

> Thank you for your comments, replies inline

> 

> 

> 

> 

> On 25/04/2016 09:14, "Fischetti, Antonio"

> <antonio.fischetti@intel.com> wrote:

> 

> >Hi Daniele,

> >some comments inline prefixed with [Antonio F].

> >

> >Regards,

> >Antonio

> >

> >> -----Original Message-----

> >> From: dev [mailto:dev-bounces@openvswitch.org] On Behalf Of

> Daniele Di

> >> Proietto

> >> Sent: Saturday, April 16, 2016 1:03 AM

> >> To: dev@openvswitch.org

> >> Subject: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace

> connection

> >> tracker.

> >>

> >> This commit adds the conntrack module.

> >>

> >> It is a connection tracker that resides entirely in userspace.

> Its

> >> primary user will be the dpif-netdev datapath.

> >>

> >> The module main goal is to provide conntrack_execute(), which

> offers a

> >> convenient interface to implement the datapath ct() action.

> >>

> >> The conntrack module uses two submodules to deal with the l4

> protocol

> >> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

> >>

> >> The conntrack-tcp submodule implementation is adapted from

> FreeBSD's pf

> >> subsystem, therefore it's BSD licensed.  It has been slightly

> altered to

> >> match the OVS coding style and to allow the pickup of already

> >> established connections.

> >>

> >> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

> >> ---

> >>  COPYING                 |   1 +

> >>  debian/copyright.in     |   4 +

> >>  lib/automake.mk         |   5 +

> >>  lib/conntrack-other.c   |  91 ++++++

> >>  lib/conntrack-private.h |  77 +++++

> >>  lib/conntrack-tcp.c     | 476 +++++++++++++++++++++++++++

> >>  lib/conntrack.c         | 851

> >> ++++++++++++++++++++++++++++++++++++++++++++++++

> >>  lib/conntrack.h         | 144 ++++++++

> >>  8 files changed, 1649 insertions(+)

> >>  create mode 100644 lib/conntrack-other.c

> >>  create mode 100644 lib/conntrack-private.h

> >>  create mode 100644 lib/conntrack-tcp.c

> >>  create mode 100644 lib/conntrack.c

> >>  create mode 100644 lib/conntrack.h

> >>

> >> diff --git a/COPYING b/COPYING

> >> index 308e3ea..afb98b9 100644

> >> --- a/COPYING

> >> +++ b/COPYING

> >> @@ -25,6 +25,7 @@ License, version 2.

> >>  The following files are licensed under the 2-clause BSD license.

> >>      include/windows/getopt.h

> >>      lib/getopt_long.c

> >> +    lib/conntrack-tcp.c

> >>

> >>  The following files are licensed under the 3-clause BSD-license

> >>      include/windows/netinet/icmp6.h

> >> diff --git a/debian/copyright.in b/debian/copyright.in

> >> index 57d007a..a15f4dd 100644

> >> --- a/debian/copyright.in

> >> +++ b/debian/copyright.in

> >> @@ -21,6 +21,9 @@ Upstream Copyright Holders:

> >>  	Copyright (c) 2014 Michael Chapman

> >>  	Copyright (c) 2014 WindRiver, Inc.

> >>  	Copyright (c) 2014 Avaya, Inc.

> >> +	Copyright (c) 2001 Daniel Hartmeier

> >> +	Copyright (c) 2002 - 2008 Henning Brauer

> >> +	Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

> >>

> >>  License:

> >>

> >> @@ -90,6 +93,7 @@ License:

> >>  	lib/getopt_long.c

> >>  	include/windows/getopt.h

> >>  	datapath-windows/ovsext/Conntrack-tcp.c

> >> +	lib/conntrack-tcp.c

> >>

> >>  * The following files are licensed under the 3-clause BSD-license

> >>

> >> diff --git a/lib/automake.mk b/lib/automake.mk

> >> index 1ec2115..ba30442 100644

> >> --- a/lib/automake.mk

> >> +++ b/lib/automake.mk

> >> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \

> >>  	lib/compiler.h \

> >>  	lib/connectivity.c \

> >>  	lib/connectivity.h \

> >> +	lib/conntrack-private.h \

> >> +	lib/conntrack-tcp.c \

> >> +	lib/conntrack-other.c \

> >> +	lib/conntrack.c \

> >> +	lib/conntrack.h \

> >>  	lib/coverage.c \

> >>  	lib/coverage.h \

> >>  	lib/crc32c.c \

> >> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c

> >> new file mode 100644

> >> index 0000000..65d02a9

> >> --- /dev/null

> >> +++ b/lib/conntrack-other.c

> >> @@ -0,0 +1,91 @@

> >> +/*

> >> + * Copyright (c) 2015, 2016 Nicira, Inc.

> >> + *

> >> + * Licensed under the Apache License, Version 2.0 (the

> "License");

> >> + * you may not use this file except in compliance with the

> License.

> >> + * You may obtain a copy of the License at:

> >> + *

> >> + *     http://www.apache.org/licenses/LICENSE-2.0

> >> + *

> >> + * Unless required by applicable law or agreed to in writing,

> software

> >> + * distributed under the License is distributed on an "AS IS"

> BASIS,

> >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express

> or

> >> implied.

> >> + * See the License for the specific language governing

> permissions and

> >> + * limitations under the License.

> >> + */

> >> +

> >> +#include <config.h>

> >> +

> >> +#include "conntrack-private.h"

> >> +#include "dp-packet.h"

> >> +

> >> +enum other_state {

> >> +    OTHERS_FIRST,

> >> +    OTHERS_MULTIPLE,

> >> +    OTHERS_BIDIR,

> >> +};

> >> +

> >> +struct conn_other {

> >> +    struct conn up;

> >> +    enum other_state state;

> >> +};

> >> +

> >> +static const long long other_timeouts[] = {

> >> +    [OTHERS_FIRST] = 60 * 1000,

> >> +    [OTHERS_MULTIPLE] = 60 * 1000,

> >> +    [OTHERS_BIDIR] = 30 * 1000,

> >> +};

> >> +

> >> +static struct conn_other *

> >> +conn_other_cast(const struct conn *conn)

> >> +{

> >> +    return CONTAINER_OF(conn, struct conn_other, up);

> >> +}

> >> +

> >> +static void

> >> +update_expiration(struct conn_other *conn, long long now)

> >> +{

> >> +    conn->up.expiration = now + other_timeouts[conn->state];

> >> +}

> >> +

> >> +static enum ct_update_res

> >> +other_conn_update(struct conn *conn_, struct dp_packet *pkt

> >> OVS_UNUSED,

> >> +                bool reply, long long now)

> >> +{

> >> +    struct conn_other *conn = conn_other_cast(conn_);

> >> +

> >> +    if (reply && conn->state != OTHERS_BIDIR) {

> >> +        conn->state = OTHERS_BIDIR;

> >> +    } else if (conn->state == OTHERS_FIRST) {

> >> +        conn->state = OTHERS_MULTIPLE;

> >> +    }

> >> +

> >> +    update_expiration(conn, now);

> >> +

> >> +    return CT_UPDATE_VALID;

> >> +}

> >> +

> >> +static bool

> >> +other_valid_new(struct dp_packet *pkt OVS_UNUSED)

> >> +{

> >> +    return true;

> >> +}

> >> +

> >> +static struct conn *

> >> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)

> >> +{

> >> +    struct conn_other *conn;

> >> +

> >> +    conn = xzalloc(sizeof(struct conn_other));

> >> +    conn->state = OTHERS_FIRST;

> >> +

> >> +    update_expiration(conn, now);

> >> +

> >> +    return &conn->up;

> >> +}

> >> +

> >> +struct ct_l4_proto ct_proto_other = {

> >> +    .new_conn = other_new_conn,

> >> +    .valid_new = other_valid_new,

> >> +    .conn_update = other_conn_update,

> >> +};

> >> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h

> >> new file mode 100644

> >> index 0000000..e668c44

> >> --- /dev/null

> >> +++ b/lib/conntrack-private.h

> >> @@ -0,0 +1,77 @@

> >> +/*

> >> + * Copyright (c) 2015, 2016 Nicira, Inc.

> >> + *

> >> + * Licensed under the Apache License, Version 2.0 (the

> "License");

> >> + * you may not use this file except in compliance with the

> License.

> >> + * You may obtain a copy of the License at:

> >> + *

> >> + *     http://www.apache.org/licenses/LICENSE-2.0

> >> + *

> >> + * Unless required by applicable law or agreed to in writing,

> software

> >> + * distributed under the License is distributed on an "AS IS"

> BASIS,

> >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express

> or

> >> implied.

> >> + * See the License for the specific language governing

> permissions and

> >> + * limitations under the License.

> >> + */

> >> +

> >> +#ifndef CONNTRACK_PRIVATE_H

> >> +#define CONNTRACK_PRIVATE_H 1

> >> +

> >> +#include <sys/types.h>

> >> +#include <netinet/in.h>

> >> +#include <netinet/ip6.h>

> >> +

> >> +#include "hmap.h"

> >> +#include "openvswitch/types.h"

> >> +#include "packets.h"

> >> +#include "unaligned.h"

> >> +

> >> +struct ct_addr {

> >> +    union {

> >> +        ovs_16aligned_be32 ipv4;

> >> +        union ovs_16aligned_in6_addr ipv6;

> >> +        ovs_be32 ipv4_aligned;

> >> +        struct in6_addr ipv6_aligned;

> >> +    };

> >> +};

> >> +

> >> +struct ct_endpoint {

> >> +    struct ct_addr addr;

> >> +    ovs_be16 port;

> >> +};

> >> +

> >> +struct conn_key {

> >> +    struct ct_endpoint src;

> >> +    struct ct_endpoint dst;

> >> +

> >> +    ovs_be16 dl_type;

> >> +    uint8_t nw_proto;

> >> +    uint16_t zone;

> >> +};

> >> +

> >> +struct conn {

> >> +    struct conn_key key;

> >> +    struct conn_key rev_key;

> >> +    long long expiration;

> >> +    struct hmap_node node;

> >> +    uint32_t mark;

> >> +    ovs_u128 label;

> >> +};

> >> +

> >> +enum ct_update_res {

> >> +    CT_UPDATE_INVALID,

> >> +    CT_UPDATE_VALID,

> >> +    CT_UPDATE_NEW,

> >> +};

> >> +

> >> +struct ct_l4_proto {

> >> +    struct conn *(*new_conn)(struct dp_packet *pkt, long long

> now);

> >> +    bool (*valid_new)(struct dp_packet *pkt);

> >> +    enum ct_update_res (*conn_update)(struct conn *conn, struct

> dp_packet

> >> *pkt,

> >> +                                      bool reply, long long now);

> >> +};

> >> +

> >> +extern struct ct_l4_proto ct_proto_tcp;

> >> +extern struct ct_l4_proto ct_proto_other;

> >> +

> >> +#endif /* conntrack-private.h */

> >> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c

> >> new file mode 100644

> >> index 0000000..4d80038

> >> --- /dev/null

> >> +++ b/lib/conntrack-tcp.c

> >> @@ -0,0 +1,476 @@

> >> +/*-

> >> + * Copyright (c) 2001 Daniel Hartmeier

> >> + * Copyright (c) 2002 - 2008 Henning Brauer

> >> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>

> >> + * Copyright (c) 2015, 2016 Nicira, Inc.

> >> + * All rights reserved.

> >> + *

> >> + * Redistribution and use in source and binary forms, with or

> without

> >> + * modification, are permitted provided that the following

> conditions

> >> + * are met:

> >> + *

> >> + *    - Redistributions of source code must retain the above

> copyright

> >> + *      notice, this list of conditions and the following

> disclaimer.

> >> + *    - Redistributions in binary form must reproduce the above

> >> + *      copyright notice, this list of conditions and the

> following

> >> + *      disclaimer in the documentation and/or other materials

> provided

> >> + *      with the distribution.

> >> + *

> >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND

> >> CONTRIBUTORS

> >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT

> NOT

> >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

> >> FITNESS

> >> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

> >> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,

> >> INDIRECT,

> >> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

> >> (INCLUDING,

> >> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR

> >> SERVICES;

> >> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

> HOWEVER

> >> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

> >> STRICT

> >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING

> IN

> >> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF

> THE

> >> + * POSSIBILITY OF SUCH DAMAGE.

> >> + *

> >> + * Effort sponsored in part by the Defense Advanced Research

> Projects

> >> + * Agency (DARPA) and Air Force Research Laboratory, Air Force

> >> + * Materiel Command, USAF, under agreement number F30602-01-2-

> 0537.

> >> + *

> >> + *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $

> >> + */

> >> +

> >> +#include <config.h>

> >> +

> >> +#include "conntrack-private.h"

> >> +#include "ct-dpif.h"

> >> +#include "dp-packet.h"

> >> +#include "util.h"

> >> +

> >> +struct tcp_peer {

> >> +    enum ct_dpif_tcp_state state;

> >> +    uint32_t               seqlo;          /* Max sequence number

> sent     */

> >> +    uint32_t               seqhi;          /* Max the other end

> ACKd + win */

> >> +    uint16_t               max_win;        /* largest window (pre

> scaling) */

> >> +    uint8_t                wscale;         /* window scaling

> factor        */

> >> +};

> >> +

> >> +struct conn_tcp {

> >> +    struct conn up;

> >> +    struct tcp_peer peer[2];

> >> +};

> >> +

> >> +enum {

> >> +    TCPOPT_EOL,

> >> +    TCPOPT_NOP,

> >> +    TCPOPT_WINDOW = 3,

> >> +};

> >> +

> >> +/* TCP sequence numbers are 32 bit integers operated

> >> + * on with modular arithmetic.  These macros can be

> >> + * used to compare such integers. */

> >> +#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)

> >> +#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)

> >> +#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)

> >> +#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)

> >> +

> >> +#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))

> >> +#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))

> >> +

> >> +static struct conn_tcp*

> >> +conn_tcp_cast(const struct conn* conn)

> >> +{

> >> +    return CONTAINER_OF(conn, struct conn_tcp, up);

> >> +}

> >> +

> >> +/* pf does this in in pf_normalize_tcp(), and it is called only

> if scrub

> >> + * is enabled.  We're not scrubbing, but this check seems

> reasonable.  */

> >> +static bool

> >> +tcp_invalid_flags(uint16_t flags)

> >> +{

> >> +

> >> +    if (flags & TCP_SYN) {

> >> +        if (flags & TCP_RST) {

> >> +            return true;

> >> +        }

> >> +        if (flags & TCP_FIN) {

> >> +            /* Here pf removes the fin flag.  We simply mark the

> packet as

> >> +             * invalid */

> >> +            return true;

> >> +        }

> >> +    } else {

> >> +        /* Illegal packet */

> >> +        if (!(flags & (TCP_ACK|TCP_RST))) {

> >> +            return true;

> >> +        }

> >> +    }

> >> +

> >> +    if (!(flags & TCP_ACK)) {

> >> +        /* These flags are only valid if ACK is set */

> >> +        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags &

> TCP_URG)) {

> >> +            return true;

> >> +        }

> >> +    }

> >> +

> >> +    return false;

> >> +}

> >> +

> >> +#define TCP_MAX_WSCALE 14

> >> +#define CT_WSCALE_FLAG 0x80

> >> +#define CT_WSCALE_UNKNOWN 0x40

> >> +#define CT_WSCALE_MASK 0xf

> >> +

> >> +static uint8_t

> >> +tcp_get_wscale(const struct tcp_header *tcp)

> >> +{

> >> +    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;

> >> +    const uint8_t *opt = (const uint8_t *)(tcp + 1);

> >> +    uint8_t wscale = 0;

> >> +    uint8_t optlen;

> >> +

> >> +    while (len >= 3) {

> >> +        if (*opt == TCPOPT_EOL) {

> >> +            break;

> >> +        }

> >> +        switch (*opt) {

> >> +        case TCPOPT_NOP:

> >> +            opt++;

> >> +            len--;

> >> +            break;

> >> +        case TCPOPT_WINDOW:

> >> +            wscale = MIN(opt[2], TCP_MAX_WSCALE);

> >> +            wscale |= CT_WSCALE_FLAG;

> >> +            /* fall through */

> >> +        default:

> >> +            optlen = opt[1];

> >> +            if (optlen < 2) {

> >> +                optlen = 2;

> >> +            }

> >> +            len -= optlen;

> >> +            opt += optlen;

> >> +        }

> >> +    }

> >> +

> >> +    return wscale;

> >> +}

> >> +

> >> +static uint32_t

> >> +tcp_payload_length(struct dp_packet *pkt)

> >> +{

> >> +    return (char *) dp_packet_tail(pkt) -

> dp_packet_l2_pad_size(pkt)

> >> +           - (char *) dp_packet_get_tcp_payload(pkt);

> >> +}

> >> +

> >> +static void

> >> +update_expiration(struct conn_tcp *conn, long long now, long long

> interval)

> >> +{

> >> +    conn->up.expiration = now + interval;

> >> +}

> >> +

> >> +

> >> +static enum ct_update_res

> >> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool

> reply,

> >> +                long long now)

> >> +{

> >> +    struct conn_tcp *conn = conn_tcp_cast(conn_);

> >> +    struct tcp_header *tcp = dp_packet_l4(pkt);

> >> +    /* The peer that sent 'pkt' */

> >> +    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];

> >> +    /* The peer that should receive 'pkt' */

> >> +    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];

> >> +    uint8_t sws = 0, dws = 0;

> >> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

> >> +

> >> +    uint16_t win = ntohs(tcp->tcp_winsz);

> >> +    uint32_t ack, end, seq, orig_seq;

> >> +    uint32_t p_len = tcp_payload_length(pkt);

> >> +    int ackskew;

> >> +

> >> +    if (tcp_invalid_flags(tcp_flags)) {

> >> +        return CT_UPDATE_INVALID;

> >> +    }

> >> +

> >> +    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&

> >> +            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&

> >> +            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

> >> +        src->state = dst->state = CT_DPIF_TCPS_CLOSED;

> >> +        return CT_UPDATE_NEW;

> >> +    }

> >> +

> >> +    if (src->wscale & CT_WSCALE_FLAG

> >> +        && dst->wscale & CT_WSCALE_FLAG

> >> +        && !(tcp_flags & TCP_SYN)) {

> >> +

> >> +        sws = src->wscale & CT_WSCALE_MASK;

> >> +        dws = dst->wscale & CT_WSCALE_MASK;

> >> +

> >> +    } else if (src->wscale & CT_WSCALE_UNKNOWN

> >> +        && dst->wscale & CT_WSCALE_UNKNOWN

> >> +        && !(tcp_flags & TCP_SYN)) {

> >> +

> >> +        sws = TCP_MAX_WSCALE;

> >> +        dws = TCP_MAX_WSCALE;

> >> +    }

> >> +

> >> +    /*

> >> +     * Sequence tracking algorithm from Guido van Rooij's paper:

> >> +     *   http://www.madison-

> gurkha.com/publications/tcp_filtering/

> >> +     *      tcp_filtering.ps

> >> +     */

> >> +

> >> +    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));

> >> +    if (src->state < CT_DPIF_TCPS_SYN_SENT) {

> >> +        /* First packet from this end. Set its state */

> >> +

> >> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

> >> +

> >> +        end = seq + p_len;

> >> +        if (tcp_flags & TCP_SYN) {

> >> +            end++;

> >> +            if (dst->wscale & CT_WSCALE_FLAG) {

> >> +                src->wscale = tcp_get_wscale(tcp);

> >> +                if (src->wscale & CT_WSCALE_FLAG) {

> >> +                    /* Remove scale factor from initial window */

> >> +                    sws = src->wscale & CT_WSCALE_MASK;

> >> +                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);

> >> +                    dws = dst->wscale & CT_WSCALE_MASK;

> >> +                } else {

> >> +                    /* fixup other window */

> >> +                    dst->max_win <<= dst->wscale &

> >> +                        CT_WSCALE_MASK;

> >> +                    /* in case of a retrans SYN|ACK */

> >> +                    dst->wscale = 0;

> >> +                }

> >> +            }

> >> +        }

> >> +        if (tcp_flags & TCP_FIN) {

> >> +            end++;

> >> +        }

> >> +

> >> +        src->seqlo = seq;

> >> +        src->state = CT_DPIF_TCPS_SYN_SENT;

> >> +        /*

> >> +         * May need to slide the window (seqhi may have been set

> by

> >> +         * the crappy stack check or if we picked up the

> connection

> >> +         * after establishment)

> >> +         */

> >> +        if (src->seqhi == 1 ||

> >> +                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src-

> >seqhi)) {

> >> +            src->seqhi = end + MAX(1, dst->max_win << dws);

> >> +        }

> >> +        if (win > src->max_win) {

> >> +            src->max_win = win;

> >> +        }

> >> +

> >> +    } else {

> >> +        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

> >> +        end = seq + p_len;

> >> +        if (tcp_flags & TCP_SYN) {

> >> +            end++;

> >> +        }

> >> +        if (tcp_flags & TCP_FIN) {

> >> +            end++;

> >> +        }

> >> +    }

> >> +

> >> +    if ((tcp_flags & TCP_ACK) == 0) {

> >> +        /* Let it pass through the ack skew check */

> >> +        ack = dst->seqlo;

> >> +    } else if ((ack == 0

> >> +                && (tcp_flags & (TCP_ACK|TCP_RST)) ==

> (TCP_ACK|TCP_RST))

> >> +               /* broken tcp stacks do not set ack */) {

> >> +        /* Many stacks (ours included) will set the ACK number in

> an

> >> +         * FIN|ACK if the SYN times out -- no sequence to ACK. */

> >> +        ack = dst->seqlo;

> >> +    }

> >> +

> >> +    if (seq == end) {

> >> +        /* Ease sequencing restrictions on no data packets */

> >> +        seq = src->seqlo;

> >> +        end = seq;

> >> +    }

> >> +

> >> +    ackskew = dst->seqlo - ack;

> >> +#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary

> fudge

> >> factor */

> >> +    if (SEQ_GEQ(src->seqhi, end)

> >> +        /* Last octet inside other's window space */

> >> +        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))

> >> +        /* Retrans: not more than one window back */

> >> +        && (ackskew >= -MAXACKWINDOW)

> >> +        /* Acking not more than one reassembled fragment

> backwards */

> >> +        && (ackskew <= (MAXACKWINDOW << sws))

> >> +        /* Acking not more than one window forward */

> >> +        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo

> >> +            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 ==

> src->seqlo))) {

> >> +        /* Require an exact/+1 sequence match on resets when

> possible */

> >> +

> >> +        /* update max window */

> >> +        if (src->max_win < win) {

> >> +            src->max_win = win;

> >> +        }

> >> +        /* synchronize sequencing */

> >> +        if (SEQ_GT(end, src->seqlo)) {

> >> +            src->seqlo = end;

> >> +        }

> >> +        /* slide the window of what the other end can send */

> >> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

> >> +            dst->seqhi = ack + MAX((win << sws), 1);

> >> +        }

> >> +

> >> +        /* update states */

> >> +        if (tcp_flags & TCP_SYN && src->state <

> CT_DPIF_TCPS_SYN_SENT) {

> >> +                src->state = CT_DPIF_TCPS_SYN_SENT;

> >> +        }

> >> +        if (tcp_flags & TCP_FIN && src->state <

> CT_DPIF_TCPS_CLOSING) {

> >> +                src->state = CT_DPIF_TCPS_CLOSING;

> >> +        }

> >> +        if (tcp_flags & TCP_ACK) {

> >> +            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {

> >> +                dst->state = CT_DPIF_TCPS_ESTABLISHED;

> >> +            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {

> >> +                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;

> >> +            }

> >> +        }

> >> +        if (tcp_flags & TCP_RST) {

> >> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

> >> +        }

> >> +

> >> +        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2

> >> +            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {

> >> +            update_expiration(conn, now, 30 * 1000);

> >> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

> >> +                   && dst->state >= CT_DPIF_TCPS_CLOSING) {

> >> +            update_expiration(conn, now, 45 * 1000);

> >> +        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED

> >> +                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {

> >> +            update_expiration(conn, now, 30 * 1000);

> >> +        } else if (src->state >= CT_DPIF_TCPS_CLOSING

> >> +                   || dst->state >= CT_DPIF_TCPS_CLOSING) {

> >> +            update_expiration(conn, now, 15 * 60 * 1000);

> >> +        } else {

> >> +            update_expiration(conn, now, 24 * 60 * 60 * 1000);

> >> +        }

> >> +    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT

> >> +                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2

> >> +                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)

> >> +               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)

> >> +               /* Within a window forward of the originating

> packet */

> >> +               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {

> >> +               /* Within a window backward of the originating

> packet */

> >> +

> >> +        /*

> >> +         * This currently handles three situations:

> >> +         *  1) Stupid stacks will shotgun SYNs before their peer

> >> +         *     replies.

> >> +         *  2) When PF catches an already established stream (the

> >> +         *     firewall rebooted, the state table was flushed,

> routes

> >> +         *     changed...)

> >> +         *  3) Packets get funky immediately after the connection

> >> +         *     closes (this should catch Solaris spurious

> ACK|FINs

> >> +         *     that web servers like to spew after a close)

> >> +         *

> >> +         * This must be a little more careful than the above code

> >> +         * since packet floods will also be caught here. We don't

> >> +         * update the TTL here to mitigate the damage of a packet

> >> +         * flood and so the same code can handle awkward

> establishment

> >> +         * and a loosened connection close.

> >> +         * In the establishment case, a correct peer response

> will

> >> +         * validate the connection, go through the normal state

> code

> >> +         * and keep updating the state TTL.

> >> +         */

> >> +

> >> +        /* update max window */

> >> +        if (src->max_win < win) {

> >> +            src->max_win = win;

> >> +        }

> >> +        /* synchronize sequencing */

> >> +        if (SEQ_GT(end, src->seqlo)) {

> >> +            src->seqlo = end;

> >> +        }

> >> +        /* slide the window of what the other end can send */

> >> +        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {

> >> +            dst->seqhi = ack + MAX((win << sws), 1);

> >> +        }

> >> +

> >> +        /*

> >> +         * Cannot set dst->seqhi here since this could be a

> shotgunned

> >> +         * SYN and not an already established connection.

> >> +         */

> >> +

> >> +        if (tcp_flags & TCP_FIN && src->state <

> CT_DPIF_TCPS_CLOSING) {

> >> +            src->state = CT_DPIF_TCPS_CLOSING;

> >> +        }

> >> +

> >> +        if (tcp_flags & TCP_RST) {

> >> +            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;

> >> +        }

> >> +    } else {

> >> +        return CT_UPDATE_INVALID;

> >> +    }

> >> +

> >> +    return CT_UPDATE_VALID;

> >> +}

> >> +

> >> +static bool

> >> +tcp_valid_new(struct dp_packet *pkt)

> >> +{

> >> +    struct tcp_header *tcp = dp_packet_l4(pkt);

> >> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

> >> +

> >> +    if (tcp_invalid_flags(tcp_flags)) {

> >> +        return false;

> >> +    }

> >> +

> >> +    /* A syn+ack is not allowed to create a connection.  We want

> to allow

> >> +     * totally new connections (syn) or already established, not

> partially

> >> +     * open (syn+ack). */

> >> +    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {

> >> +        return false;

> >> +    }

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +static struct conn *

> >> +tcp_new_conn(struct dp_packet *pkt, long long now)

> >> +{

> >> +    struct conn_tcp* newconn = NULL;

> >> +    struct tcp_header *tcp = dp_packet_l4(pkt);

> >> +    struct tcp_peer *src, *dst;

> >> +    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

> >> +

> >> +    newconn = xzalloc(sizeof(struct conn_tcp));

> >> +

> >> +    src = &newconn->peer[0];

> >> +    dst = &newconn->peer[1];

> >> +

> >> +    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));

> >> +    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;

> >> +

> >> +    if (tcp_flags & TCP_SYN) {

> >> +        src->seqhi++;

> >> +        src->wscale = tcp_get_wscale(tcp);

> >> +    } else {

> >> +        src->wscale = CT_WSCALE_UNKNOWN;

> >> +        dst->wscale = CT_WSCALE_UNKNOWN;

> >> +    }

> >> +    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);

> >> +    if (src->wscale & CT_WSCALE_MASK) {

> >> +        /* Remove scale factor from initial window */

> >> +        uint8_t sws = src->wscale & CT_WSCALE_MASK;

> >> +        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 <<

> sws);

> >> +    }

> >> +    if (tcp_flags & TCP_FIN) {

> >> +        src->seqhi++;

> >> +    }

> >> +    dst->seqhi = 1;

> >> +    dst->max_win = 1;

> >> +    src->state = CT_DPIF_TCPS_SYN_SENT;

> >> +    dst->state = CT_DPIF_TCPS_CLOSED;

> >> +

> >> +    update_expiration(newconn, now, 30 * 1000);

> >> +

> >> +    return &newconn->up;

> >> +}

> >> +

> >> +struct ct_l4_proto ct_proto_tcp = {

> >> +    .new_conn = tcp_new_conn,

> >> +    .valid_new = tcp_valid_new,

> >> +    .conn_update = tcp_conn_update,

> >> +};

> >> diff --git a/lib/conntrack.c b/lib/conntrack.c

> >> new file mode 100644

> >> index 0000000..840335b

> >> --- /dev/null

> >> +++ b/lib/conntrack.c

> >> @@ -0,0 +1,851 @@

> >> +/*

> >> + * Copyright (c) 2015, 2016 Nicira, Inc.

> >> + *

> >> + * Licensed under the Apache License, Version 2.0 (the

> "License");

> >> + * you may not use this file except in compliance with the

> License.

> >> + * You may obtain a copy of the License at:

> >> + *

> >> + *     http://www.apache.org/licenses/LICENSE-2.0

> >> + *

> >> + * Unless required by applicable law or agreed to in writing,

> software

> >> + * distributed under the License is distributed on an "AS IS"

> BASIS,

> >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express

> or

> >> implied.

> >> + * See the License for the specific language governing

> permissions and

> >> + * limitations under the License.

> >> + */

> >> +

> >> +#include <config.h>

> >> +#include "conntrack.h"

> >> +

> >> +#include <errno.h>

> >> +#include <sys/types.h>

> >> +#include <netinet/in.h>

> >> +#include <netinet/icmp6.h>

> >> +

> >> +#include "bitmap.h"

> >> +#include "conntrack-private.h"

> >> +#include "dp-packet.h"

> >> +#include "flow.h"

> >> +#include "hmap.h"

> >> +#include "netdev.h"

> >> +#include "odp-netlink.h"

> >> +#include "openvswitch/vlog.h"

> >> +#include "ovs-rcu.h"

> >> +#include "random.h"

> >> +#include "timeval.h"

> >> +

> >> +VLOG_DEFINE_THIS_MODULE(conntrack);

> >> +

> >> +struct conn_lookup_ctx {

> >> +    struct conn_key key;

> >> +    struct conn *conn;

> >> +    uint32_t hash;

> >> +    bool reply;

> >> +    bool related;

> >> +};

> >> +

> >> +static bool conn_key_extract(struct conntrack *, struct dp_packet

> *,

> >> +                             struct conn_lookup_ctx *, uint16_t

> zone);

> >> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t

> basis);

> >> +static void conn_key_reverse(struct conn_key *);

> >> +static void conn_key_lookup(struct conntrack *ct,

> >> +                            struct conn_lookup_ctx *ctx,

> >> +                            unsigned bucket,

> >> +                            long long now);

> >> +static bool valid_new(struct dp_packet *pkt, struct conn_key *);

> >> +static struct conn *new_conn(struct dp_packet *pkt, struct

> conn_key *,

> >> +                             long long now);

> >> +static void delete_conn(struct conn *);

> >> +static enum ct_update_res conn_update(struct conn *, struct

> dp_packet*,

> >> +                                      bool reply, long long now);

> >> +static bool conn_expired(struct conn *, long long now);

> >> +static void set_mark(struct dp_packet *, struct conn *,

> >> +                     uint32_t val, uint32_t mask);

> >> +static void set_label(struct dp_packet *, struct conn *,

> >> +                      const struct ovs_key_ct_labels *val,

> >> +                      const struct ovs_key_ct_labels *mask);

> >> +

> >> +static struct ct_l4_proto *l4_protos[] = {

> >> +    [IPPROTO_TCP] = &ct_proto_tcp,

> >> +    [IPPROTO_UDP] = &ct_proto_other,

> >> +    [IPPROTO_ICMP] = &ct_proto_other,

> >> +};

> >> +

> >> +/* Initializes the connection tracker 'ct'.  The caller is

> responsible for

> >> + * calling 'conntrack_destroy()', when the instance is not needed

> anymore */

> >> +void

> >> +conntrack_init(struct conntrack *ct)

> >> +{

> >> +    unsigned i;

> >> +

> >> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

> >> +        ct_lock_init(&ct->locks[i]);

> >> +        ct_lock_lock(&ct->locks[i]);

> >> +        hmap_init(&ct->connections[i]);

> >> +        ct_lock_unlock(&ct->locks[i]);

> >> +    }

> >> +    ct->hash_basis = random_uint32();

> >> +    ct->purge_bucket = 0;

> >> +    ct->purge_inner_bucket = 0;

> >> +    ct->purge_inner_offset = 0;

> >> +}

> >> +

> >> +/* Destroys the connection tracker 'ct' and frees all the

> allocated memory. */

> >> +void

> >> +conntrack_destroy(struct conntrack *ct)

> >> +{

> >> +    unsigned i;

> >> +

> >> +    for (i = 0; i < CONNTRACK_BUCKETS; i++) {

> >> +        struct conn *conn, *next;

> >> +

> >> +        ct_lock_lock(&ct->locks[i]);

> >> +        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i])

> {

> >> +            hmap_remove(&ct->connections[i], &conn->node);

> >> +            delete_conn(conn);

> >> +        }

> >> +        hmap_destroy(&ct->connections[i]);

> >> +        ct_lock_unlock(&ct->locks[i]);

> >> +        ct_lock_destroy(&ct->locks[i]);

> >> +    }

> >> +}

> >> +

> >

> >> +static unsigned hash_to_bucket(uint32_t hash)

> >> +{

> >> +    /* Extracts the most significant bits in hash. The least

> significant bits

> >> +     * are already used internally by the hmap implementation. */

> >> +    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 &&

> >> CONNTRACK_BUCKETS_SHIFT >= 1);

> >> +

> >> +    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) %

> >> CONNTRACK_BUCKETS;

> >> +}

> >> +

> >> +static void

> >> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,

> >> +            uint32_t mark, ovs_u128 label)

> >> +{

> >> +    pkt->md.ct_state = state | CS_TRACKED;

> >> +    pkt->md.ct_zone = zone;

> >> +    pkt->md.ct_mark = mark;

> >> +    pkt->md.ct_label = label;

> >> +}

> >> +

> >> +static struct conn *

> >> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

> >> +               struct conn_lookup_ctx *ctx, uint8_t *state, bool

> commit,

> >> +               long long now)

> >> +{

> >> +    unsigned bucket = hash_to_bucket(ctx->hash);

> >> +    struct conn *nc = NULL;

> >> +

> >> +    if (!valid_new(pkt, &ctx->key)) {

> >> +        *state |= CS_INVALID;

> >> +        return nc;

> >> +    }

> >> +

> >> +    *state |= CS_NEW;

> >> +

> >> +    if (commit) {

> >> +        nc = new_conn(pkt, &ctx->key, now);

> >> +

> >> +        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

> >> +

> >> +        conn_key_reverse(&nc->rev_key);

> >> +        hmap_insert(&ct->connections[bucket], &nc->node, ctx-

> >hash);

> >> +    }

> >> +

> >> +    return nc;

> >> +}

> >> +

> >> +static struct conn *

> >> +process_one(struct conntrack *ct, struct dp_packet *pkt,

> >> +            struct conn_lookup_ctx *ctx, uint16_t zone,

> >> +            bool commit, long long now)

> >> +{

> >> +    unsigned bucket = hash_to_bucket(ctx->hash);

> >> +    struct conn *conn = ctx->conn;

> >> +    uint8_t state = 0;

> >> +

> >> +    if (conn) {

> >> +        if (ctx->related) {

> >> +            state |= CS_RELATED;

> >> +            if (ctx->reply) {

> >> +                state |= CS_REPLY_DIR;

> >> +            }

> >> +        } else {

> >> +            enum ct_update_res res;

> >> +

> >> +            res = conn_update(conn, pkt, ctx->reply, now);

> >> +

> >> +            switch (res) {

> >> +            case CT_UPDATE_VALID:

> >> +                state |= CS_ESTABLISHED;

> >> +                if (ctx->reply) {

> >> +                    state |= CS_REPLY_DIR;

> >> +                }

> >> +                break;

> >> +            case CT_UPDATE_INVALID:

> >> +                state |= CS_INVALID;

> >> +                break;

> >> +            case CT_UPDATE_NEW:

> >> +                hmap_remove(&ct->connections[bucket], &conn-

> >node);

> >> +                delete_conn(conn);

> >> +                conn = conn_not_found(ct, pkt, ctx, &state,

> commit, now);

> >> +                break;

> >

> >[Antonio F]

> >I see that conn_update can return just CT_UPDATE_VALID/INVALID/NEW

> >but should we consider a 'default' case here, eg

> >default:

> >    state |= CS_INVALID;

> >    break;

> >?

> 

> I'm not too worried about that because if we add another value to the

> enum without explicitly handling it here we will get a warning.

> 

> Is there a particular case you're worried about?


[Antonio F] 
No, there's no particular case. It's just for a safer approach 
to handle any possible situation, future changes and so on.


> 

> >

> >

> >> +            }

> >> +        }

> >> +

> >> +        pkt->md.ct_label = conn->label;

> >> +        pkt->md.ct_mark = conn->mark;

> >

> >[Antonio F]

> >Can we skip the previous 2 lines as they are implemented into

> write_ct_md ?

> 

> Yes we can, thank you for spotting this!


[Antonio F] np


> 

> >

> >

> >> +        write_ct_md(pkt, state, zone, conn->mark, conn->label);

> >> +    } else {

> >> +        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);

> >> +        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});

> >> +    }

> >> +

> >> +    return conn;

> >> +}

> >> +

> >> +/* Sends a group of 'cnt' packets ('pkts') through the connection

> tracker

> >> + * 'ct'.  If 'commit' is true, the packets are allowed to create

> new entries

> >> + * in the connection tables.  'setmark', if not NULL, should

> point to a two

> >> + * elements array containing a value and a mask to set the

> connection mark.

> >> + * 'setlabel' behaves similarly for the connection label.*/

> >> +int

> >> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts,

> size_t cnt,

> >> +                  bool commit, uint16_t zone, const uint32_t

> *setmark,

> >> +                  const struct ovs_key_ct_labels *setlabel,

> >> +                  const char *helper)

> >> +{

> >> +#if !defined(__CHECKER__) && !defined(_WIN32)

> >> +    const size_t KEY_ARRAY_SIZE = cnt;

> >> +#else

> >> +    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };

> >> +#endif

> >> +    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];

> >> +    int8_t bucket_list[CONNTRACK_BUCKETS];

> >> +    struct {

> >> +        unsigned bucket;

> >> +        unsigned long maps;

> >> +    } arr[KEY_ARRAY_SIZE];

> >> +    long long now = time_msec();

> >> +    size_t i = 0;

> >> +    uint8_t arrcnt = 0;

> >> +

> >> +    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >=

> >> NETDEV_MAX_BURST);

> >> +

> >> +    if (helper) {

> >> +        static struct vlog_rate_limit rl =

> VLOG_RATE_LIMIT_INIT(5, 5);

> >> +

> >> +        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported",

> helper);

> >> +        /* Continue without the helper */

> >> +    }

> >> +

> >> +    memset(bucket_list, INT8_C(-1), sizeof bucket_list);

> >> +    for (i = 0; i < cnt; i++) {

> >> +        unsigned bucket;

> >> +

> >> +        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {

> >> +            write_ct_md(pkts[i], CS_INVALID, zone, 0,

> (ovs_u128){{0}});

> >> +            continue;

> >> +        }

> >> +

> >> +        bucket = hash_to_bucket(ctxs[i].hash);

> >> +        if (bucket_list[bucket] == INT8_C(-1)) {

> >> +            bucket_list[bucket] = arrcnt;

> >> +

> >> +            arr[arrcnt].maps = 0;

> >> +            ULLONG_SET1(arr[arrcnt].maps, i);

> >> +            arr[arrcnt++].bucket = bucket;

> >> +        } else {

> >> +            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);

> >> +            arr[bucket_list[bucket]].maps |= 1UL << i;

> >> +        }

> >> +    }

> >> +

> >> +    for (i = 0; i < arrcnt; i++) {

> >> +        size_t j;

> >> +

> >> +        ct_lock_lock(&ct->locks[arr[i].bucket]);

> >> +

> >> +        ULLONG_FOR_EACH_1(j, arr[i].maps) {

> >> +            struct conn *conn;

> >> +

> >> +            conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now);

> >> +

> >> +            conn = process_one(ct, pkts[j], &ctxs[j], zone,

> commit, now);

> >> +

> >> +            if (conn && setmark) {

> >> +                set_mark(pkts[j], conn, setmark[0], setmark[1]);

> >> +            }

> >> +

> >> +            if (conn && setlabel) {

> >> +                set_label(pkts[j], conn, &setlabel[0],

> &setlabel[1]);

> >> +            }

> >> +        }

> >> +        ct_lock_unlock(&ct->locks[arr[i].bucket]);

> >> +    }

> >> +

> >> +    return 0;

> >> +}

> >> +

> >> +static void

> >> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val,

> uint32_t

> >> mask)

> >> +{

> >> +    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));

> >> +    conn->mark = pkt->md.ct_mark;

> >> +}

> >> +

> >> +static void

> >> +set_label(struct dp_packet *pkt, struct conn *conn,

> >> +          const struct ovs_key_ct_labels *val,

> >> +          const struct ovs_key_ct_labels *mask)

> >> +{

> >> +    ovs_u128 v, m;

> >> +

> >> +    memcpy(&v, val, sizeof v);

> >> +    memcpy(&m, mask, sizeof m);

> >> +

> >> +    pkt->md.ct_label.u64.lo = v.u64.lo

> >> +                              | (pkt->md.ct_label.u64.lo &

> ~(m.u64.lo));

> >> +    pkt->md.ct_label.u64.hi = v.u64.hi

> >> +                              | (pkt->md.ct_label.u64.hi &

> ~(m.u64.hi));

> >> +    conn->label = pkt->md.ct_label;

> >> +}

> >> +

> >

> >> +#define CONNTRACK_PURGE_NUM 256

> >> +

> >> +static void

> >> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,

> >> +             uint32_t *inner_offset, unsigned *left, long long

> now)

> >> +{

> >> +    while (*left != 0) {

> >> +        struct hmap_node *node;

> >> +        struct conn *conn;

> >> +

> >> +        node = hmap_at_position(bucket, inner_bucket,

> inner_offset);

> >> +

> >> +        if (!node) {

> >> +            hmap_shrink(bucket);

> >> +            break;

> >> +        }

> >> +

> >> +        INIT_CONTAINER(conn, node, node);

> >> +        if (conn_expired(conn, now)) {

> >> +            hmap_remove(bucket, &conn->node);

> >> +            delete_conn(conn);

> >> +            (*left)--;

> >> +        }

> >> +    }

> >> +}

> >> +

> >> +/* Cleans up old connection entries.  Should be called

> periodically. */

> >> +void

> >> +conntrack_run(struct conntrack *ct)

> >> +{

> >> +    unsigned bucket = hash_to_bucket(ct->purge_bucket);

> >> +    uint32_t inner_bucket = ct->purge_inner_bucket,

> >> +             inner_offset = ct->purge_inner_offset;

> >> +    unsigned left = CONNTRACK_PURGE_NUM;

> >> +    long long now = time_msec();

> >> +

> >> +    while (bucket < CONNTRACK_BUCKETS) {

> >> +        ct_lock_lock(&ct->locks[bucket]);

> >> +        sweep_bucket(&ct->connections[bucket],

> >> +                     &inner_bucket, &inner_offset,

> >> +                     &left, now);

> >> +        ct_lock_unlock(&ct->locks[bucket]);

> >> +

> >> +        if (left == 0) {

> >> +            break;

> >> +        } else {

> >> +            bucket++;

> >> +        }

> >> +    }

> >> +

> >> +    ct->purge_bucket = bucket;

> >> +    ct->purge_inner_bucket = inner_bucket;

> >> +    ct->purge_inner_offset = inner_offset;

> >> +}

> >> +

> >

> >> +/* Key extraction */

> >> +

> >> +/* The function stores a pointer to the first byte after the

> header in

> >> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the

> caller is

> >> + * not interested in the header's tail,  meaning that the header

> has

> >> + * already been parsed (e.g. by flow_extract): we take this as a

> hint to

> >> + * save a few checks. */

> >> +static inline bool

> >> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t

> size,

> >> +                const char **new_data)

> >> +{

> >> +    const struct ip_header *ip = data;

> >> +

> >> +    if (new_data) {

> >> +        size_t ip_len;

> >> +

> >> +        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {

> >> +            return false;

> >> +        }

> >> +        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

> >> +

> >> +        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {

> >> +            return false;

> >> +        }

> >> +        if (OVS_UNLIKELY(size < ip_len)) {

> >> +            return false;

> >> +        }

> >> +

> >> +        *new_data = (char *) data + ip_len;

> >> +    }

> >> +

> >> +    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {

> >> +        return false;

> >> +    }

> >> +

> >> +    key->src.addr.ipv4 = ip->ip_src;

> >> +    key->dst.addr.ipv4 = ip->ip_dst;

> >> +    key->nw_proto = ip->ip_proto;

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +/* The function stores a pointer to the first byte after the

> header in

> >> + * '*new_data', if 'new_data' is not NULL.  If it is NULL, the

> caller is

> >> + * not interested in the header's tail,  meaning that the header

> has

> >> + * already been parsed (e.g. by flow_extract): we take this as a

> hint to

> >> + * save a few checks. */

> >> +static inline bool

> >> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t

> size,

> >> +                const char **new_data)

> >> +{

> >> +    const struct ovs_16aligned_ip6_hdr *ip6 = data;

> >> +    uint8_t nw_proto = ip6->ip6_nxt;

> >> +    uint8_t nw_frag = 0;

> >> +

> >> +    if (new_data) {

> >> +        if (OVS_UNLIKELY(size < sizeof *ip6)) {

> >> +            return false;

> >> +        }

> >> +    }

> >> +

> >> +    data = ip6 + 1;

> >> +    size -=  sizeof *ip6;

> >> +

> >> +    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag))

> {

> >> +        return false;

> >> +    }

> >> +

> >> +    if (new_data) {

> >> +        *new_data = data;

> >> +    }

> >> +

> >> +    if (nw_frag) {

> >> +        return false;

> >> +    }

> >> +

> >> +    key->src.addr.ipv6 = ip6->ip6_src;

> >> +    key->dst.addr.ipv6 = ip6->ip6_dst;

> >> +    key->nw_proto = nw_proto;

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +static inline bool

> >> +check_l4_tcp(const void *data, size_t size)

> >> +{

> >> +    const struct tcp_header *tcp = data;

> >> +    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

> >> +

> >> +    if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size))

> {

> >> +        return true;

> >> +    }

> >> +

> >> +    return false;

> >> +}

> >> +

> >> +static inline bool

> >> +extract_l4_tcp(struct conn_key *key, const void *data, size_t

> size)

> >> +{

> >> +    const struct tcp_header *tcp = data;

> >> +

> >> +    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {

> >> +        return false;

> >> +    }

> >> +

> >> +    key->src.port = tcp->tcp_src;

> >> +    key->dst.port = tcp->tcp_dst;

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +static inline bool

> >> +extract_l4_udp(struct conn_key *key, const void *data, size_t

> size)

> >> +{

> >> +    const struct udp_header *udp = data;

> >> +

> >> +    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {

> >> +        return false;

> >> +    }

> >> +

> >> +    key->src.port = udp->udp_src;

> >> +    key->dst.port = udp->udp_dst;

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +static inline bool extract_l4(struct conn_key *key, const void

> *data,

> >> +                              size_t size, bool *related);

> >> +

> >> +/* If 'related' is not NULL and the function is processing an

> ICMP

> >> + * error packet, extract the l3 and l4 fields from the nested

> header

> >> + * instead and set *related to true.  If 'related' is NULL we're

> >> + * already processing a nested header and no such recursion is

> >> + * possible */

> >> +static inline int

> >> +extract_l4_icmp(struct conn_key *key, const void *data, size_t

> size,

> >> +                bool *related)

> >> +{

> >> +    const struct icmp_header *icmp = data;

> >> +

> >> +    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {

> >> +        return false;

> >> +    }

> >> +

> >> +    switch (icmp->icmp_type) {

> >> +    case ICMP4_ECHO_REQUEST:

> >> +    case ICMP4_ECHO_REPLY:

> >> +    case ICMP4_TIMESTAMP:

> >> +    case ICMP4_TIMESTAMPREPLY:

> >> +    case ICMP4_INFOREQUEST:

> >> +    case ICMP4_INFOREPLY:

> >> +        /* Separate ICMP connection: identified using id */

> >> +        key->src.port = key->dst.port = icmp-

> >icmp_fields.echo.id;

> >> +        break;

> >> +    case ICMP4_DST_UNREACH:

> >> +    case ICMP4_TIME_EXCEEDED:

> >> +    case ICMP4_PARAM_PROB:

> >> +    case ICMP4_SOURCEQUENCH:

> >> +    case ICMP4_REDIRECT: {

> >> +        /* ICMP packet part of another connection. We should

> >> +         * extract the key from embedded packet header */

> >> +        struct conn_key inner_key;

> >> +        const char *l3 = (const char *) (icmp + 1);

> >> +        const char *tail = (const char *) data + size;

> >> +        const char *l4;

> >> +        bool ok;

> >> +

> >> +        if (!related) {

> >> +            return false;

> >> +        }

> >> +        *related = true;

> >> +

> >> +        memset(&inner_key, 0, sizeof inner_key);

> >> +        inner_key.dl_type = htons(ETH_TYPE_IP);

> >> +        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);

> >> +        if (!ok) {

> >> +            return false;

> >> +        }

> >> +

> >> +        /* pf doesn't do this, but it seems a good idea */

> >> +        if (inner_key.src.addr.ipv4_aligned != key-

> >dst.addr.ipv4_aligned

> >> +            || inner_key.dst.addr.ipv4_aligned != key-

> >src.addr.ipv4_aligned) {

> >> +            return false;

> >> +        }

> >> +

> >> +        key->src = inner_key.src;

> >> +        key->dst = inner_key.dst;

> >> +        key->nw_proto = inner_key.nw_proto;

> >> +

> >> +        ok = extract_l4(key, l4, tail - l4, NULL);

> >> +        if (ok) {

> >> +            conn_key_reverse(key);

> >> +        }

> >> +        return ok;

> >> +    }

> >> +    default:

> >> +        return false;

> >> +    }

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +/* If 'related' is not NULL and the function is processing an

> ICMP

> >> + * error packet, extract the l3 and l4 fields from the nested

> header

> >> + * instead and set *related to true.  If 'related' is NULL we're

> >> + * already processing a nested header and no such recursion is

> >> + * possible */

> >> +static inline bool

> >> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t

> size,

> >> +                 bool *related)

> >> +{

> >> +    const struct icmp6_header *icmp6 = data;

> >> +

> >> +    /* All the messages that we support need at least 4 bytes

> after

> >> +     * the header */

> >> +    if (size < sizeof *icmp6 + 4) {

> >> +        return false;

> >> +    }

> >> +

> >> +    switch (icmp6->icmp6_type) {

> >> +    case ICMP6_ECHO_REQUEST:

> >> +    case ICMP6_ECHO_REPLY:

> >> +        /* Separate ICMP connection: identified using id */

> >> +        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 +

> 1);

> >> +        break;

> >> +    case ICMP6_DST_UNREACH:

> >> +    case ICMP6_PACKET_TOO_BIG:

> >> +    case ICMP6_TIME_EXCEEDED:

> >> +    case ICMP6_PARAM_PROB:{

> >> +        /* ICMP packet part of another connection. We should

> >> +         * extract the key from embedded packet header */

> >> +        struct conn_key inner_key;

> >> +        const char *l3 = (const char *) icmp6 + 8;

> >> +        const char *tail = (const char *) data + size;

> >> +        const char *l4 = NULL;

> >> +        bool ok;

> >> +

> >> +        if (!related) {

> >> +            return false;

> >> +        }

> >> +        *related = true;

> >> +

> >> +        memset(&inner_key, 0, sizeof inner_key);

> >> +        inner_key.dl_type = htons(ETH_TYPE_IPV6);

> >> +        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);

> >> +        if (!ok) {

> >> +            return false;

> >> +        }

> >> +

> >> +        /* pf doesn't do this, but it seems a good idea */

> >> +        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,

> >> +                              &key->dst.addr.ipv6_aligned)

> >> +            ||

> !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,

> >> +                                 &key->src.addr.ipv6_aligned)) {

> >> +            return false;

> >> +        }

> >> +

> >> +        key->src = inner_key.src;

> >> +        key->dst = inner_key.dst;

> >> +        key->nw_proto = inner_key.nw_proto;

> >> +

> >> +        ok = extract_l4(key, l4, tail - l4, NULL);

> >> +        if (ok) {

> >> +            conn_key_reverse(key);

> >> +        }

> >> +        return ok;

> >> +    }

> >> +    default:

> >> +        return false;

> >> +    }

> >> +

> >> +    return true;

> >> +}

> >> +

> >> +/* Extract l4 fields into 'key', which must already contain valid

> l3

> >> + * members.  If 'related' is not NULL and an ICMP error packet is

> being

> >> + * processed, the function will extract the key from the packet

> nested

> >> + * in the ICMP payload and set '*related' to true.  If 'related'

> is NULL,

> >> + * nested parsing isn't allowed.  This is necessary to limit the

> >> + * recursion level. */

> >> +static inline bool

> >> +extract_l4(struct conn_key *key, const void *data, size_t size,

> bool *related)

> >> +{

> >> +    if (key->nw_proto == IPPROTO_TCP) {

> >> +        return extract_l4_tcp(key, data, size)

> >> +               && (!related || check_l4_tcp(data, size));

> >> +    } else if (key->nw_proto == IPPROTO_UDP) {

> >> +        return extract_l4_udp(key, data, size);

> >> +    } else if (key->dl_type == htons(ETH_TYPE_IP)

> >> +               && key->nw_proto == IPPROTO_ICMP) {

> >> +        return extract_l4_icmp(key, data, size, related);

> >> +    } else if (key->dl_type == htons(ETH_TYPE_IPV6)

> >> +               && key->nw_proto == IPPROTO_ICMPV6) {

> >> +        return extract_l4_icmp6(key, data, size, related);

> >> +    } else {

> >> +        return false;

> >> +    }

> >> +}

> >> +

> >> +static bool

> >> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,

> >> +                 struct conn_lookup_ctx *ctx, uint16_t zone)

> >> +{

> >> +    const struct eth_header *l2 = dp_packet_l2(pkt);

> >> +    const struct ip_header *l3 = dp_packet_l3(pkt);

> >> +    const char *l4 = dp_packet_l4(pkt);

> >> +    const char *tail = dp_packet_tail(pkt);

> >> +    bool ok;

> >> +

> >> +    memset(ctx, 0, sizeof *ctx);

> >> +

> >> +    if (!l2 || !l3 || !l4) {

> >> +        return false;

> >> +    }

> >> +

> >> +    ctx->key.zone = zone;

> >> +

> >> +    /* XXX In this function we parse the packet (again, it has

> already

> >> +     * gone through miniflow_extract()) for two reasons:

> >> +     *

> >> +     * 1) To extract the l3 addresses and l4 ports.

> >> +     *    We already have the l3 and l4 headers' pointers.

> Extracting

> >> +     *    the l3 addresses and the l4 ports is really cheap,

> since they

> >> +     *    can be found at fixed locations.

> >> +     * 2) To extract the l3 and l4 types.

> >> +     *    Extracting the l3 and l4 types (especially the l3[1])

> on the

> >> +     *    other hand is quite expensive, because they're not at a

> >> +     *    fixed location.

> >> +     *

> >> +     * Here's a way to avoid (2) with the help of the datapath.

> >> +     * The datapath doesn't keep the packet's extracted flow[2],

> so

> >> +     * using that is not an option.  We could use the packet's

> matching

> >> +     * megaflow, but we have to make sure that the l3 and l4

> types

> >> +     * are unwildcarded.  This means either:

> >> +     *

> >> +     * a) dpif-netdev unwildcards the l3 (and l4) types when a

> new flow

> >> +     *    is installed if the actions contains ct().  This is

> what the

> >> +     *    kernel datapath does.  It is not so straightforward,

> though.

> >> +     *

> >> +     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types

> when

> >> +     *    translating a ct() action.  This is already done in

> different

> >> +     *    actions and since both the userspace and the kernel

> datapath

> >> +     *    would benefit from it, it seems an appropriate place to

> do

> >> +     *    it.

> >> +     *

> >> +     * ---

> >> +     * [1] A simple benchmark (running only the connection

> tracker

> >> +     *     over and over on the same packets) shows that if the

> >> +     *     l3 type is already provided we are 15% faster (running

> the

> >> +     *     connection tracker over a couple of DPDK devices with

> a

> >> +     *     stream of UDP 64-bytes packets shows that we are 4%

> faster).

> >> +     *

> >> +     * [2] The reasons for this are that keeping the flow

> increases

> >> +     *     (slightly) the cache footprint and increases

> computation

> >> +     *     time as we move the packet around. Most importantly,

> the flow

> >> +     *     should be updated by the actions and this can be slow,

> as

> >> +     *     we use a sparse representation (miniflow).

> >> +     *

> >> +     */

> >> +    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *)

> l2);

> >> +    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {

> >

> >[Antonio F] In case we specify a flow with VLAN + CT like

> >add_flow brX table=0,in_port=1,vlan_tci=0xf123,tcp,ct_state=-

> trk,action=ct(commit,zone=9),2

> >should we also consider ETH_TYPE_VLAN in the previous conditional

> statement?

> 

> parse_dl_type() should skip vlans and return the L3 type. The

> connection tracker

> is interested in what's beyond the vlan tag.


[Antonio F] ok, got it.


> 

> >

> >

> >> +        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3,

> NULL);

> >> +    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {

> >> +        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3,

> NULL);

> >> +    } else {

> >> +        ok = false;

> >> +    }

> >> +

> >> +    if (ok) {

> >> +        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related))

> {

> >> +            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);

> >> +            return true;

> >> +        }

> >> +    }

> >> +

> >> +    return false;

> >> +}

> >> +

> >

> >> +/* Symmetric */

> >> +static uint32_t

> >> +conn_key_hash(const struct conn_key *key, uint32_t basis)

> >> +{

> >> +    uint32_t hsrc, hdst, hash;

> >> +    int i;

> >> +

> >> +    hsrc = hdst = basis;

> >> +

> >> +    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {

> >> +        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);

> >> +        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);

> >> +    }

> >> +

> >> +    hash = hsrc ^ hdst;

> >> +

> >> +    hash = hash_words((uint32_t *) &key->dst + 1,

> >> +                      (uint32_t *) (key + 1) - (uint32_t *)

> (&key->dst + 1),

> >> +                      hash);

> >

> >[Antonio F]

> >Is that to involve dl_type, nw_proto and zone in the hash

> computation?

> 

> Yes, I'll add a comment to make that clear

> 

> >

> >

> >> +

> >> +    return hash;

> >> +}

> >> +

> >> +static void

> >> +conn_key_reverse(struct conn_key *key)

> >> +{

> >> +    struct ct_endpoint tmp;

> >> +    tmp = key->src;

> >> +    key->src = key->dst;

> >> +    key->dst = tmp;

> >> +}

> >> +

> >> +static void

> >> +conn_key_lookup(struct conntrack *ct,

> >> +                struct conn_lookup_ctx *ctx,

> >> +                unsigned bucket,

> >> +                long long now)

> >> +{

> >> +    struct conn *conn, *found = NULL;

> >> +    uint32_t hash = ctx->hash;

> >> +    bool reply;

> >> +

> >> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct-

> >> >connections[bucket]) {

> >> +        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) {

> >> +            found = conn;

> >> +            reply = false;

> >> +            break;

> >> +        }

> >> +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn-

> >rev_key))) {

> >> +            found = conn;

> >> +            reply = true;

> >> +            break;

> >> +        }

> >> +    }

> >> +

> >> +    if (found) {

> >> +        if (conn_expired(found, now)) {

> >> +            found = NULL;

> >> +        } else {

> >> +            ctx->reply = reply;

> >> +        }

> >> +    }

> >> +

> >> +    ctx->conn = found;

> >> +}

> >> +

> >> +static enum ct_update_res

> >> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,

> >> +            long long now)

> >> +{

> >> +    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt,

> reply,

> >> now);

> >> +}

> >> +

> >> +static bool

> >> +conn_expired(struct conn *conn, long long now)

> >> +{

> >> +    return now >= conn->expiration;

> >> +}

> >> +

> >> +static bool

> >> +valid_new(struct dp_packet *pkt, struct conn_key *key)

> >> +{

> >> +    return l4_protos[key->nw_proto]->valid_new(pkt);

> >> +}

> >> +

> >> +static struct conn *

> >> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long

> now)

> >> +{

> >> +    struct conn *newconn;

> >> +

> >> +    newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);

> >> +

> >> +    if (newconn) {

> >> +        newconn->key = *key;

> >> +    }

> >> +

> >> +    return newconn;

> >> +}

> >> +

> >> +static void

> >> +delete_conn(struct conn *conn)

> >> +{

> >> +    free(conn);

> >> +}

> >> diff --git a/lib/conntrack.h b/lib/conntrack.h

> >> new file mode 100644

> >> index 0000000..8561273

> >> --- /dev/null

> >> +++ b/lib/conntrack.h

> >> @@ -0,0 +1,144 @@

> >> +/*

> >> + * Copyright (c) 2015, 2016 Nicira, Inc.

> >> + *

> >> + * Licensed under the Apache License, Version 2.0 (the

> "License");

> >> + * you may not use this file except in compliance with the

> License.

> >> + * You may obtain a copy of the License at:

> >> + *

> >> + *     http://www.apache.org/licenses/LICENSE-2.0

> >> + *

> >> + * Unless required by applicable law or agreed to in writing,

> software

> >> + * distributed under the License is distributed on an "AS IS"

> BASIS,

> >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express

> or

> >> implied.

> >> + * See the License for the specific language governing

> permissions and

> >> + * limitations under the License.

> >> + */

> >> +

> >> +#ifndef CONNTRACK_H

> >> +#define CONNTRACK_H 1

> >> +

> >> +#include <stdbool.h>

> >> +

> >> +#include "hmap.h"

> >> +#include "netdev-dpdk.h"

> >> +#include "odp-netlink.h"

> >> +#include "openvswitch/thread.h"

> >> +#include "openvswitch/types.h"

> >> +

> >> +

> >> +struct dp_packet;

> >> +

> >> +/* Userspace connection tracker

> >> + * ============================

> >> + *

> >> + * This is a connection tracking module that keeps all the state

> in userspace.

> >> + *

> >> + * Usage

> >> + * =====

> >> + *

> >> + *     struct conntrack ct;

> >> + *

> >> + * Initialization:

> >> + *

> >> + *     conntrack_init(&ct);

> >> + *

> >> + * It is necessary to periodically issue a call to

> >> + *

> >> + *     conntrack_run(&ct);

> >> + *

> >> + * to allow the module to clean up expired connections.

> >> + *

> >> + * To send a group of packets through the connection tracker:

> >> + *

> >> + *     conntrack_execute(&ct, pkts, n_pkts, ...);

> >> + *

> >> + * Thread-safety

> >> + * =============

> >> + *

> >> + * conntrack_execute() can be called by multiple threads

> simultaneously.

> >> + */

> >> +

> >> +struct conntrack;

> >> +

> >> +void conntrack_init(struct conntrack *);

> >> +void conntrack_run(struct conntrack *);

> >> +void conntrack_destroy(struct conntrack *);

> >> +

> >> +int conntrack_execute(struct conntrack *, struct dp_packet **,

> size_t,

> >> +                      bool commit, uint16_t zone, const uint32_t

> *setmark,

> >> +                      const struct ovs_key_ct_labels *setlabel,

> >> +                      const char *helper);

> >> +

> >

> >> +/* struct ct_lock is a standard mutex or a spinlock when using

> DPDK */

> >> +

> >> +#ifdef DPDK_NETDEV

> >> +struct OVS_LOCKABLE ct_lock {

> >> +    rte_spinlock_t lock;

> >> +};

> >> +

> >> +static inline void ct_lock_init(struct ct_lock *lock)

> >> +{

> >> +    rte_spinlock_init(&lock->lock);

> >> +}

> >> +

> >> +static inline void ct_lock_lock(struct ct_lock *lock)

> >> +    OVS_ACQUIRES(lock)

> >> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> >> +{

> >> +    rte_spinlock_lock(&lock->lock);

> >> +}

> >> +

> >> +static inline void ct_lock_unlock(struct ct_lock *lock)

> >> +    OVS_RELEASES(lock)

> >> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> >> +{

> >> +    rte_spinlock_unlock(&lock->lock);

> >> +}

> >> +

> >> +static inline void ct_lock_destroy(struct ct_lock *lock

> OVS_UNUSED)

> >> +{

> >> +}

> >> +#else

> >> +struct OVS_LOCKABLE ct_lock {

> >> +    struct ovs_mutex lock;

> >> +};

> >> +

> >> +static inline void ct_lock_init(struct ct_lock *lock)

> >> +{

> >> +    ovs_mutex_init(&lock->lock);

> >> +}

> >> +

> >> +static inline void ct_lock_lock(struct ct_lock *lock)

> >> +    OVS_ACQUIRES(lock)

> >> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> >> +{

> >> +    ovs_mutex_lock(&lock->lock);

> >> +}

> >> +

> >> +static inline void ct_lock_unlock(struct ct_lock *lock)

> >> +    OVS_RELEASES(lock)

> >> +    OVS_NO_THREAD_SAFETY_ANALYSIS

> >> +{

> >> +    ovs_mutex_unlock(&lock->lock);

> >> +}

> >> +

> >> +static inline void ct_lock_destroy(struct ct_lock *lock)

> >> +{

> >> +    ovs_mutex_destroy(&lock->lock);

> >> +}

> >> +#endif

> >> +

> >

> >> +#define CONNTRACK_BUCKETS_SHIFT 8

> >> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)

> >> +

> >> +struct conntrack {

> >> +    /* Each lock guards a 'connections' bucket */

> >> +    struct ct_lock locks[CONNTRACK_BUCKETS];

> >> +    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;

> >> +    uint32_t hash_basis;

> >> +    unsigned purge_bucket;

> >> +    uint32_t purge_inner_bucket;

> >> +    uint32_t purge_inner_offset;

> >> +};

> >> +

> >> +#endif /* conntrack.h */

> >> --

> >> 2.1.4

> >>

> >> _______________________________________________

> >> dev mailing list

> >> dev@openvswitch.org

> >> http://openvswitch.org/mailman/listinfo/dev
Flavio Leitner May 14, 2016, 4:20 a.m. UTC | #8
On Wed, Apr 27, 2016 at 06:36:20AM +0000, Daniele Di Proietto wrote:
> Thank you for your comments, Flavio!

Thank you for the patchset :-)

> 
> On 26/04/2016 16:35, "Flavio Leitner" <fbl@sysclose.org> wrote:
> 
> >On Fri, Apr 15, 2016 at 05:02:36PM -0700, Daniele Di Proietto wrote:
[...]

> >> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> >> new file mode 100644
> >> index 0000000..e668c44
> >> --- /dev/null
> >> +++ b/lib/conntrack-private.h
> >> @@ -0,0 +1,77 @@
> >> +/*
> >> + * Copyright (c) 2015, 2016 Nicira, Inc.
> >> + *
> >> + * Licensed under the Apache License, Version 2.0 (the "License");
> >> + * you may not use this file except in compliance with the License.
> >> + * You may obtain a copy of the License at:
> >> + *
> >> + *     http://www.apache.org/licenses/LICENSE-2.0
> >> + *
> >> + * Unless required by applicable law or agreed to in writing, software
> >> + * distributed under the License is distributed on an "AS IS" BASIS,
> >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> >> + * See the License for the specific language governing permissions and
> >> + * limitations under the License.
> >> + */
> >> +
> >> +#ifndef CONNTRACK_PRIVATE_H
> >> +#define CONNTRACK_PRIVATE_H 1
> >> +
> >> +#include <sys/types.h>
> >> +#include <netinet/in.h>
> >> +#include <netinet/ip6.h>
> >> +
> >> +#include "hmap.h"
> >> +#include "openvswitch/types.h"
> >> +#include "packets.h"
> >> +#include "unaligned.h"
> >> +
> >> +struct ct_addr {
> >> +    union {
> >> +        ovs_16aligned_be32 ipv4;
> >> +        union ovs_16aligned_in6_addr ipv6;
> >> +        ovs_be32 ipv4_aligned;
> >> +        struct in6_addr ipv6_aligned;
> >> +    };
> >> +};
> >> +
> >> +struct ct_endpoint {
> >> +    struct ct_addr addr;
> >> +    ovs_be16 port;
> >> +};
> >> +
> >> +struct conn_key {
> >> +    struct ct_endpoint src;
> >> +    struct ct_endpoint dst;
> >> +
> >> +    ovs_be16 dl_type;
> >> +    uint8_t nw_proto;
> >> +    uint16_t zone;
> >> +};
> >
> >Based on the above I presume we consider all IPs in the same space
> >regardless of the vlan, correct?
> 
> Yes, I think this is what the kernel connection tracker does.

That's right.


> >> +
> >> +struct conn {
> >> +    struct conn_key key;
> >> +    struct conn_key rev_key;
> >> +    long long expiration;
> >> +    struct hmap_node node;
> >> +    uint32_t mark;
> >> +    ovs_u128 label;
> >> +};
> >> +
> >> +enum ct_update_res {
> >> +    CT_UPDATE_INVALID,
> >> +    CT_UPDATE_VALID,
> >> +    CT_UPDATE_NEW,
> >> +};
> >> +
> >> +struct ct_l4_proto {
> >> +    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
> >> +    bool (*valid_new)(struct dp_packet *pkt);
> >> +    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
> >> +                                      bool reply, long long now);
> >> +};
> >> +
> >> +extern struct ct_l4_proto ct_proto_tcp;
> >> +extern struct ct_l4_proto ct_proto_other;
> >> +
> >> +#endif /* conntrack-private.h */
> >> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
> >> new file mode 100644
> >> index 0000000..4d80038
> >> --- /dev/null
> >> +++ b/lib/conntrack-tcp.c
> >> @@ -0,0 +1,476 @@
> >> +/*-
> >> + * Copyright (c) 2001 Daniel Hartmeier
> >> + * Copyright (c) 2002 - 2008 Henning Brauer
> >> + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
> >> + * Copyright (c) 2015, 2016 Nicira, Inc.
> >> + * All rights reserved.
> >> + *
> >> + * Redistribution and use in source and binary forms, with or without
> >> + * modification, are permitted provided that the following conditions
> >> + * are met:
> >> + *
> >> + *    - Redistributions of source code must retain the above copyright
> >> + *      notice, this list of conditions and the following disclaimer.
> >> + *    - Redistributions in binary form must reproduce the above
> >> + *      copyright notice, this list of conditions and the following
> >> + *      disclaimer in the documentation and/or other materials provided
> >> + *      with the distribution.
> >> + *
> >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
> >> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> >> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> >> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
> >> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
> >> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
> >> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
> >> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> >> + * POSSIBILITY OF SUCH DAMAGE.
> >> + *
> >> + * Effort sponsored in part by the Defense Advanced Research Projects
> >> + * Agency (DARPA) and Air Force Research Laboratory, Air Force
> >> + * Materiel Command, USAF, under agreement number F30602-01-2-0537.
> >> + *
> >> + *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
> >> + */
> >> +
> >> +#include <config.h>
> >> +
> >> +#include "conntrack-private.h"
> >> +#include "ct-dpif.h"
> >> +#include "dp-packet.h"
> >> +#include "util.h"
> >> +
> >> +struct tcp_peer {
> >> +    enum ct_dpif_tcp_state state;
> >> +    uint32_t               seqlo;          /* Max sequence number sent     */
> >> +    uint32_t               seqhi;          /* Max the other end ACKd + win */
> >> +    uint16_t               max_win;        /* largest window (pre scaling) */
> >> +    uint8_t                wscale;         /* window scaling factor        */
> >> +};
> >> +
> >> +struct conn_tcp {
> >> +    struct conn up;
> >> +    struct tcp_peer peer[2];
> >> +};
> >> +
> >> +enum {
> >> +    TCPOPT_EOL,
> >> +    TCPOPT_NOP,
> >> +    TCPOPT_WINDOW = 3,
> >> +};
> >> +
> >> +/* TCP sequence numbers are 32 bit integers operated
> >> + * on with modular arithmetic.  These macros can be
> >> + * used to compare such integers. */
> >> +#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)
> >> +#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)
> >> +#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)
> >> +#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)
> >> +
> >> +#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))
> >> +#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))
> >
> >I am okay to leave those here, but since they are generic
> >operations, maybe we could define somewhere else and just
> >redefine here as SEQ_ operations.
> 
> I am fine both ways. Do you have a suggestion for the name
> of the generic macro?

s/SEQ/INT/ ? Then put somewhere common and just redefine
here SEQ_LT() to be INT_LT()?  I am not good with names :-)
Just that it seems to be generic enough to be useful in 
other places.


[...]
> >> +static uint8_t
> >> +tcp_get_wscale(const struct tcp_header *tcp)
> >> +{
> >> +    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
> >> +    const uint8_t *opt = (const uint8_t *)(tcp + 1);
> >> +    uint8_t wscale = 0;
> >> +    uint8_t optlen;
> >> +
> >> +    while (len >= 3) {
> >> +        if (*opt == TCPOPT_EOL) {
> >> +            break;
> >> +        }
> >> +        switch (*opt) {
> >
> >Maybe we could do:
> >
> >           case TCPOPT_EOL:
> >               break;
> 
> I think 
> 
> The goal was to exit the while loop. I guess we could do
> 
>              case TCPOPT_EOL:
>                  return wscale;     

That looks better to me.

Thanks!
fbl
diff mbox

Patch

diff --git a/COPYING b/COPYING
index 308e3ea..afb98b9 100644
--- a/COPYING
+++ b/COPYING
@@ -25,6 +25,7 @@  License, version 2.
 The following files are licensed under the 2-clause BSD license.
     include/windows/getopt.h
     lib/getopt_long.c
+    lib/conntrack-tcp.c
 
 The following files are licensed under the 3-clause BSD-license
     include/windows/netinet/icmp6.h
diff --git a/debian/copyright.in b/debian/copyright.in
index 57d007a..a15f4dd 100644
--- a/debian/copyright.in
+++ b/debian/copyright.in
@@ -21,6 +21,9 @@  Upstream Copyright Holders:
 	Copyright (c) 2014 Michael Chapman
 	Copyright (c) 2014 WindRiver, Inc.
 	Copyright (c) 2014 Avaya, Inc.
+	Copyright (c) 2001 Daniel Hartmeier
+	Copyright (c) 2002 - 2008 Henning Brauer
+	Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 
 License:
 
@@ -90,6 +93,7 @@  License:
 	lib/getopt_long.c
 	include/windows/getopt.h
 	datapath-windows/ovsext/Conntrack-tcp.c
+	lib/conntrack-tcp.c
 
 * The following files are licensed under the 3-clause BSD-license
 
diff --git a/lib/automake.mk b/lib/automake.mk
index 1ec2115..ba30442 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -47,6 +47,11 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/compiler.h \
 	lib/connectivity.c \
 	lib/connectivity.h \
+	lib/conntrack-private.h \
+	lib/conntrack-tcp.c \
+	lib/conntrack-other.c \
+	lib/conntrack.c \
+	lib/conntrack.h \
 	lib/coverage.c \
 	lib/coverage.h \
 	lib/crc32c.c \
diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
new file mode 100644
index 0000000..65d02a9
--- /dev/null
+++ b/lib/conntrack-other.c
@@ -0,0 +1,91 @@ 
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "dp-packet.h"
+
+enum other_state {
+    OTHERS_FIRST,
+    OTHERS_MULTIPLE,
+    OTHERS_BIDIR,
+};
+
+struct conn_other {
+    struct conn up;
+    enum other_state state;
+};
+
+static const long long other_timeouts[] = {
+    [OTHERS_FIRST] = 60 * 1000,
+    [OTHERS_MULTIPLE] = 60 * 1000,
+    [OTHERS_BIDIR] = 30 * 1000,
+};
+
+static struct conn_other *
+conn_other_cast(const struct conn *conn)
+{
+    return CONTAINER_OF(conn, struct conn_other, up);
+}
+
+static void
+update_expiration(struct conn_other *conn, long long now)
+{
+    conn->up.expiration = now + other_timeouts[conn->state];
+}
+
+static enum ct_update_res
+other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
+                bool reply, long long now)
+{
+    struct conn_other *conn = conn_other_cast(conn_);
+
+    if (reply && conn->state != OTHERS_BIDIR) {
+        conn->state = OTHERS_BIDIR;
+    } else if (conn->state == OTHERS_FIRST) {
+        conn->state = OTHERS_MULTIPLE;
+    }
+
+    update_expiration(conn, now);
+
+    return CT_UPDATE_VALID;
+}
+
+static bool
+other_valid_new(struct dp_packet *pkt OVS_UNUSED)
+{
+    return true;
+}
+
+static struct conn *
+other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
+{
+    struct conn_other *conn;
+
+    conn = xzalloc(sizeof(struct conn_other));
+    conn->state = OTHERS_FIRST;
+
+    update_expiration(conn, now);
+
+    return &conn->up;
+}
+
+struct ct_l4_proto ct_proto_other = {
+    .new_conn = other_new_conn,
+    .valid_new = other_valid_new,
+    .conn_update = other_conn_update,
+};
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
new file mode 100644
index 0000000..e668c44
--- /dev/null
+++ b/lib/conntrack-private.h
@@ -0,0 +1,77 @@ 
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_PRIVATE_H
+#define CONNTRACK_PRIVATE_H 1
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "hmap.h"
+#include "openvswitch/types.h"
+#include "packets.h"
+#include "unaligned.h"
+
+struct ct_addr {
+    union {
+        ovs_16aligned_be32 ipv4;
+        union ovs_16aligned_in6_addr ipv6;
+        ovs_be32 ipv4_aligned;
+        struct in6_addr ipv6_aligned;
+    };
+};
+
+struct ct_endpoint {
+    struct ct_addr addr;
+    ovs_be16 port;
+};
+
+struct conn_key {
+    struct ct_endpoint src;
+    struct ct_endpoint dst;
+
+    ovs_be16 dl_type;
+    uint8_t nw_proto;
+    uint16_t zone;
+};
+
+struct conn {
+    struct conn_key key;
+    struct conn_key rev_key;
+    long long expiration;
+    struct hmap_node node;
+    uint32_t mark;
+    ovs_u128 label;
+};
+
+enum ct_update_res {
+    CT_UPDATE_INVALID,
+    CT_UPDATE_VALID,
+    CT_UPDATE_NEW,
+};
+
+struct ct_l4_proto {
+    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
+    bool (*valid_new)(struct dp_packet *pkt);
+    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
+                                      bool reply, long long now);
+};
+
+extern struct ct_l4_proto ct_proto_tcp;
+extern struct ct_l4_proto ct_proto_other;
+
+#endif /* conntrack-private.h */
diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
new file mode 100644
index 0000000..4d80038
--- /dev/null
+++ b/lib/conntrack-tcp.c
@@ -0,0 +1,476 @@ 
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    - Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    - Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "ct-dpif.h"
+#include "dp-packet.h"
+#include "util.h"
+
+/* Per-direction TCP tracking state (adapted from pf's peer state). */
+struct tcp_peer {
+    enum ct_dpif_tcp_state state;
+    uint32_t               seqlo;          /* Max sequence number sent     */
+    uint32_t               seqhi;          /* Max the other end ACKd + win */
+    uint16_t               max_win;        /* largest window (pre scaling) */
+    uint8_t                wscale;         /* window scaling factor        */
+};
+
+/* A TCP connection: the generic 'struct conn' plus the two peers' state.
+ * peer[0] is the original direction, peer[1] the reply direction (see
+ * tcp_conn_update()). */
+struct conn_tcp {
+    struct conn up;
+    struct tcp_peer peer[2];
+};
+
+/* TCP option kinds parsed below (values from RFC 793 / RFC 1323). */
+enum {
+    TCPOPT_EOL,             /* End of option list. */
+    TCPOPT_NOP,             /* No-operation (padding). */
+    TCPOPT_WINDOW = 3,      /* Window scale. */
+};
+
+/* TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic.  These macros can be
+ * used to compare such integers. */
+#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)
+#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)
+#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)
+#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)
+
+#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))
+#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))
+
+/* Returns the 'struct conn_tcp' that embeds 'conn' as its 'up' member
+ * (as built by tcp_new_conn()). */
+static struct conn_tcp*
+conn_tcp_cast(const struct conn* conn)
+{
+    return CONTAINER_OF(conn, struct conn_tcp, up);
+}
+
+/* Returns true if the combination of TCP flags in 'flags' can never occur
+ * in a legitimate packet.  pf performs this check in pf_normalize_tcp(),
+ * and only when scrubbing is enabled.  We're not scrubbing, but the check
+ * seems reasonable anyway. */
+static bool
+tcp_invalid_flags(uint16_t flags)
+{
+    if (flags & TCP_SYN) {
+        /* SYN may not be combined with RST or FIN.  (pf strips the FIN
+         * instead; we simply treat the packet as invalid.) */
+        if (flags & (TCP_RST | TCP_FIN)) {
+            return true;
+        }
+    } else if (!(flags & (TCP_ACK | TCP_RST))) {
+        /* Without SYN, at least one of ACK or RST must be present. */
+        return true;
+    }
+
+    /* FIN, PSH and URG are only valid when ACK is set. */
+    if (!(flags & TCP_ACK) && (flags & (TCP_FIN | TCP_PSH | TCP_URG))) {
+        return true;
+    }
+
+    return false;
+}
+
+#define TCP_MAX_WSCALE 14
+#define CT_WSCALE_FLAG 0x80
+#define CT_WSCALE_UNKNOWN 0x40
+#define CT_WSCALE_MASK 0xf
+
+/* Parses the options of 'tcp' looking for the window scale option.
+ * Returns the advertised shift (capped at TCP_MAX_WSCALE) or'ed with
+ * CT_WSCALE_FLAG if the option is present, 0 otherwise.  The caller must
+ * have verified that the whole TCP header, including options, is
+ * accessible. */
+static uint8_t
+tcp_get_wscale(const struct tcp_header *tcp)
+{
+    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
+    const uint8_t *opt = (const uint8_t *)(tcp + 1);
+    uint8_t wscale = 0;
+    uint8_t optlen;
+
+    while (len >= 3) {  /* 3 bytes: the size of the window scale option. */
+        if (*opt == TCPOPT_EOL) {
+            break;
+        }
+        switch (*opt) {
+        case TCPOPT_NOP:
+            opt++;
+            len--;
+            break;
+        case TCPOPT_WINDOW:
+            wscale = MIN(opt[2], TCP_MAX_WSCALE);
+            wscale |= CT_WSCALE_FLAG;
+            /* fall through */
+        default:
+            optlen = opt[1];
+            if (optlen < 2) {
+                optlen = 2;  /* Guard against malformed length fields. */
+            }
+            len -= optlen;
+            opt += optlen;
+        }
+    }
+
+    return wscale;
+}
+
+/* Returns the number of TCP payload bytes in 'pkt': the bytes between the
+ * end of the TCP header and the end of the packet, excluding L2 padding. */
+static uint32_t
+tcp_payload_length(struct dp_packet *pkt)
+{
+    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
+           - (char *) dp_packet_get_tcp_payload(pkt);
+}
+
+/* Sets the expiration time of 'conn' to 'now' + 'interval', both in
+ * milliseconds. */
+static void
+update_expiration(struct conn_tcp *conn, long long now, long long interval)
+{
+    conn->up.expiration = now + interval;
+}
+
+
+/* Updates the TCP state of 'conn_' with 'pkt', received in the reply
+ * direction if 'reply' is true.  Implements the sequence/window tracking
+ * algorithm from Guido van Rooij's paper (referenced below), as adapted
+ * from pf.  Returns CT_UPDATE_INVALID if the packet does not fit the
+ * connection, CT_UPDATE_NEW if a SYN restarts a closing connection, and
+ * CT_UPDATE_VALID otherwise (state and expiration are updated). */
+static enum ct_update_res
+tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
+                long long now)
+{
+    struct conn_tcp *conn = conn_tcp_cast(conn_);
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    /* The peer that sent 'pkt' */
+    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
+    /* The peer that should receive 'pkt' */
+    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
+    uint8_t sws = 0, dws = 0;
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    uint16_t win = ntohs(tcp->tcp_winsz);
+    uint32_t ack, end, seq, orig_seq;
+    uint32_t p_len = tcp_payload_length(pkt);
+    int ackskew;
+
+    if (tcp_invalid_flags(tcp_flags)) {
+        return CT_UPDATE_INVALID;
+    }
+
+    /* A plain SYN on a connection that is closing in both directions
+     * restarts the connection: report CT_UPDATE_NEW so the caller can
+     * replace the entry. */
+    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&
+            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&
+            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
+        return CT_UPDATE_NEW;
+    }
+
+    if (src->wscale & CT_WSCALE_FLAG
+        && dst->wscale & CT_WSCALE_FLAG
+        && !(tcp_flags & TCP_SYN)) {
+
+        sws = src->wscale & CT_WSCALE_MASK;
+        dws = dst->wscale & CT_WSCALE_MASK;
+
+    } else if (src->wscale & CT_WSCALE_UNKNOWN
+        && dst->wscale & CT_WSCALE_UNKNOWN
+        && !(tcp_flags & TCP_SYN)) {
+        /* Picked-up connection: the scale factors were never seen, so be
+         * maximally permissive. */
+        sws = TCP_MAX_WSCALE;
+        dws = TCP_MAX_WSCALE;
+    }
+
+    /*
+     * Sequence tracking algorithm from Guido van Rooij's paper:
+     *   http://www.madison-gurkha.com/publications/tcp_filtering/
+     *      tcp_filtering.ps
+     */
+
+    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
+        /* First packet from this end. Set its state */
+
+        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+
+        /* 'end': last sequence number consumed by this segment (SYN and
+         * FIN each occupy one sequence number). */
+        end = seq + p_len;
+        if (tcp_flags & TCP_SYN) {
+            end++;
+            if (dst->wscale & CT_WSCALE_FLAG) {
+                src->wscale = tcp_get_wscale(tcp);
+                if (src->wscale & CT_WSCALE_FLAG) {
+                    /* Remove scale factor from initial window */
+                    sws = src->wscale & CT_WSCALE_MASK;
+                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
+                    dws = dst->wscale & CT_WSCALE_MASK;
+                } else {
+                    /* fixup other window */
+                    dst->max_win <<= dst->wscale &
+                        CT_WSCALE_MASK;
+                    /* in case of a retrans SYN|ACK */
+                    dst->wscale = 0;
+                }
+            }
+        }
+        if (tcp_flags & TCP_FIN) {
+            end++;
+        }
+
+        src->seqlo = seq;
+        src->state = CT_DPIF_TCPS_SYN_SENT;
+        /*
+         * May need to slide the window (seqhi may have been set by
+         * the crappy stack check or if we picked up the connection
+         * after establishment)
+         */
+        if (src->seqhi == 1 ||
+                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
+            src->seqhi = end + MAX(1, dst->max_win << dws);
+        }
+        if (win > src->max_win) {
+            src->max_win = win;
+        }
+
+    } else {
+        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+        end = seq + p_len;
+        if (tcp_flags & TCP_SYN) {
+            end++;
+        }
+        if (tcp_flags & TCP_FIN) {
+            end++;
+        }
+    }
+
+    if ((tcp_flags & TCP_ACK) == 0) {
+        /* Let it pass through the ack skew check */
+        ack = dst->seqlo;
+    } else if ((ack == 0
+                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
+               /* broken tcp stacks do not set ack */) {
+        /* Many stacks (ours included) will set the ACK number in an
+         * FIN|ACK if the SYN times out -- no sequence to ACK. */
+        ack = dst->seqlo;
+    }
+
+    if (seq == end) {
+        /* Ease sequencing restrictions on no data packets */
+        seq = src->seqlo;
+        end = seq;
+    }
+
+    ackskew = dst->seqlo - ack;
+#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
+    if (SEQ_GEQ(src->seqhi, end)
+        /* Last octet inside other's window space */
+        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
+        /* Retrans: not more than one window back */
+        && (ackskew >= -MAXACKWINDOW)
+        /* Acking not more than one reassembled fragment backwards */
+        && (ackskew <= (MAXACKWINDOW << sws))
+        /* Acking not more than one window forward */
+        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
+            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
+        /* Require an exact/+1 sequence match on resets when possible */
+
+        /* update max window */
+        if (src->max_win < win) {
+            src->max_win = win;
+        }
+        /* synchronize sequencing */
+        if (SEQ_GT(end, src->seqlo)) {
+            src->seqlo = end;
+        }
+        /* slide the window of what the other end can send */
+        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+            dst->seqhi = ack + MAX((win << sws), 1);
+        }
+
+        /* update states */
+        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
+                src->state = CT_DPIF_TCPS_SYN_SENT;
+        }
+        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+                src->state = CT_DPIF_TCPS_CLOSING;
+        }
+        if (tcp_flags & TCP_ACK) {
+            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
+                dst->state = CT_DPIF_TCPS_ESTABLISHED;
+            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
+                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
+            }
+        }
+        if (tcp_flags & TCP_RST) {
+            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+        }
+
+        /* Pick the timeout (milliseconds) based on how far along the
+         * connection is: short for closing or half-open states, 24 hours
+         * for established connections. */
+        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
+            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+            update_expiration(conn, now, 30 * 1000);
+        } else if (src->state >= CT_DPIF_TCPS_CLOSING
+                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn, now, 45 * 1000);
+        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
+                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
+            update_expiration(conn, now, 30 * 1000);
+        } else if (src->state >= CT_DPIF_TCPS_CLOSING
+                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn, now, 15 * 60 * 1000);
+        } else {
+            update_expiration(conn, now, 24 * 60 * 60 * 1000);
+        }
+    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
+                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
+               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
+               /* Within a window forward of the originating packet */
+               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+               /* Within a window backward of the originating packet */
+
+        /*
+         * This currently handles three situations:
+         *  1) Stupid stacks will shotgun SYNs before their peer
+         *     replies.
+         *  2) When PF catches an already established stream (the
+         *     firewall rebooted, the state table was flushed, routes
+         *     changed...)
+         *  3) Packets get funky immediately after the connection
+         *     closes (this should catch Solaris spurious ACK|FINs
+         *     that web servers like to spew after a close)
+         *
+         * This must be a little more careful than the above code
+         * since packet floods will also be caught here. We don't
+         * update the TTL here to mitigate the damage of a packet
+         * flood and so the same code can handle awkward establishment
+         * and a loosened connection close.
+         * In the establishment case, a correct peer response will
+         * validate the connection, go through the normal state code
+         * and keep updating the state TTL.
+         */
+
+        /* update max window */
+        if (src->max_win < win) {
+            src->max_win = win;
+        }
+        /* synchronize sequencing */
+        if (SEQ_GT(end, src->seqlo)) {
+            src->seqlo = end;
+        }
+        /* slide the window of what the other end can send */
+        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+            dst->seqhi = ack + MAX((win << sws), 1);
+        }
+
+        /*
+         * Cannot set dst->seqhi here since this could be a shotgunned
+         * SYN and not an already established connection.
+         */
+
+        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+            src->state = CT_DPIF_TCPS_CLOSING;
+        }
+
+        if (tcp_flags & TCP_RST) {
+            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+        }
+    } else {
+        return CT_UPDATE_INVALID;
+    }
+
+    return CT_UPDATE_VALID;
+}
+
+/* Returns true if 'pkt' may create a new TCP connection.  A SYN+ACK is
+ * not allowed to create one: we accept brand new connections (SYN) or
+ * pickups of already established ones, but not partially open handshakes
+ * (SYN+ACK). */
+static bool
+tcp_valid_new(struct dp_packet *pkt)
+{
+    const struct tcp_header *tcp = dp_packet_l4(pkt);
+    uint16_t flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    return !tcp_invalid_flags(flags)
+           && (flags & (TCP_SYN | TCP_ACK)) != (TCP_SYN | TCP_ACK);
+}
+
+/* Allocates and initializes a TCP connection from 'pkt', treating the
+ * sender as peer[0].  'pkt' should already have been accepted by
+ * tcp_valid_new().  Returns the embedded generic connection; the caller
+ * owns it (freed via delete_conn()). */
+static struct conn *
+tcp_new_conn(struct dp_packet *pkt, long long now)
+{
+    struct conn_tcp* newconn = NULL;
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    struct tcp_peer *src, *dst;
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    newconn = xzalloc(sizeof(struct conn_tcp));
+
+    src = &newconn->peer[0];
+    dst = &newconn->peer[1];
+
+    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
+
+    if (tcp_flags & TCP_SYN) {
+        src->seqhi++;
+        src->wscale = tcp_get_wscale(tcp);
+    } else {
+        /* Connection picked up mid-stream: neither scale factor was
+         * observed. */
+        src->wscale = CT_WSCALE_UNKNOWN;
+        dst->wscale = CT_WSCALE_UNKNOWN;
+    }
+    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
+    if (src->wscale & CT_WSCALE_MASK) {
+        /* Remove scale factor from initial window.  (A scale of 0 needs
+         * no adjustment, so testing only the mask bits is enough.) */
+        uint8_t sws = src->wscale & CT_WSCALE_MASK;
+        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
+    }
+    if (tcp_flags & TCP_FIN) {
+        src->seqhi++;
+    }
+    /* Nothing is known about the reply direction yet: seqhi == 1 marks
+     * the window for later sliding (see tcp_conn_update()). */
+    dst->seqhi = 1;
+    dst->max_win = 1;
+    src->state = CT_DPIF_TCPS_SYN_SENT;
+    dst->state = CT_DPIF_TCPS_CLOSED;
+
+    update_expiration(newconn, now, 30 * 1000);
+
+    return &newconn->up;
+}
+
+/* TCP operations table used by the conntrack core (see l4_protos[] in
+ * lib/conntrack.c). */
+struct ct_l4_proto ct_proto_tcp = {
+    .new_conn = tcp_new_conn,
+    .valid_new = tcp_valid_new,
+    .conn_update = tcp_conn_update,
+};
diff --git a/lib/conntrack.c b/lib/conntrack.c
new file mode 100644
index 0000000..840335b
--- /dev/null
+++ b/lib/conntrack.c
@@ -0,0 +1,851 @@ 
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "conntrack.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include "bitmap.h"
+#include "conntrack-private.h"
+#include "dp-packet.h"
+#include "flow.h"
+#include "hmap.h"
+#include "netdev.h"
+#include "odp-netlink.h"
+#include "openvswitch/vlog.h"
+#include "ovs-rcu.h"
+#include "random.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(conntrack);
+
+/* Per-packet lookup state: the key extracted from the packet, its hash,
+ * and the result of the lookup. */
+struct conn_lookup_ctx {
+    struct conn_key key;
+    struct conn *conn;   /* Matched connection, or NULL. */
+    uint32_t hash;       /* Hash of 'key'. */
+    bool reply;          /* True if the packet matched 'conn->rev_key'. */
+    bool related;        /* True for ICMP errors related to a connection. */
+};
+
+static bool conn_key_extract(struct conntrack *, struct dp_packet *,
+                             struct conn_lookup_ctx *, uint16_t zone);
+static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
+static void conn_key_reverse(struct conn_key *);
+static void conn_key_lookup(struct conntrack *ct,
+                            struct conn_lookup_ctx *ctx,
+                            unsigned bucket,
+                            long long now);
+static bool valid_new(struct dp_packet *pkt, struct conn_key *);
+static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
+                             long long now);
+static void delete_conn(struct conn *);
+static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
+                                      bool reply, long long now);
+static bool conn_expired(struct conn *, long long now);
+static void set_mark(struct dp_packet *, struct conn *,
+                     uint32_t val, uint32_t mask);
+static void set_label(struct dp_packet *, struct conn *,
+                      const struct ovs_key_ct_labels *val,
+                      const struct ovs_key_ct_labels *mask);
+
+/* Maps an IP protocol number to its L4 handler.  UDP and ICMP share the
+ * generic 'other' implementation; unlisted entries are NULL (designated
+ * initializers zero the rest of the array). */
+static struct ct_l4_proto *l4_protos[] = {
+    [IPPROTO_TCP] = &ct_proto_tcp,
+    [IPPROTO_UDP] = &ct_proto_other,
+    [IPPROTO_ICMP] = &ct_proto_other,
+};
+
+/* Initializes the connection tracker 'ct'.  The caller is responsible
+ * for calling conntrack_destroy() when the instance is not needed
+ * anymore. */
+void
+conntrack_init(struct conntrack *ct)
+{
+    unsigned i;
+
+    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+        ct_lock_init(&ct->locks[i]);
+        /* Hold the bucket lock while touching its map, to keep the
+         * locking discipline uniform. */
+        ct_lock_lock(&ct->locks[i]);
+        hmap_init(&ct->connections[i]);
+        ct_lock_unlock(&ct->locks[i]);
+    }
+
+    ct->hash_basis = random_uint32();
+    ct->purge_bucket = 0;
+    ct->purge_inner_bucket = 0;
+    ct->purge_inner_offset = 0;
+}
+
+/* Destroys the connection tracker 'ct', freeing every tracked connection
+ * and all the memory it allocated. */
+void
+conntrack_destroy(struct conntrack *ct)
+{
+    unsigned i;
+
+    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+        struct hmap *connections = &ct->connections[i];
+        struct conn *conn, *next_conn;
+
+        ct_lock_lock(&ct->locks[i]);
+        /* Free the remaining connections before tearing down the map. */
+        HMAP_FOR_EACH_SAFE(conn, next_conn, node, connections) {
+            hmap_remove(connections, &conn->node);
+            delete_conn(conn);
+        }
+        hmap_destroy(connections);
+        ct_lock_unlock(&ct->locks[i]);
+        ct_lock_destroy(&ct->locks[i]);
+    }
+}
+
+/* Maps 'hash' to a bucket index.  (OVS style: the return type goes on its
+ * own line, consistently with every other definition in this file.) */
+static unsigned
+hash_to_bucket(uint32_t hash)
+{
+    /* Extracts the most significant bits in hash. The least significant bits
+     * are already used internally by the hmap implementation. */
+    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
+
+    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
+}
+
+/* Stores the connection tracking metadata in 'pkt'.  CS_TRACKED is always
+ * reported in addition to 'state', since the packet went through the
+ * tracker. */
+static void
+write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,
+            uint32_t mark, ovs_u128 label)
+{
+    pkt->md.ct_zone = zone;
+    pkt->md.ct_mark = mark;
+    pkt->md.ct_label = label;
+    pkt->md.ct_state = state | CS_TRACKED;
+}
+
+/* Handles a packet that matched no tracked connection.  If the packet is
+ * a valid start of a new connection, sets CS_NEW in '*state' and, when
+ * 'commit' is true, creates and stores a new entry; otherwise sets
+ * CS_INVALID.  Returns the new connection, or NULL if none was created.
+ * The caller must hold the lock of the bucket for 'ctx->hash'. */
+static struct conn *
+conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
+               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,
+               long long now)
+{
+    unsigned bucket = hash_to_bucket(ctx->hash);
+    struct conn *nc = NULL;
+
+    if (!valid_new(pkt, &ctx->key)) {
+        *state |= CS_INVALID;
+        return nc;
+    }
+
+    *state |= CS_NEW;
+
+    if (commit) {
+        nc = new_conn(pkt, &ctx->key, now);
+
+        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
+
+        conn_key_reverse(&nc->rev_key);
+        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);
+    }
+
+    return nc;
+}
+
+/* Processes one packet against the connection table: updates the matched
+ * connection (possibly replacing it, if the update reports that the
+ * connection restarted), or tries to create one via conn_not_found().
+ * Writes the resulting conntrack metadata into 'pkt' and returns the
+ * connection, or NULL if the packet is invalid or no entry was committed.
+ * The caller must hold the lock of the bucket for 'ctx->hash'. */
+static struct conn *
+process_one(struct conntrack *ct, struct dp_packet *pkt,
+            struct conn_lookup_ctx *ctx, uint16_t zone,
+            bool commit, long long now)
+{
+    unsigned bucket = hash_to_bucket(ctx->hash);
+    struct conn *conn = ctx->conn;
+    uint8_t state = 0;
+
+    if (conn) {
+        if (ctx->related) {
+            state |= CS_RELATED;
+            if (ctx->reply) {
+                state |= CS_REPLY_DIR;
+            }
+        } else {
+            enum ct_update_res res;
+
+            res = conn_update(conn, pkt, ctx->reply, now);
+
+            switch (res) {
+            case CT_UPDATE_VALID:
+                state |= CS_ESTABLISHED;
+                if (ctx->reply) {
+                    state |= CS_REPLY_DIR;
+                }
+                break;
+            case CT_UPDATE_INVALID:
+                state |= CS_INVALID;
+                break;
+            case CT_UPDATE_NEW:
+                /* The connection restarted: drop the old entry and try
+                 * to create a new one.  conn_not_found() may return NULL
+                 * (invalid packet, or 'commit' false), so 'conn' must be
+                 * re-checked before dereferencing it below. */
+                hmap_remove(&ct->connections[bucket], &conn->node);
+                delete_conn(conn);
+                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+                break;
+            }
+        }
+    } else {
+        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+    }
+
+    /* 'conn' may be NULL at this point (see above): use zeroed mark and
+     * label in that case instead of dereferencing a null pointer. */
+    if (conn) {
+        write_ct_md(pkt, state, zone, conn->mark, conn->label);
+    } else {
+        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});
+    }
+
+    return conn;
+}
+
+/* Sends a group of 'cnt' packets ('pkts') through the connection tracker
+ * 'ct'.  If 'commit' is true, the packets are allowed to create new entries
+ * in the connection tables.  'setmark', if not NULL, should point to a two
+ * elements array containing a value and a mask to set the connection mark.
+ * 'setlabel' behaves similarly for the connection label.  Always returns 0
+ * (the datapath ct() action has no failure path here). */
+int
+conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt,
+                  bool commit, uint16_t zone, const uint32_t *setmark,
+                  const struct ovs_key_ct_labels *setlabel,
+                  const char *helper)
+{
+#if !defined(__CHECKER__) && !defined(_WIN32)
+    const size_t KEY_ARRAY_SIZE = cnt;
+#else
+    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
+#endif
+    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
+    int8_t bucket_list[CONNTRACK_BUCKETS];
+    struct {
+        unsigned bucket;
+        unsigned long maps;   /* Bitmap of the packets in this bucket. */
+    } arr[KEY_ARRAY_SIZE];
+    long long now = time_msec();
+    size_t i = 0;
+    uint8_t arrcnt = 0;
+
+    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
+
+    if (helper) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
+        /* Continue without the helper */
+    }
+
+    /* Group the packets by hash bucket, so that each bucket lock is taken
+     * only once per batch.  'bucket_list' maps a bucket to its index in
+     * 'arr', or -1 if the bucket is not used by this batch. */
+    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
+    for (i = 0; i < cnt; i++) {
+        unsigned bucket;
+
+        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
+            write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}});
+            continue;
+        }
+
+        bucket = hash_to_bucket(ctxs[i].hash);
+        if (bucket_list[bucket] == INT8_C(-1)) {
+            bucket_list[bucket] = arrcnt;
+
+            arr[arrcnt].maps = 0;
+            ULLONG_SET1(arr[arrcnt].maps, i);
+            arr[arrcnt++].bucket = bucket;
+        } else {
+            /* ULLONG_SET1() sets bit 'i' by itself; no manual or-ing of
+             * '1UL << i' is needed on top of it. */
+            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
+        }
+    }
+
+    for (i = 0; i < arrcnt; i++) {
+        size_t j;
+
+        ct_lock_lock(&ct->locks[arr[i].bucket]);
+
+        ULLONG_FOR_EACH_1(j, arr[i].maps) {
+            struct conn *conn;
+
+            conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now);
+
+            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
+
+            if (conn && setmark) {
+                set_mark(pkts[j], conn, setmark[0], setmark[1]);
+            }
+
+            if (conn && setlabel) {
+                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
+            }
+        }
+        ct_lock_unlock(&ct->locks[arr[i].bucket]);
+    }
+
+    return 0;
+}
+
+/* Applies 'val'/'mask' to the connection mark: bits covered by 'mask' are
+ * replaced by 'val', the rest are preserved.  The result is stored both
+ * in the packet metadata and in the connection. */
+static void
+set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
+{
+    uint32_t new_mark = (pkt->md.ct_mark & ~mask) | val;
+
+    pkt->md.ct_mark = new_mark;
+    conn->mark = new_mark;
+}
+
+/* Applies 'val'/'mask' to the 128-bit connection label: bits covered by
+ * 'mask' are replaced by 'val', the rest are preserved.  The result is
+ * stored both in the packet metadata and in the connection. */
+static void
+set_label(struct dp_packet *pkt, struct conn *conn,
+          const struct ovs_key_ct_labels *val,
+          const struct ovs_key_ct_labels *mask)
+{
+    ovs_u128 v, m;
+
+    /* Copy through memcpy rather than casting: the incoming pointers may
+     * not be aligned for a direct 64-bit load. */
+    memcpy(&v, val, sizeof v);
+    memcpy(&m, mask, sizeof m);
+
+    pkt->md.ct_label.u64.lo = (pkt->md.ct_label.u64.lo & ~m.u64.lo)
+                              | v.u64.lo;
+    pkt->md.ct_label.u64.hi = (pkt->md.ct_label.u64.hi & ~m.u64.hi)
+                              | v.u64.hi;
+    conn->label = pkt->md.ct_label;
+}
+
+#define CONNTRACK_PURGE_NUM 256
+
+/* Removes expired connections from 'bucket', resuming the scan from the
+ * position '*inner_bucket'/'*inner_offset' and updating it in place so a
+ * later call can continue where this one stopped.  At most '*left'
+ * connections are removed; '*left' is decremented per removal (live
+ * entries are visited without consuming the budget). */
+static void
+sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,
+             uint32_t *inner_offset, unsigned *left, long long now)
+{
+    while (*left != 0) {
+        struct hmap_node *node;
+        struct conn *conn;
+
+        node = hmap_at_position(bucket, inner_bucket, inner_offset);
+
+        if (!node) {
+            /* Bucket exhausted.  NOTE(review): this relies on
+             * hmap_at_position() resetting the position when it returns
+             * NULL -- confirm against lib/hmap.c. */
+            hmap_shrink(bucket);
+            break;
+        }
+
+        INIT_CONTAINER(conn, node, node);
+        if (conn_expired(conn, now)) {
+            hmap_remove(bucket, &conn->node);
+            delete_conn(conn);
+            (*left)--;
+        }
+    }
+}
+
+/* Cleans up old connection entries.  Should be called periodically.  Each
+ * call removes at most CONNTRACK_PURGE_NUM connections, then saves its
+ * position so the next call resumes from there. */
+void
+conntrack_run(struct conntrack *ct)
+{
+    /* 'purge_bucket' is the bucket index saved by the previous run (see
+     * the bottom of this function), not a hash: wrap it with a plain
+     * modulo.  Feeding it through hash_to_bucket() would shift away all
+     * of its bits and restart the sweep from bucket 0 on every call. */
+    unsigned bucket = ct->purge_bucket % CONNTRACK_BUCKETS;
+    uint32_t inner_bucket = ct->purge_inner_bucket,
+             inner_offset = ct->purge_inner_offset;
+    unsigned left = CONNTRACK_PURGE_NUM;
+    long long now = time_msec();
+
+    while (bucket < CONNTRACK_BUCKETS) {
+        ct_lock_lock(&ct->locks[bucket]);
+        sweep_bucket(&ct->connections[bucket],
+                     &inner_bucket, &inner_offset,
+                     &left, now);
+        ct_lock_unlock(&ct->locks[bucket]);
+
+        if (left == 0) {
+            /* Purge budget exhausted: resume from this bucket and inner
+             * position next time. */
+            break;
+        } else {
+            bucket++;
+        }
+    }
+
+    ct->purge_bucket = bucket;
+    ct->purge_inner_bucket = inner_bucket;
+    ct->purge_inner_offset = inner_offset;
+}
+
+/* Key extraction */
+
+/* Extracts the IPv4 addresses and protocol of the header at 'data' into
+ * 'key'.  Returns false for fragments or malformed headers.
+ *
+ * The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
+ * not interested in the header's tail,  meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks. */
+static inline bool
+extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
+                const char **new_data)
+{
+    const struct ip_header *ip = data;
+
+    if (new_data) {
+        size_t ip_len;
+
+        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+            return false;
+        }
+        /* The header length field must cover at least the minimal header
+         * and must not run past the available data. */
+        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
+
+        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
+            return false;
+        }
+        if (OVS_UNLIKELY(size < ip_len)) {
+            return false;
+        }
+
+        *new_data = (char *) data + ip_len;
+    }
+
+    /* Fragments cannot be tracked: reject them. */
+    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
+        return false;
+    }
+
+    key->src.addr.ipv4 = ip->ip_src;
+    key->dst.addr.ipv4 = ip->ip_dst;
+    key->nw_proto = ip->ip_proto;
+
+    return true;
+}
+
+/* Extracts the IPv6 addresses and last next-header protocol of the header
+ * at 'data' into 'key', skipping extension headers.  Returns false for
+ * fragments or malformed headers.
+ *
+ * The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
+ * not interested in the header's tail,  meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks. */
+static inline bool
+extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
+                const char **new_data)
+{
+    const struct ovs_16aligned_ip6_hdr *ip6 = data;
+    uint8_t nw_proto = ip6->ip6_nxt;
+    uint8_t nw_frag = 0;
+
+    if (new_data) {
+        if (OVS_UNLIKELY(size < sizeof *ip6)) {
+            return false;
+        }
+    }
+
+    data = ip6 + 1;
+    size -=  sizeof *ip6;
+
+    /* Walks past any extension headers, updating 'nw_proto' to the final
+     * protocol and flagging fragments in 'nw_frag'. */
+    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
+        return false;
+    }
+
+    if (new_data) {
+        *new_data = data;
+    }
+
+    /* Fragments cannot be tracked: reject them. */
+    if (nw_frag) {
+        return false;
+    }
+
+    key->src.addr.ipv6 = ip6->ip6_src;
+    key->dst.addr.ipv6 = ip6->ip6_dst;
+    key->nw_proto = nw_proto;
+
+    return true;
+}
+
+/* Returns true if the TCP header at 'data' is well formed: its data
+ * offset must cover at least the minimal header and must not run past the
+ * 'size' available bytes. */
+static inline bool
+check_l4_tcp(const void *data, size_t size)
+{
+    const struct tcp_header *tcp = data;
+    size_t hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+    return hdr_len >= TCP_HEADER_LEN && hdr_len <= size;
+}
+
+/* Extracts the TCP ports at 'data' into 'key'.  Returns false if fewer
+ * than TCP_HEADER_LEN bytes are available. */
+static inline bool
+extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
+{
+    const struct tcp_header *tcp = data;
+
+    if (size < TCP_HEADER_LEN) {
+        return false;
+    }
+
+    key->src.port = tcp->tcp_src;
+    key->dst.port = tcp->tcp_dst;
+    return true;
+}
+
+/* Extracts the UDP ports at 'data' into 'key'.  Returns false if fewer
+ * than UDP_HEADER_LEN bytes are available. */
+static inline bool
+extract_l4_udp(struct conn_key *key, const void *data, size_t size)
+{
+    const struct udp_header *udp = data;
+
+    if (size < UDP_HEADER_LEN) {
+        return false;
+    }
+
+    key->src.port = udp->udp_src;
+    key->dst.port = udp->udp_dst;
+    return true;
+}
+
+static inline bool extract_l4(struct conn_key *key, const void *data,
+                              size_t size, bool *related);
+
+/* Extracts an L4 key from the ICMP header at 'data' into 'key'.  Returns
+ * true on success.  (Return type is bool, not int, for consistency with
+ * extract_l4_icmp6() and the boolean values actually returned.)
+ *
+ * If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true.  If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
+                bool *related)
+{
+    const struct icmp_header *icmp = data;
+
+    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
+        return false;
+    }
+
+    switch (icmp->icmp_type) {
+    case ICMP4_ECHO_REQUEST:
+    case ICMP4_ECHO_REPLY:
+    case ICMP4_TIMESTAMP:
+    case ICMP4_TIMESTAMPREPLY:
+    case ICMP4_INFOREQUEST:
+    case ICMP4_INFOREPLY:
+        /* Separate ICMP connection: identified using id */
+        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
+        break;
+    case ICMP4_DST_UNREACH:
+    case ICMP4_TIME_EXCEEDED:
+    case ICMP4_PARAM_PROB:
+    case ICMP4_SOURCEQUENCH:
+    case ICMP4_REDIRECT: {
+        /* ICMP packet part of another connection. We should
+         * extract the key from embedded packet header */
+        struct conn_key inner_key;
+        const char *l3 = (const char *) (icmp + 1);
+        const char *tail = (const char *) data + size;
+        const char *l4;
+        bool ok;
+
+        if (!related) {
+            /* Already inside a nested header: no recursion allowed. */
+            return false;
+        }
+        *related = true;
+
+        memset(&inner_key, 0, sizeof inner_key);
+        inner_key.dl_type = htons(ETH_TYPE_IP);
+        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);
+        if (!ok) {
+            return false;
+        }
+
+        /* pf doesn't do this, but it seems a good idea */
+        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
+            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
+            return false;
+        }
+
+        key->src = inner_key.src;
+        key->dst = inner_key.dst;
+        key->nw_proto = inner_key.nw_proto;
+
+        /* Reverse the key so the error is attributed to the original
+         * direction of the embedded connection. */
+        ok = extract_l4(key, l4, tail - l4, NULL);
+        if (ok) {
+            conn_key_reverse(key);
+        }
+        return ok;
+    }
+    default:
+        return false;
+    }
+
+    return true;
+}
+
+/* If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true.  If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
+                 bool *related)
+{
+    const struct icmp6_header *icmp6 = data;
+
+    /* All the messages that we support need at least 4 bytes after
+     * the header */
+    if (size < sizeof *icmp6 + 4) {
+        return false;
+    }
+
+    switch (icmp6->icmp6_type) {
+    case ICMP6_ECHO_REQUEST:
+    case ICMP6_ECHO_REPLY:
+        /* Separate ICMP connection: identified using id, which is the
+         * first 16-bit field following the header. */
+        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
+        break;
+    case ICMP6_DST_UNREACH:
+    case ICMP6_PACKET_TOO_BIG:
+    case ICMP6_TIME_EXCEEDED:
+    case ICMP6_PARAM_PROB:{
+        /* ICMP packet part of another connection. We should
+         * extract the key from embedded packet header.  The embedded
+         * packet starts at offset 8: past the header and the 4 bytes of
+         * type-specific data checked for above. */
+        struct conn_key inner_key;
+        const char *l3 = (const char *) icmp6 + 8;
+        const char *tail = (const char *) data + size;
+        const char *l4 = NULL;
+        bool ok;
+
+        if (!related) {
+            return false;
+        }
+        *related = true;
+
+        memset(&inner_key, 0, sizeof inner_key);
+        inner_key.dl_type = htons(ETH_TYPE_IPV6);
+        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
+        if (!ok) {
+            return false;
+        }
+
+        /* pf doesn't do this, but it seems a good idea: the addresses
+         * of the embedded packet must mirror those of the outer one. */
+        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
+                              &key->dst.addr.ipv6_aligned)
+            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
+                                 &key->src.addr.ipv6_aligned)) {
+            return false;
+        }
+
+        key->src = inner_key.src;
+        key->dst = inner_key.dst;
+        key->nw_proto = inner_key.nw_proto;
+
+        ok = extract_l4(key, l4, tail - l4, NULL);
+        if (ok) {
+            /* The embedded packet travels in the opposite direction of
+             * the outer one (see the address check above). */
+            conn_key_reverse(key);
+        }
+        return ok;
+    }
+    default:
+        return false;
+    }
+
+    return true;
+}
+
+/* Extract l4 fields into 'key', which must already contain valid l3
+ * members.  If 'related' is not NULL and an ICMP error packet is being
+ * processed, the function will extract the key from the packet nested
+ * in the ICMP payload and set '*related' to true.  If 'related' is NULL,
+ * nested parsing isn't allowed.  This is necessary to limit the
+ * recursion level. */
+static inline bool
+extract_l4(struct conn_key *key, const void *data, size_t size, bool *related)
+{
+    switch (key->nw_proto) {
+    case IPPROTO_TCP:
+        /* The stricter TCP header check is only performed on the outer
+         * packet, i.e. when nested parsing is still allowed. */
+        return extract_l4_tcp(key, data, size)
+               && (!related || check_l4_tcp(data, size));
+    case IPPROTO_UDP:
+        return extract_l4_udp(key, data, size);
+    case IPPROTO_ICMP:
+        return key->dl_type == htons(ETH_TYPE_IP)
+               && extract_l4_icmp(key, data, size, related);
+    case IPPROTO_ICMPV6:
+        return key->dl_type == htons(ETH_TYPE_IPV6)
+               && extract_l4_icmp6(key, data, size, related);
+    default:
+        return false;
+    }
+}
+
+/* Fills 'ctx' with the connection key (zone, l3 addresses, l4 ports)
+ * extracted from 'pkt', its hash, and whether the packet is an ICMP
+ * error related to another connection.  Returns false if 'pkt' cannot
+ * be tracked (missing or unsupported headers). */
+static bool
+conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
+                 struct conn_lookup_ctx *ctx, uint16_t zone)
+{
+    const struct eth_header *l2 = dp_packet_l2(pkt);
+    const struct ip_header *l3 = dp_packet_l3(pkt);
+    const char *l4 = dp_packet_l4(pkt);
+    const char *tail = dp_packet_tail(pkt);
+    bool ok;
+
+    memset(ctx, 0, sizeof *ctx);
+
+    /* All three header pointers must have been set by an earlier parse
+     * of the packet for it to be trackable. */
+    if (!l2 || !l3 || !l4) {
+        return false;
+    }
+
+    ctx->key.zone = zone;
+
+    /* XXX In this function we parse the packet (again, it has already
+     * gone through miniflow_extract()) for two reasons:
+     *
+     * 1) To extract the l3 addresses and l4 ports.
+     *    We already have the l3 and l4 headers' pointers.  Extracting
+     *    the l3 addresses and the l4 ports is really cheap, since they
+     *    can be found at fixed locations.
+     * 2) To extract the l3 and l4 types.
+     *    Extracting the l3 and l4 types (especially the l3[1]) on the
+     *    other hand is quite expensive, because they're not at a
+     *    fixed location.
+     *
+     * Here's a way to avoid (2) with the help of the datapath.
+     * The datapath doesn't keep the packet's extracted flow[2], so
+     * using that is not an option.  We could use the packet's matching
+     * megaflow, but we have to make sure that the l3 and l4 types
+     * are unwildcarded.  This means either:
+     *
+     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow
+     *    is installed if the actions contains ct().  This is what the
+     *    kernel datapath does.  It is not so straightforward, though.
+     *
+     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when
+     *    translating a ct() action.  This is already done in different
+     *    actions and since both the userspace and the kernel datapath
+     *    would benefit from it, it seems an appropriate place to do
+     *    it.
+     *
+     * ---
+     * [1] A simple benchmark (running only the connection tracker
+     *     over and over on the same packets) shows that if the
+     *     l3 type is already provided we are 15% faster (running the
+     *     connection tracker over a couple of DPDK devices with a
+     *     stream of UDP 64-bytes packets shows that we are 4% faster).
+     *
+     * [2] The reasons for this are that keeping the flow increases
+     *     (slightly) the cache footprint and increases computation
+     *     time as we move the packet around. Most importantly, the flow
+     *     should be updated by the actions and this can be slow, as
+     *     we use a sparse representation (miniflow).
+     *
+     */
+    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
+    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
+        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL);
+    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
+    } else {
+        ok = false;
+    }
+
+    if (ok) {
+        /* Passing '&ctx->related' allows one level of nested parsing,
+         * for ICMP error payloads. */
+        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) {
+            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* Symmetric: returns the same value if 'key->src' and 'key->dst' are
+ * swapped, so that both directions of a connection hash alike. */
+static uint32_t
+conn_key_hash(const struct conn_key *key, uint32_t basis)
+{
+    uint32_t hsrc, hdst, hash;
+    int i;
+
+    hsrc = hdst = basis;
+
+    /* Hash the source and destination endpoints independently, then
+     * combine them with XOR, which is commutative: this is what makes
+     * the hash direction-independent. */
+    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
+        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
+        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
+    }
+
+    hash = hsrc ^ hdst;
+
+    /* Hash the remaining members of the key, laid out after 'dst' in
+     * struct conn_key, in a direction-independent position. */
+    hash = hash_words((uint32_t *) &key->dst + 1,
+                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
+                      hash);
+
+    return hash;
+}
+
+/* Swaps the source and destination endpoints of 'key', turning the key
+ * for one direction of a connection into the key for the other. */
+static void
+conn_key_reverse(struct conn_key *key)
+{
+    struct ct_endpoint orig_src = key->src;
+
+    key->src = key->dst;
+    key->dst = orig_src;
+}
+
+/* Looks up 'ctx->key' in bucket 'bucket' of 'ct'.  If a matching
+ * connection is found and has not expired by 'now', it is stored in
+ * 'ctx->conn' and 'ctx->reply' records whether the key matched the
+ * reply direction; otherwise 'ctx->conn' is set to NULL. */
+static void
+conn_key_lookup(struct conntrack *ct,
+                struct conn_lookup_ctx *ctx,
+                unsigned bucket,
+                long long now)
+{
+    struct conn *conn;
+
+    ctx->conn = NULL;
+
+    HMAP_FOR_EACH_WITH_HASH (conn, node, ctx->hash,
+                             &ct->connections[bucket]) {
+        bool fwd = !memcmp(&conn->key, &ctx->key, sizeof conn->key);
+        bool rev = !fwd
+                   && !memcmp(&conn->rev_key, &ctx->key,
+                              sizeof conn->rev_key);
+
+        if (fwd || rev) {
+            /* An expired connection is treated as a miss. */
+            if (!conn_expired(conn, now)) {
+                ctx->conn = conn;
+                ctx->reply = rev;
+            }
+            break;
+        }
+    }
+}
+
+/* Lets the l4 protocol module of 'conn' process 'pkt' and advance the
+ * connection state.  'reply' tells whether the packet travels in the
+ * reply direction. */
+static enum ct_update_res
+conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
+            long long now)
+{
+    uint8_t nw_proto = conn->key.nw_proto;
+
+    return l4_protos[nw_proto]->conn_update(conn, pkt, reply, now);
+}
+
+/* Returns true if 'conn' should be considered dead at time 'now'. */
+static bool
+conn_expired(struct conn *conn, long long now)
+{
+    return conn->expiration <= now;
+}
+
+/* Asks the l4 protocol module of 'key' whether 'pkt' may legitimately
+ * start a new connection. */
+static bool
+valid_new(struct dp_packet *pkt, struct conn_key *key)
+{
+    uint8_t nw_proto = key->nw_proto;
+
+    return l4_protos[nw_proto]->valid_new(pkt);
+}
+
+/* Asks the l4 protocol module to allocate and initialize a connection
+ * for 'pkt'.  On success the connection is stamped with '*key'; returns
+ * NULL if the module refuses the packet. */
+static struct conn *
+new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
+{
+    struct conn *conn = l4_protos[key->nw_proto]->new_conn(pkt, now);
+
+    if (conn) {
+        conn->key = *key;
+    }
+    return conn;
+}
+
+/* Releases the storage of 'conn'.  Counterpart of new_conn(). */
+static void
+delete_conn(struct conn *conn)
+{
+    free(conn);
+}
diff --git a/lib/conntrack.h b/lib/conntrack.h
new file mode 100644
index 0000000..8561273
--- /dev/null
+++ b/lib/conntrack.h
@@ -0,0 +1,144 @@ 
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_H
+#define CONNTRACK_H 1
+
+#include <stdbool.h>
+
+#include "hmap.h"
+#include "netdev-dpdk.h"
+#include "odp-netlink.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/types.h"
+
+
+struct dp_packet;
+
+/* Userspace connection tracker
+ * ============================
+ *
+ * This is a connection tracking module that keeps all the state in userspace.
+ *
+ * Usage
+ * =====
+ *
+ *     struct conntrack ct;
+ *
+ * Initialization:
+ *
+ *     conntrack_init(&ct);
+ *
+ * It is necessary to periodically issue a call to
+ *
+ *     conntrack_run(&ct);
+ *
+ * to allow the module to clean up expired connections.
+ *
+ * To send a group of packets through the connection tracker:
+ *
+ *     conntrack_execute(&ct, pkts, n_pkts, ...);
+ *
+ * Thread-safety
+ * =============
+ *
+ * conntrack_execute() can be called by multiple threads simultaneously.
+ */
+
+struct conntrack;
+
+void conntrack_init(struct conntrack *);
+void conntrack_run(struct conntrack *);
+void conntrack_destroy(struct conntrack *);
+
+int conntrack_execute(struct conntrack *, struct dp_packet **, size_t,
+                      bool commit, uint16_t zone, const uint32_t *setmark,
+                      const struct ovs_key_ct_labels *setlabel,
+                      const char *helper);
+
+/* struct ct_lock is a standard mutex or a spinlock when using DPDK.
+ *
+ * The lock/unlock wrappers carry the Clang thread safety annotations
+ * for 'struct ct_lock', but the actual operation happens on the nested
+ * member, which the analysis cannot see through, so the analysis is
+ * suppressed inside them. */
+
+#ifdef DPDK_NETDEV
+struct OVS_LOCKABLE ct_lock {
+    rte_spinlock_t lock;
+};
+
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+    rte_spinlock_init(&lock->lock);
+}
+
+static inline void ct_lock_lock(struct ct_lock *lock)
+    OVS_ACQUIRES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    rte_spinlock_lock(&lock->lock);
+}
+
+static inline void ct_lock_unlock(struct ct_lock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    rte_spinlock_unlock(&lock->lock);
+}
+
+/* A DPDK spinlock needs no teardown. */
+static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED)
+{
+}
+#else
+struct OVS_LOCKABLE ct_lock {
+    struct ovs_mutex lock;
+};
+
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+    ovs_mutex_init(&lock->lock);
+}
+
+static inline void ct_lock_lock(struct ct_lock *lock)
+    OVS_ACQUIRES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_lock(&lock->lock);
+}
+
+static inline void ct_lock_unlock(struct ct_lock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_unlock(&lock->lock);
+}
+
+static inline void ct_lock_destroy(struct ct_lock *lock)
+{
+    ovs_mutex_destroy(&lock->lock);
+}
+#endif
+
+#define CONNTRACK_BUCKETS_SHIFT 8
+#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
+
+struct conntrack {
+    /* Each lock guards a 'connections' bucket, so that threads working
+     * on different buckets do not contend. */
+    struct ct_lock locks[CONNTRACK_BUCKETS];
+    /* Connections, partitioned over CONNTRACK_BUCKETS hmaps;
+     * connections[i] is guarded by locks[i]. */
+    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;
+    /* Salt for conn_key_hash(). */
+    uint32_t hash_basis;
+    /* Cursor state for sweeping out expired connections.
+     * NOTE(review): presumably advanced by conntrack_run(); the sweep
+     * code is elsewhere in the file — confirm there. */
+    unsigned purge_bucket;
+    uint32_t purge_inner_bucket;
+    uint32_t purge_inner_offset;
+};
+
+#endif /* conntrack.h */