diff mbox

[ovs-dev,patch_v2,1/4] Userspace Datapath: Add ALG infra and FTP.

Message ID 1497740019-4168-2-git-send-email-dlu998@gmail.com
State Superseded
Headers show

Commit Message

Darrell Ball June 17, 2017, 10:53 p.m. UTC
ALG infra and FTP (both V4 and V6) support is added to the userspace
datapath.  Also, NAT support is included.

Signed-off-by: Darrell Ball <dlu998@gmail.com>
---
 lib/conntrack-private.h |   17 +
 lib/conntrack.c         | 1016 +++++++++++++++++++++++++++++++++++++++++++----
 lib/conntrack.h         |    4 +-
 3 files changed, 957 insertions(+), 80 deletions(-)

Comments

Joe Stringer June 23, 2017, 11:08 p.m. UTC | #1
On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
> ALG infra and FTP (both V4 and V6) support is added to the userspace
> datapath.  Also, NAT support is included.
>
> Signed-off-by: Darrell Ball <dlu998@gmail.com>
> ---

Hi Darrell, thanks for the patch.

I wasn't able to test this out, because my build and test environments
depend on sparse correctly running and it wouldn't compile. Please
consider setting it up, or using Travis-CI to validate builds before
v3.

I didn't do a thorough review, but I scanned through and pointed out a
couple of parts I have questions about.

>  lib/conntrack-private.h |   17 +
>  lib/conntrack.c         | 1016 +++++++++++++++++++++++++++++++++++++++++++----
>  lib/conntrack.h         |    4 +-
>  3 files changed, 957 insertions(+), 80 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index 55084d3..993af33 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -62,17 +62,34 @@ struct nat_conn_key_node {
>      struct conn_key value;
>  };
>
> +struct alg_exp_node {
> +    struct hmap_node node;
> +    struct ovs_list exp_node;
> +    long long expiration;
> +    struct conn_key key;
> +    struct conn_key master_key;
> +    struct ct_addr alg_nat_repl_addr;
> +    ovs_u128 master_label;
> +    uint32_t master_mark;
> +};
> +
>  struct conn {
>      struct conn_key key;
>      struct conn_key rev_key;
> +    /* Only used for orig_tuple support. */
> +    struct conn_key master_key;

I guess no-one's looked at this structure from a cacheline
perspective, but if we think it's worth optimizing then perhaps a
followup patch should rearrange these. You might consider starting
here by just placing all of the ALG-related fields together at the
end.

>      long long expiration;
>      struct ovs_list exp_node;
>      struct hmap_node node;
>      ovs_u128 label;
>      /* XXX: consider flattening. */
>      struct nat_action_info_t *nat_info;
> +    char *alg;
> +    int seq_skew;
>      uint32_t mark;
>      uint8_t conn_type;
> +    uint8_t seq_skew_dir;
> +    uint8_t alg_related;
>  };
>
>  enum ct_update_res {
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 90b154a..1f54fe3 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2015, 2016 Nicira, Inc.
> + * Copyright (c) 2015, 2016, 2017 Nicira, Inc.

All three files could get an update like this.

>   *
>   * Licensed under the Apache License, Version 2.0 (the "License");
>   * you may not use this file except in compliance with the License.
> @@ -21,6 +21,7 @@
>  #include <sys/types.h>
>  #include <netinet/in.h>
>  #include <netinet/icmp6.h>
> +#include <ctype.h>

Alphabetic order in each section of #includes?

>
>  #include "bitmap.h"
>  #include "conntrack-private.h"
> @@ -39,7 +40,6 @@
>  #include "random.h"
>  #include "timeval.h"
>
> -
>  VLOG_DEFINE_THIS_MODULE(conntrack);
>
>  COVERAGE_DEFINE(conntrack_full);
> @@ -50,9 +50,21 @@ struct conn_lookup_ctx {
>      struct conn *conn;
>      uint32_t hash;
>      bool reply;
> +    /* XXX: Only used by ICMP; change name. */

Please consider sending a separate patch to change the name.

<snip>

> @@ -179,14 +241,20 @@ conntrack_destroy(struct conntrack *ct)
>          ct_lock_unlock(&ctb->lock);
>          ct_lock_destroy(&ctb->lock);
>      }
> -    ct_rwlock_wrlock(&ct->nat_resources_lock);
> +    ct_rwlock_wrlock(&ct->resources_lock);
>      struct nat_conn_key_node *nat_conn_key_node;
>      HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
>          free(nat_conn_key_node);
>      }
>      hmap_destroy(&ct->nat_conn_keys);
> -    ct_rwlock_unlock(&ct->nat_resources_lock);
> -    ct_rwlock_destroy(&ct->nat_resources_lock);
> +
> +    struct alg_exp_node *alg_exp_node;
> +    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
> +        free(alg_exp_node);
> +    }
> +    hmap_destroy(&ct->alg_expectations);

You might also consider poisoning ct->alg_exp_list.

<snip>

> @@ -363,8 +469,8 @@ reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
>          struct ip_header *nh = dp_packet_l3(pkt);
>          struct icmp_header *icmp = dp_packet_l4(pkt);
>          struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
> -        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
> -                        -pad, &inner_l4, false);
> +        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) -pad,
> +                        &inner_l4, false);

I guess this is just some automatic argument formatting that you
performed incidentally, but if it's getting changed, then usually we
put a space around operators such as '-'.

<snip>

> @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>          nc->rev_key = nc->key;
>          conn_key_reverse(&nc->rev_key);
>
> +        if (helper) {
> +            nc->alg = xstrdup(helper);
> +        }
> +
> +        if (alg_exp) {
> +            nc->alg_related = true;
> +            nc->mark = alg_exp->master_mark;
> +            nc->label = alg_exp->master_label;
> +            nc->master_key = alg_exp->master_key;
> +        }
> +
>          if (nat_action_info) {
>              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
> -            ct_rwlock_wrlock(&ct->nat_resources_lock);
> -
> -            bool nat_res = nat_select_range_tuple(ct, nc,
> -                                                  conn_for_un_nat_copy);
>
> -            if (!nat_res) {
> -                free(nc->nat_info);
> -                nc->nat_info = NULL;
> -                free (nc);
> -                ct_rwlock_unlock(&ct->nat_resources_lock);
> -                return NULL;
> -            }
> +            if (alg_exp) {
> +                nc->rev_key.src.addr = alg_nat_repl_addr;
> +                nc->nat_info->nat_action = NAT_ACTION_DST;
> +                *conn_for_un_nat_copy = *nc;
> +            } else {
> +                ct_rwlock_wrlock(&ct->resources_lock);
> +                bool nat_res = nat_select_range_tuple(
> +                                   ct, nc, conn_for_un_nat_copy);
> +
> +                if (!nat_res) {
> +                    free(nc->nat_info);
> +                    nc->nat_info = NULL;
> +                    free (nc);

I think that nc->alg may be leaked here? any reason it doesn't use
delete_conn()?

> +                    ct_rwlock_unlock(&ct->resources_lock);
> +                    return NULL;
> +                }
>
> -            if (conn_for_un_nat_copy &&
> -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>                  *nc = *conn_for_un_nat_copy;

Perhaps nc->alg and/or nc->nat_info may be leaked here?

<snip>

> @@ -923,6 +1126,28 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
>          }
>      }
>
> +#define MAX_ALG_EXP_TO_EXPIRE 1000
> +    size_t alg_exp_count = hmap_count(&ct->alg_expectations);
> +    /* XXX: revisit this. */
> +    size_t max_to_expire =
> +        MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);

I would have thought MAX_ALG_EXP_TO_EXPIRE would be the upper bound,
not the lower bound.

<snip>

> @@ -1819,6 +2045,88 @@ nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
>      }
>  }
>
> +static struct alg_exp_node *
> +expectation_lookup(struct hmap *alg_expectations,
> +                   const struct conn_key *key,
> +                   uint32_t basis)

OVS_REQ_RDLOCK(...) ?

> +{
> +    struct conn_key check_key = *key;
> +    check_key.src.port = ALG_WC_SRC_PORT;
> +    struct alg_exp_node *alg_exp_node;
> +    uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);
> +    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
> +                             alg_exp_conn_key_hash,
> +                             alg_expectations) {
> +        if (!memcmp(&alg_exp_node->key, &check_key,
> +                    sizeof alg_exp_node->key)) {
> +            return alg_exp_node;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static void
> +expectation_create(struct conntrack *ct,
> +                   ovs_be16 dst_port,
> +                   const long long now,
> +                   enum ct_alg_mode mode,
> +                   const struct conn *master_conn)
> +{
> +    struct ct_addr src_addr;
> +    struct ct_addr dst_addr;
> +    struct ct_addr alg_nat_repl_addr;
> +
> +    switch (mode) {
> +    case CT_FTP_MODE_ACTIVE:
> +        src_addr = master_conn->rev_key.src.addr;
> +        dst_addr = master_conn->rev_key.dst.addr;
> +        alg_nat_repl_addr = master_conn->key.src.addr;
> +        break;
> +    case CT_FTP_MODE_PASSIVE:
> +        src_addr = master_conn->key.src.addr;
> +        dst_addr = master_conn->key.dst.addr;
> +        alg_nat_repl_addr = master_conn->rev_key.src.addr;
> +        break;
> +    default:
> +        OVS_NOT_REACHED();
> +    }
> +
> +    struct alg_exp_node *alg_exp_node =
> +        xzalloc(sizeof *alg_exp_node);
> +    alg_exp_node->key.dl_type = master_conn->key.dl_type;
> +    alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
> +    alg_exp_node->key.zone = master_conn->key.zone;
> +    alg_exp_node->key.src.addr = src_addr;
> +    alg_exp_node->key.dst.addr = dst_addr;
> +    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
> +    alg_exp_node->key.dst.port = dst_port;
> +    alg_exp_node->master_mark = master_conn->mark;
> +    alg_exp_node->master_label = master_conn->label;
> +    alg_exp_node->master_key = master_conn->key;
> +    ct_rwlock_rdlock(&ct->resources_lock);
> +    struct alg_exp_node *alg_exp = expectation_lookup(
> +        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);
> +    ct_rwlock_unlock(&ct->resources_lock);
> +    if (alg_exp) {
> +        free(alg_exp_node);
> +        return;
> +    }
> +
> +    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
> +    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
> +    uint32_t alg_exp_conn_key_hash =
> +        conn_key_hash(&alg_exp_node->key,
> +                      ct->hash_basis);
> +    ct_rwlock_wrlock(&ct->resources_lock);
> +    hmap_insert(&ct->alg_expectations,
> +                &alg_exp_node->node,
> +                alg_exp_conn_key_hash);
> +
> +    alg_exp_init_expiration(ct, alg_exp_node, now);
> +    ct_rwlock_unlock(&ct->resources_lock);

Is there a race where another thread inserts an equivalent expectation
between grabbing the readlock above and the writelock here?

> +}
> +
>  static void
>  conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
>                  long long now)
> @@ -1886,6 +2194,7 @@ static void
>  delete_conn(struct conn *conn)
>  {
>      free(conn->nat_info);
> +    free(conn->alg);
>      free(conn);
>  }
>
> @@ -1950,6 +2259,10 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
>      if (class->conn_get_protoinfo) {
>          class->conn_get_protoinfo(conn, &entry->protoinfo);
>      }
> +
> +    if (conn->alg) {
> +        entry->helper.name = xstrdup(conn->alg);
> +    }
>  }
>
>  int
> @@ -2020,7 +2333,7 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>          struct conn *conn, *next;
>
>          ct_lock_lock(&ct->buckets[i].lock);
> -        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
> +        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
>              if ((!zone || *zone == conn->key.zone) &&
>                  (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
>                  conn_clean(ct, conn, &ct->buckets[i]);
> @@ -2028,5 +2341,550 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>          }
>          ct_lock_unlock(&ct->buckets[i].lock);
>      }
> +
> +    ct_rwlock_wrlock(&ct->resources_lock);

Should this be nested within the bucket lock?

> +    struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
> +    HMAP_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
> +                       node, &ct->alg_expectations) {
> +        if (!zone || *zone == alg_exp_node->key.zone) {
> +            ovs_list_remove(&alg_exp_node->exp_node);
> +            hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
> +            free(alg_exp_node);
> +        }
> +    }
> +    ct_rwlock_unlock(&ct->resources_lock);
>      return 0;
>  }
> +
> +static uint8_t
> +get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
> +{
> +    return v4_addr >> (index * 8) & 0xff;

I think this is where the sparse warning is generated:

lib/conntrack.c:2390:12: error: restricted ovs_be32 degrades to integer

<snip>

> +static enum ftp_ctl_pkt
> +process_ftp_ctl_v4(struct conntrack *ct,
> +                   struct conn_lookup_ctx *ctx,
> +                   struct dp_packet *pkt,
> +                   const struct conn *conn_for_expectation,
> +                   long long now, ovs_be32 *v4_addr_rep,
> +                   char **ftp_data_v4_start,
> +                   size_t *addr_offset_from_ftp_data_start)
> +{
> +
> +    struct tcp_header *th = dp_packet_l4(pkt);
> +    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +    char *tcp_hdr = (char *) th;
> +    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
> +    char *ftp = ftp_msg;
> +    enum ct_alg_mode mode;
> +
> +     detect_ftp_ctl_mode(ctx, pkt, ftp_msg);

Whitespace. Same for the other call to this function.

> +static enum ftp_ctl_pkt
> +process_ftp_ctl_v6(struct conntrack *ct,
> +                   struct conn_lookup_ctx *ctx,
> +                   struct dp_packet *pkt,
> +                   const struct conn *conn_for_expectation,
> +                   long long now,
> +                   struct ct_addr *v6_addr_rep,
> +                   char **ftp_data_start,
> +                   size_t *addr_offset_from_ftp_data_start,
> +                   size_t *addr_size, enum ct_alg_mode *mode)

I didn't work through the logic of these functions with an actual
message; it'd be easier to do so if there were example messages in the
code, but then I think we generally just rely on the tests passing to
show the correctness. I see that everything is copied into a local
buffer for iteration first, so I assume you've done due diligence with
respect to validating seeks into the message.

<snip>

> +    const char *tail = dp_packet_tail(pkt);
> +    uint8_t pad = dp_packet_l2_pad_size(pkt);
> +    th->tcp_csum = 0;
> +    uint32_t tcp_csum;
> +    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
> +        tcp_csum = packet_csum_pseudoheader6(nh6);
> +    } else {
> +        tcp_csum = packet_csum_pseudoheader(l3_hdr);
> +    }
> +    th->tcp_csum = csum_finish(
> +        csum_continue(tcp_csum, th, tail - (char *) th - pad));
> +    return;
> +}
> +

Extraneous newline at EOF?

> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index 243aebb..0b110c7 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -267,11 +267,13 @@ struct conntrack {

...
>      /* This lock is used during NAT connection creation and deletion;
>       * it is taken after a bucket lock and given back before that
>       * bucket unlock.
>       */

Please update the comment, even if to generalize saying that it
protects access to 'nat_conn_keys', 'alg_expectations' and
'alg_exp_list', and should be nested within locking of the ct bucket
lock.

> -    struct ct_rwlock nat_resources_lock;
> +    struct ct_rwlock resources_lock;

This change could probably be refactored out as a separate patch to
begin, reduce churn in this patch, and be trivially applied to master
independently of the rest of the series.
Darrell Ball June 24, 2017, 1:57 a.m. UTC | #2
On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:

    On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    > ALG infra and FTP (both V4 and V6) support is added to the userspace

    > datapath.  Also, NAT support is included.

    >

    > Signed-off-by: Darrell Ball <dlu998@gmail.com>

    > ---

    
    Hi Darrell, thanks for the patch.
    
    I wasn't able to test this out, because my build and test environments
    depend on sparse correctly running and it wouldn't compile. Please
    consider setting it up, or using Travis-CI to validate builds before
    v3.

I see Travis found a sparse warning; strange, I always look for these
and still see none, I even changed the line you saw flagged
    > +    return v4_addr >> (index * 8) & 0xff;

and still no warning :-(. I usually even get extra spurious warnings.
However, from simple inspection right now, I would expect a “restricted type” 
warning; it is not a real problem, but, of course I’ll fix it up


    
    I didn't do a thorough review, but I scanned through and pointed out a
    couple of parts I have questions about.
    
    >  lib/conntrack-private.h |   17 +

    >  lib/conntrack.c         | 1016 +++++++++++++++++++++++++++++++++++++++++++----

    >  lib/conntrack.h         |    4 +-

    >  3 files changed, 957 insertions(+), 80 deletions(-)

    >

    > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h

    > index 55084d3..993af33 100644

    > --- a/lib/conntrack-private.h

    > +++ b/lib/conntrack-private.h

    > @@ -62,17 +62,34 @@ struct nat_conn_key_node {

    >      struct conn_key value;

    >  };

    >

    > +struct alg_exp_node {

    > +    struct hmap_node node;

    > +    struct ovs_list exp_node;

    > +    long long expiration;

    > +    struct conn_key key;

    > +    struct conn_key master_key;

    > +    struct ct_addr alg_nat_repl_addr;

    > +    ovs_u128 master_label;

    > +    uint32_t master_mark;

    > +};

    > +

    >  struct conn {

    >      struct conn_key key;

    >      struct conn_key rev_key;

    > +    /* Only used for orig_tuple support. */

    > +    struct conn_key master_key;

    
    I guess no-one's looked at this structure from a cacheline
    perspective, but if we think it's worth optimizing then perhaps a
    followup patch should rearrange these. You might consider starting
    here by just placing all of the ALG-related fields together at the
    end.

I intend some optimizations later, but it is outside the scope of this patch.  
I placed the fields presently based on similar function for one field - master_conn
and the other fields based on padding considerations.
I leave as is for the time being and revisit later.
    
    >      long long expiration;

    >      struct ovs_list exp_node;

    >      struct hmap_node node;

    >      ovs_u128 label;

    >      /* XXX: consider flattening. */

    >      struct nat_action_info_t *nat_info;

    > +    char *alg;

    > +    int seq_skew;

    >      uint32_t mark;

    >      uint8_t conn_type;

    > +    uint8_t seq_skew_dir;

    > +    uint8_t alg_related;

    >  };

    >

    >  enum ct_update_res {

    > diff --git a/lib/conntrack.c b/lib/conntrack.c

    > index 90b154a..1f54fe3 100644

    > --- a/lib/conntrack.c

    > +++ b/lib/conntrack.c

    > @@ -1,5 +1,5 @@

    >  /*

    > - * Copyright (c) 2015, 2016 Nicira, Inc.

    > + * Copyright (c) 2015, 2016, 2017 Nicira, Inc.

    
    All three files could get an update like this.

Yep, was on my todo list.
    
    >   *

    >   * Licensed under the Apache License, Version 2.0 (the "License");

    >   * you may not use this file except in compliance with the License.

    > @@ -21,6 +21,7 @@

    >  #include <sys/types.h>

    >  #include <netinet/in.h>

    >  #include <netinet/icmp6.h>

    > +#include <ctype.h>

    
    Alphabetic order in each section of #includes?

Yep, I had read that rule.
    
    >

    >  #include "bitmap.h"

    >  #include "conntrack-private.h"

    > @@ -39,7 +40,6 @@

    >  #include "random.h"

    >  #include "timeval.h"

    >

    > -

    >  VLOG_DEFINE_THIS_MODULE(conntrack);

    >

    >  COVERAGE_DEFINE(conntrack_full);

    > @@ -50,9 +50,21 @@ struct conn_lookup_ctx {

    >      struct conn *conn;

    >      uint32_t hash;

    >      bool reply;

    > +    /* XXX: Only used by ICMP; change name. */

    
    Please consider sending a separate patch to change the name.

I put the reminder here to do later, but it is relevant to this series.
Maybe, I’ll just fold in.
    
    <snip>
    
    > @@ -179,14 +241,20 @@ conntrack_destroy(struct conntrack *ct)

    >          ct_lock_unlock(&ctb->lock);

    >          ct_lock_destroy(&ctb->lock);

    >      }

    > -    ct_rwlock_wrlock(&ct->nat_resources_lock);

    > +    ct_rwlock_wrlock(&ct->resources_lock);

    >      struct nat_conn_key_node *nat_conn_key_node;

    >      HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {

    >          free(nat_conn_key_node);

    >      }

    >      hmap_destroy(&ct->nat_conn_keys);

    > -    ct_rwlock_unlock(&ct->nat_resources_lock);

    > -    ct_rwlock_destroy(&ct->nat_resources_lock);

    > +

    > +    struct alg_exp_node *alg_exp_node;

    > +    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {

    > +        free(alg_exp_node);

    > +    }

    > +    hmap_destroy(&ct->alg_expectations);

    
    You might also consider poisoning ct->alg_exp_list.

Sure, why not. 

    
    <snip>
    
    > @@ -363,8 +469,8 @@ reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)

    >          struct ip_header *nh = dp_packet_l3(pkt);

    >          struct icmp_header *icmp = dp_packet_l4(pkt);

    >          struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);

    > -        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)

    > -                        -pad, &inner_l4, false);

    > +        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) -pad,

    > +                        &inner_l4, false);

    
    I guess this is just some automatic argument formatting that you
    performed incidentally, but if it's getting changed, then usually we
    put a space around operators such as '-'.

I am aware of the spacing rule.
The original code had the missing space b4 ‘pad’
I’ll fix it.
    
    <snip>
    
    > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

    >          nc->rev_key = nc->key;

    >          conn_key_reverse(&nc->rev_key);

    >

    > +        if (helper) {

    > +            nc->alg = xstrdup(helper);

    > +        }

    > +

    > +        if (alg_exp) {

    > +            nc->alg_related = true;

    > +            nc->mark = alg_exp->master_mark;

    > +            nc->label = alg_exp->master_label;

    > +            nc->master_key = alg_exp->master_key;

    > +        }

    > +

    >          if (nat_action_info) {

    >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);

    > -            ct_rwlock_wrlock(&ct->nat_resources_lock);

    > -

    > -            bool nat_res = nat_select_range_tuple(ct, nc,

    > -                                                  conn_for_un_nat_copy);

    >

    > -            if (!nat_res) {

    > -                free(nc->nat_info);

    > -                nc->nat_info = NULL;

    > -                free (nc);

    > -                ct_rwlock_unlock(&ct->nat_resources_lock);

    > -                return NULL;

    > -            }

    > +            if (alg_exp) {

    > +                nc->rev_key.src.addr = alg_nat_repl_addr;

    > +                nc->nat_info->nat_action = NAT_ACTION_DST;

    > +                *conn_for_un_nat_copy = *nc;

    > +            } else {

    > +                ct_rwlock_wrlock(&ct->resources_lock);

    > +                bool nat_res = nat_select_range_tuple(

    > +                                   ct, nc, conn_for_un_nat_copy);

    > +

    > +                if (!nat_res) {

    > +                    free(nc->nat_info);

    > +                    nc->nat_info = NULL;

    > +                    free (nc);

    
    I think that nc->alg may be leaked here? any reason it doesn't use
    delete_conn()?

Good
Yes, alg will leak in this rare error case and yes, delete_conn() should be used
here, as everywhere.
    
    > +                    ct_rwlock_unlock(&ct->resources_lock);

    > +                    return NULL;

    > +                }

    >

    > -            if (conn_for_un_nat_copy &&

    > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {

    >                  *nc = *conn_for_un_nat_copy;

    
    Perhaps nc->alg and/or nc->nat_info may be leaked here?

No, the un_nat conn has no such allocations, so there is nothing to leak.
    
    <snip>
    
    > @@ -923,6 +1126,28 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,

    >          }

    >      }

    >

    > +#define MAX_ALG_EXP_TO_EXPIRE 1000

    > +    size_t alg_exp_count = hmap_count(&ct->alg_expectations);

    > +    /* XXX: revisit this. */

    > +    size_t max_to_expire =

    > +        MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);

    
    I would have thought MAX_ALG_EXP_TO_EXPIRE would be the upper bound,
    not the lower bound.

If the number of expectations is not large (common case), then this number represents the lower bound. 
If the number of expectations is large, then it represents an upper bound.
    
    <snip>
    
    > @@ -1819,6 +2045,88 @@ nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,

    >      }

    >  }

    >

    > +static struct alg_exp_node *

    > +expectation_lookup(struct hmap *alg_expectations,

    > +                   const struct conn_key *key,

    > +                   uint32_t basis)

    
    OVS_REQ_RDLOCK(...) ?

Yep, missed this one
    
    > +{

    > +    struct conn_key check_key = *key;

    > +    check_key.src.port = ALG_WC_SRC_PORT;

    > +    struct alg_exp_node *alg_exp_node;

    > +    uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);

    > +    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,

    > +                             alg_exp_conn_key_hash,

    > +                             alg_expectations) {

    > +        if (!memcmp(&alg_exp_node->key, &check_key,

    > +                    sizeof alg_exp_node->key)) {

    > +            return alg_exp_node;

    > +        }

    > +    }

    > +

    > +    return NULL;

    > +}

    > +

    > +static void

    > +expectation_create(struct conntrack *ct,

    > +                   ovs_be16 dst_port,

    > +                   const long long now,

    > +                   enum ct_alg_mode mode,

    > +                   const struct conn *master_conn)

    > +{

    > +    struct ct_addr src_addr;

    > +    struct ct_addr dst_addr;

    > +    struct ct_addr alg_nat_repl_addr;

    > +

    > +    switch (mode) {

    > +    case CT_FTP_MODE_ACTIVE:

    > +        src_addr = master_conn->rev_key.src.addr;

    > +        dst_addr = master_conn->rev_key.dst.addr;

    > +        alg_nat_repl_addr = master_conn->key.src.addr;

    > +        break;

    > +    case CT_FTP_MODE_PASSIVE:

    > +        src_addr = master_conn->key.src.addr;

    > +        dst_addr = master_conn->key.dst.addr;

    > +        alg_nat_repl_addr = master_conn->rev_key.src.addr;

    > +        break;

    > +    default:

    > +        OVS_NOT_REACHED();

    > +    }

    > +

    > +    struct alg_exp_node *alg_exp_node =

    > +        xzalloc(sizeof *alg_exp_node);

    > +    alg_exp_node->key.dl_type = master_conn->key.dl_type;

    > +    alg_exp_node->key.nw_proto = master_conn->key.nw_proto;

    > +    alg_exp_node->key.zone = master_conn->key.zone;

    > +    alg_exp_node->key.src.addr = src_addr;

    > +    alg_exp_node->key.dst.addr = dst_addr;

    > +    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;

    > +    alg_exp_node->key.dst.port = dst_port;

    > +    alg_exp_node->master_mark = master_conn->mark;

    > +    alg_exp_node->master_label = master_conn->label;

    > +    alg_exp_node->master_key = master_conn->key;

    > +    ct_rwlock_rdlock(&ct->resources_lock);

    > +    struct alg_exp_node *alg_exp = expectation_lookup(

    > +        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);

    > +    ct_rwlock_unlock(&ct->resources_lock);

    > +    if (alg_exp) {

    > +        free(alg_exp_node);

    > +        return;

    > +    }

    > +

    > +    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;

    > +    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;

    > +    uint32_t alg_exp_conn_key_hash =

    > +        conn_key_hash(&alg_exp_node->key,

    > +                      ct->hash_basis);

    > +    ct_rwlock_wrlock(&ct->resources_lock);

    > +    hmap_insert(&ct->alg_expectations,

    > +                &alg_exp_node->node,

    > +                alg_exp_conn_key_hash);

    > +

    > +    alg_exp_init_expiration(ct, alg_exp_node, now);

    > +    ct_rwlock_unlock(&ct->resources_lock);

    
    Is there a race where another thread inserts an equivalent expectation
    between grabbing the readlock above and the writelock here?

This can’t happen practically because of how packets are assigned to threads
and how the protocols work.
It is theoretically possible with a very sophisticated exploit; but this would also be a weak exploit.
I had some other reasons to combine the critical sections however: the usual case is that the write
lock will be needed anyways; might as well take it even though this code only handles a tiny
subset of packets; also, readability is better.

    
    > +}

    > +

    >  static void

    >  conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,

    >                  long long now)

    > @@ -1886,6 +2194,7 @@ static void

    >  delete_conn(struct conn *conn)

    >  {

    >      free(conn->nat_info);

    > +    free(conn->alg);

    >      free(conn);

    >  }

    >

    > @@ -1950,6 +2259,10 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,

    >      if (class->conn_get_protoinfo) {

    >          class->conn_get_protoinfo(conn, &entry->protoinfo);

    >      }

    > +

    > +    if (conn->alg) {

    > +        entry->helper.name = xstrdup(conn->alg);

    > +    }

    >  }

    >

    >  int

    > @@ -2020,7 +2333,7 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)

    >          struct conn *conn, *next;

    >

    >          ct_lock_lock(&ct->buckets[i].lock);

    > -        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {

    > +        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {

    >              if ((!zone || *zone == conn->key.zone) &&

    >                  (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {

    >                  conn_clean(ct, conn, &ct->buckets[i]);

    > @@ -2028,5 +2341,550 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)

    >          }

    >          ct_lock_unlock(&ct->buckets[i].lock);

    >      }

    > +

    > +    ct_rwlock_wrlock(&ct->resources_lock);

    
    Should this be nested within the bucket lock?

No, expectations are protected by the resource lock. You
later correctly pointed out that I should update the comment about
the locking – I will do that.
    
    > +    struct alg_exp_node *alg_exp_node, *alg_exp_node_next;

    > +    HMAP_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,

    > +                       node, &ct->alg_expectations) {

    > +        if (!zone || *zone == alg_exp_node->key.zone) {

    > +            ovs_list_remove(&alg_exp_node->exp_node);

    > +            hmap_remove(&ct->alg_expectations, &alg_exp_node->node);

    > +            free(alg_exp_node);

    > +        }

    > +    }

    > +    ct_rwlock_unlock(&ct->resources_lock);

    >      return 0;

    >  }

    > +

    > +static uint8_t

    > +get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)

    > +{

    > +    return v4_addr >> (index * 8) & 0xff;

    
    I think this is where the sparse warning is generated:
    
    lib/conntrack.c:2390:12: error: restricted ovs_be32 degrades to integer

ok, as mentioned earlier, I cannot generate this warning, but I think the warning
should be there based on code reading; it is not a real problem, but I’ll fix it.
    
    <snip>
    
    > +static enum ftp_ctl_pkt

    > +process_ftp_ctl_v4(struct conntrack *ct,

    > +                   struct conn_lookup_ctx *ctx,

    > +                   struct dp_packet *pkt,

    > +                   const struct conn *conn_for_expectation,

    > +                   long long now, ovs_be32 *v4_addr_rep,

    > +                   char **ftp_data_v4_start,

    > +                   size_t *addr_offset_from_ftp_data_start)

    > +{

    > +

    > +    struct tcp_header *th = dp_packet_l4(pkt);

    > +    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;

    > +    char *tcp_hdr = (char *) th;

    > +    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};

    > +    char *ftp = ftp_msg;

    > +    enum ct_alg_mode mode;

    > +

    > +     detect_ftp_ctl_mode(ctx, pkt, ftp_msg);

    
    Whitespace. Same for the other call to this function.

Yep, both fixed
    
    > +static enum ftp_ctl_pkt

    > +process_ftp_ctl_v6(struct conntrack *ct,

    > +                   struct conn_lookup_ctx *ctx,

    > +                   struct dp_packet *pkt,

    > +                   const struct conn *conn_for_expectation,

    > +                   long long now,

    > +                   struct ct_addr *v6_addr_rep,

    > +                   char **ftp_data_start,

    > +                   size_t *addr_offset_from_ftp_data_start,

    > +                   size_t *addr_size, enum ct_alg_mode *mode)

    
    I didn't work through the logic of these functions with an actual
    message; it'd be easier to do so if there were example messages in the
    code, but then I think we generally just rely on the tests passing to
    show the correctness. I see that everything is copied into a local
    buffer for iteration first, so I assume you've done due diligence with
    respect to validating seeks into the message.
    
    <snip>
    
    > +    const char *tail = dp_packet_tail(pkt);

    > +    uint8_t pad = dp_packet_l2_pad_size(pkt);

    > +    th->tcp_csum = 0;

    > +    uint32_t tcp_csum;

    > +    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {

    > +        tcp_csum = packet_csum_pseudoheader6(nh6);

    > +    } else {

    > +        tcp_csum = packet_csum_pseudoheader(l3_hdr);

    > +    }

    > +    th->tcp_csum = csum_finish(

    > +        csum_continue(tcp_csum, th, tail - (char *) th - pad));

    > +    return;

    > +}

    > +

    
    Extraneous newline at EOF?

yep
    
    > diff --git a/lib/conntrack.h b/lib/conntrack.h

    > index 243aebb..0b110c7 100644

    > --- a/lib/conntrack.h

    > +++ b/lib/conntrack.h

    > @@ -267,11 +267,13 @@ struct conntrack {

    
    ...
    >      /* This lock is used during NAT connection creation and deletion;

    >       * it is taken after a bucket lock and given back before that

    >       * bucket unlock.

    >       */

    
    Please update the comment, even if to generalize saying that it
    protects access to 'nat_conn_keys', 'alg_expectations' and
    'alg_exp_list', and should be nested within locking of the ct bucket
    lock.

thanks for the reminder.
    
    > -    struct ct_rwlock nat_resources_lock;

    > +    struct ct_rwlock resources_lock;

    
    This change could probably be refactored out as a separate patch to
    begin, reduce churn in this patch, and be trivially applied to master
    independently of the rest of the series.

I thought about splicing it out as a separate patch - sure.
    _______________________________________________
    dev mailing list
    dev@openvswitch.org
    https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=xhOXA4NKJz5WP3lzFrYKiI5By1La7ibryhfvFSU77kU&s=HZBCg3WJOEBO9HgA7UlYksfllyZCL6QHVM-AXik1w3I&e=
Joe Stringer June 26, 2017, 5:22 p.m. UTC | #3
On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>
>
> On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>
>     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     > ALG infra and FTP (both V4 and V6) support is added to the userspace
>     > datapath.  Also, NAT support is included.
>     >
>     > Signed-off-by: Darrell Ball <dlu998@gmail.com>
>     > ---
>
>     Hi Darrell, thanks for the patch.
>
>     I wasn't able to test this out, because my build and test environments
>     depend on sparse correctly running and it wouldn't compile. Please
>     consider setting it up, or using Travis-CI to validate builds before
>     v3.
>
> I see Travis found a sparse warning; strange, I always look for these
> and still see none, I even changed the line you saw flagged
>     > +    return v4_addr >> (index * 8) & 0xff;
> and still no warning :-(. I usually even get extra spurious warnings.
> However, from simple inspection right now, I would expect a “restricted type”
> warning; it is not a real problem, but, of course I’ll fix it up

OK, thanks!

>     I didn't do a thorough review, but I scanned through and pointed out a
>     couple of parts I have questions about.
>
>     >  lib/conntrack-private.h |   17 +
>     >  lib/conntrack.c         | 1016 +++++++++++++++++++++++++++++++++++++++++++----
>     >  lib/conntrack.h         |    4 +-
>     >  3 files changed, 957 insertions(+), 80 deletions(-)
>     >
>     > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
>     > index 55084d3..993af33 100644
>     > --- a/lib/conntrack-private.h
>     > +++ b/lib/conntrack-private.h
>     > @@ -62,17 +62,34 @@ struct nat_conn_key_node {
>     >      struct conn_key value;
>     >  };
>     >
>     > +struct alg_exp_node {
>     > +    struct hmap_node node;
>     > +    struct ovs_list exp_node;
>     > +    long long expiration;
>     > +    struct conn_key key;
>     > +    struct conn_key master_key;
>     > +    struct ct_addr alg_nat_repl_addr;
>     > +    ovs_u128 master_label;
>     > +    uint32_t master_mark;
>     > +};
>     > +
>     >  struct conn {
>     >      struct conn_key key;
>     >      struct conn_key rev_key;
>     > +    /* Only used for orig_tuple support. */
>     > +    struct conn_key master_key;
>
>     I guess no-one's looked at this structure from a cacheline
>     perspective, but if we think it's worth optimizing then perhaps a
>     followup patch should rearrange these. You might consider starting
>     here by just placing all of the ALG-related fields together at the
>     end.
>
> I intend some optimizations later, but it is outside the scope of this patch.
> I placed the fields presently based on similar function for one field - master_conn
> and the other fields based on padding considerations.
> I leave as is for the time being and revisit later.

Sure, sounds good.

>     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >          nc->rev_key = nc->key;
>     >          conn_key_reverse(&nc->rev_key);
>     >
>     > +        if (helper) {
>     > +            nc->alg = xstrdup(helper);
>     > +        }
>     > +
>     > +        if (alg_exp) {
>     > +            nc->alg_related = true;
>     > +            nc->mark = alg_exp->master_mark;
>     > +            nc->label = alg_exp->master_label;
>     > +            nc->master_key = alg_exp->master_key;
>     > +        }
>     > +
>     >          if (nat_action_info) {
>     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     > -
>     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     > -                                                  conn_for_un_nat_copy);
>     >
>     > -            if (!nat_res) {
>     > -                free(nc->nat_info);
>     > -                nc->nat_info = NULL;
>     > -                free (nc);
>     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     > -                return NULL;
>     > -            }
>     > +            if (alg_exp) {
>     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     > +                *conn_for_un_nat_copy = *nc;
>     > +            } else {
>     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     > +                bool nat_res = nat_select_range_tuple(
>     > +                                   ct, nc, conn_for_un_nat_copy);
>     > +
>     > +                if (!nat_res) {
>     > +                    free(nc->nat_info);
>     > +                    nc->nat_info = NULL;
>     > +                    free (nc);
>
>     I think that nc->alg may be leaked here? any reason it doesn't use
>     delete_conn()?
>
> Good
> Yes, alg will leak in this rare error case and yes, delete_conn() should be used
> here, as everywhere.

OK.

>     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     > +                    return NULL;
>     > +                }
>     >
>     > -            if (conn_for_un_nat_copy &&
>     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >                  *nc = *conn_for_un_nat_copy;
>
>     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>
> No, the un_nat conn has no such allocations, so there is nothing to leak.

I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?

>     <snip>
>
>     > @@ -923,6 +1126,28 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
>     >          }
>     >      }
>     >
>     > +#define MAX_ALG_EXP_TO_EXPIRE 1000
>     > +    size_t alg_exp_count = hmap_count(&ct->alg_expectations);
>     > +    /* XXX: revisit this. */
>     > +    size_t max_to_expire =
>     > +        MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);
>
>     I would have thought MAX_ALG_EXP_TO_EXPIRE would be the upper bound,
>     not the lower bound.
>
> If the number of expectations is not large (common case), then this number represents the lower bound.
> If the number of expectations is large, then it represents an upper bound.

alg_exp_count=10, then MAX(1, 1000) -> expire up to 1000 entries.

alg_exp_count=100000, then MAX(10000,1000) -> expire up to 10000 entries.

So, MAX_ALG_EXP_TO_EXPIRE isn't a maximum?

In practice it seems unlikely to end up in a situation with so many
expectations, but still I found this a bit confusing.

>     <snip>
>
>     > @@ -1819,6 +2045,88 @@ nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
>     >      }
>     >  }
>     >
>     > +static struct alg_exp_node *
>     > +expectation_lookup(struct hmap *alg_expectations,
>     > +                   const struct conn_key *key,
>     > +                   uint32_t basis)
>
>     OVS_REQ_RDLOCK(...) ?
>
> Yep, missed this one

There might be others, worth keeping an eye out.

>     > +{
>     > +    struct conn_key check_key = *key;
>     > +    check_key.src.port = ALG_WC_SRC_PORT;
>     > +    struct alg_exp_node *alg_exp_node;
>     > +    uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);
>     > +    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
>     > +                             alg_exp_conn_key_hash,
>     > +                             alg_expectations) {
>     > +        if (!memcmp(&alg_exp_node->key, &check_key,
>     > +                    sizeof alg_exp_node->key)) {
>     > +            return alg_exp_node;
>     > +        }
>     > +    }
>     > +
>     > +    return NULL;
>     > +}
>     > +
>     > +static void
>     > +expectation_create(struct conntrack *ct,
>     > +                   ovs_be16 dst_port,
>     > +                   const long long now,
>     > +                   enum ct_alg_mode mode,
>     > +                   const struct conn *master_conn)
>     > +{
>     > +    struct ct_addr src_addr;
>     > +    struct ct_addr dst_addr;
>     > +    struct ct_addr alg_nat_repl_addr;
>     > +
>     > +    switch (mode) {
>     > +    case CT_FTP_MODE_ACTIVE:
>     > +        src_addr = master_conn->rev_key.src.addr;
>     > +        dst_addr = master_conn->rev_key.dst.addr;
>     > +        alg_nat_repl_addr = master_conn->key.src.addr;
>     > +        break;
>     > +    case CT_FTP_MODE_PASSIVE:
>     > +        src_addr = master_conn->key.src.addr;
>     > +        dst_addr = master_conn->key.dst.addr;
>     > +        alg_nat_repl_addr = master_conn->rev_key.src.addr;
>     > +        break;
>     > +    default:
>     > +        OVS_NOT_REACHED();
>     > +    }
>     > +
>     > +    struct alg_exp_node *alg_exp_node =
>     > +        xzalloc(sizeof *alg_exp_node);
>     > +    alg_exp_node->key.dl_type = master_conn->key.dl_type;
>     > +    alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
>     > +    alg_exp_node->key.zone = master_conn->key.zone;
>     > +    alg_exp_node->key.src.addr = src_addr;
>     > +    alg_exp_node->key.dst.addr = dst_addr;
>     > +    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
>     > +    alg_exp_node->key.dst.port = dst_port;
>     > +    alg_exp_node->master_mark = master_conn->mark;
>     > +    alg_exp_node->master_label = master_conn->label;
>     > +    alg_exp_node->master_key = master_conn->key;
>     > +    ct_rwlock_rdlock(&ct->resources_lock);
>     > +    struct alg_exp_node *alg_exp = expectation_lookup(
>     > +        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);
>     > +    ct_rwlock_unlock(&ct->resources_lock);
>     > +    if (alg_exp) {
>     > +        free(alg_exp_node);
>     > +        return;
>     > +    }
>     > +
>     > +    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
>     > +    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
>     > +    uint32_t alg_exp_conn_key_hash =
>     > +        conn_key_hash(&alg_exp_node->key,
>     > +                      ct->hash_basis);
>     > +    ct_rwlock_wrlock(&ct->resources_lock);
>     > +    hmap_insert(&ct->alg_expectations,
>     > +                &alg_exp_node->node,
>     > +                alg_exp_conn_key_hash);
>     > +
>     > +    alg_exp_init_expiration(ct, alg_exp_node, now);
>     > +    ct_rwlock_unlock(&ct->resources_lock);
>
>     Is there a race where another thread inserts an equivalent expectation
>     between grabbing the readlock above and the writelock here?
>
> This can’t happen practically because of how packets are assigned to threads
> and how the protocols work.
> It is theoretically possible with a very sophisticated exploit; but this would also be a weak exploit.
> I had some other reasons to combine the critical sections however: the usual case is that the write
> lock will be needed anyways; might as well take it even though this code only handles a tiny
> subset of packets; also, readability is better.

OK.

>     >  int
>     > @@ -2020,7 +2333,7 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>     >          struct conn *conn, *next;
>     >
>     >          ct_lock_lock(&ct->buckets[i].lock);
>     > -        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
>     > +        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
>     >              if ((!zone || *zone == conn->key.zone) &&
>     >                  (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
>     >                  conn_clean(ct, conn, &ct->buckets[i]);
>     > @@ -2028,5 +2341,550 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>     >          }
>     >          ct_lock_unlock(&ct->buckets[i].lock);
>     >      }
>     > +
>     > +    ct_rwlock_wrlock(&ct->resources_lock);
>
>     Should this be nested within the bucket lock?
>
> No, expectations are protected by the resource lock. You
> later correctly pointed out that I should update the comment about
> the locking – I will do that.

Right. So long as there's no chance for something to acquire
ct_resources_lock then subsequently try to grab a bucket lock.

Thanks.
Darrell Ball June 26, 2017, 6:45 p.m. UTC | #4
On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:

    On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >

    >

    > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:

    >

    >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:

    >     > ALG infra and FTP (both V4 and V6) support is added to the userspace

    >     > datapath.  Also, NAT support is included.

    >     >

    >     > Signed-off-by: Darrell Ball <dlu998@gmail.com>

    >     > ---

    >

    >     Hi Darrell, thanks for the patch.

    >

    >     I wasn't able to test this out, because my build and test environments

    >     depend on sparse correctly running and it wouldn't compile. Please

    >     consider setting it up, or using Travis-CI to validate builds before

    >     v3.

    >

    > I see Travis found a sparse warning; strange, I always look for these

    > and still see none, I even changed the line you saw flagged

    >     > +    return v4_addr >> (index * 8) & 0xff;

    > and still no warning :-(. I usually even get extra spurious warnings.

    > However, from simple inspection right now, I would expect a “restricted type”

    > warning; it is not a real problem, but, of course I’ll fix it up

    
    OK, thanks!
    
    >     I didn't do a thorough review, but I scanned through and pointed out a

    >     couple of parts I have questions about.

    >

    >     >  lib/conntrack-private.h |   17 +

    >     >  lib/conntrack.c         | 1016 +++++++++++++++++++++++++++++++++++++++++++----

    >     >  lib/conntrack.h         |    4 +-

    >     >  3 files changed, 957 insertions(+), 80 deletions(-)

    >     >

    >     > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h

    >     > index 55084d3..993af33 100644

    >     > --- a/lib/conntrack-private.h

    >     > +++ b/lib/conntrack-private.h

    >     > @@ -62,17 +62,34 @@ struct nat_conn_key_node {

    >     >      struct conn_key value;

    >     >  };

    >     >

    >     > +struct alg_exp_node {

    >     > +    struct hmap_node node;

    >     > +    struct ovs_list exp_node;

    >     > +    long long expiration;

    >     > +    struct conn_key key;

    >     > +    struct conn_key master_key;

    >     > +    struct ct_addr alg_nat_repl_addr;

    >     > +    ovs_u128 master_label;

    >     > +    uint32_t master_mark;

    >     > +};

    >     > +

    >     >  struct conn {

    >     >      struct conn_key key;

    >     >      struct conn_key rev_key;

    >     > +    /* Only used for orig_tuple support. */

    >     > +    struct conn_key master_key;

    >

    >     I guess no-one's looked at this structure from a cacheline

    >     perspective, but if we think it's worth optimizing then perhaps a

    >     followup patch should rearrange these. You might consider starting

    >     here by just placing all of the ALG-related fields together at the

    >     end.

    >

    > I intend some optimizations later, but it is outside the scope of this patch.

    > I placed the fields presently based on similar function for one field - master_conn

    > and the other fields based on padding considerations.

    > I leave as is for the time being and revisit later.

    
    Sure, sounds good.
    
    >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,

    >     >          nc->rev_key = nc->key;

    >     >          conn_key_reverse(&nc->rev_key);

    >     >

    >     > +        if (helper) {

    >     > +            nc->alg = xstrdup(helper);

    >     > +        }

    >     > +

    >     > +        if (alg_exp) {

    >     > +            nc->alg_related = true;

    >     > +            nc->mark = alg_exp->master_mark;

    >     > +            nc->label = alg_exp->master_label;

    >     > +            nc->master_key = alg_exp->master_key;

    >     > +        }

    >     > +

    >     >          if (nat_action_info) {

    >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);

    >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);

    >     > -

    >     > -            bool nat_res = nat_select_range_tuple(ct, nc,

    >     > -                                                  conn_for_un_nat_copy);

    >     >

    >     > -            if (!nat_res) {

    >     > -                free(nc->nat_info);

    >     > -                nc->nat_info = NULL;

    >     > -                free (nc);

    >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);

    >     > -                return NULL;

    >     > -            }

    >     > +            if (alg_exp) {

    >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;

    >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;

    >     > +                *conn_for_un_nat_copy = *nc;

    >     > +            } else {

    >     > +                ct_rwlock_wrlock(&ct->resources_lock);

    >     > +                bool nat_res = nat_select_range_tuple(

    >     > +                                   ct, nc, conn_for_un_nat_copy);

    >     > +

    >     > +                if (!nat_res) {

    >     > +                    free(nc->nat_info);

    >     > +                    nc->nat_info = NULL;

    >     > +                    free (nc);

    >

    >     I think that nc->alg may be leaked here? any reason it doesn't use

    >     delete_conn()?

    >

    > Good

    > Yes, alg will leak in this rare error case and yes, delete_conn() should be used

    > here, as everywhere.

    
    OK.
    
    >     > +                    ct_rwlock_unlock(&ct->resources_lock);

    >     > +                    return NULL;

    >     > +                }

    >     >

    >     > -            if (conn_for_un_nat_copy &&

    >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {

    >     >                  *nc = *conn_for_un_nat_copy;

    >

    >     Perhaps nc->alg and/or nc->nat_info may be leaked here?

    >

    > No, the un_nat conn has no such allocations, so there is nothing to leak.

    
    I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?

I see your question now.
No, at this point, the copy gets the same pointers to the alg string and nat_info.
Only nc needs them and the un_nat copy ptrs are nulled.
There is only one allocation set.

    
    >     <snip>

    >

    >     > @@ -923,6 +1126,28 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,

    >     >          }

    >     >      }

    >     >

    >     > +#define MAX_ALG_EXP_TO_EXPIRE 1000

    >     > +    size_t alg_exp_count = hmap_count(&ct->alg_expectations);

    >     > +    /* XXX: revisit this. */

    >     > +    size_t max_to_expire =

    >     > +        MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);

    >

    >     I would have thought MAX_ALG_EXP_TO_EXPIRE would be the upper bound,

    >     not the lower bound.

    >

    > If the number of expectations is not large (common case), then this number represents the lower bound.

    > If the number of expectations is large, then it represents an upper bound.

    
    alg_exp_count=10, then MAX(1, 1000) -> expire up to 1000 entries.
    
    alg_exp_count=100000, then MAX(10000,1000) -> expire up to 10000 entries.
    
    So, MAX_ALG_EXP_TO_EXPIRE isn't a maximum?

I agree and was mulling over the same a few times; I will change
MAX_ALG_EXP_TO_EXPIRE
to 
DFT_ALG_EXP_TO_EXPIRE
    
    In practice it seems unlikely to end up in a situation with so many
    expectations, but still I found this a bit confusing.

    
    >     <snip>

    >

    >     > @@ -1819,6 +2045,88 @@ nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,

    >     >      }

    >     >  }

    >     >

    >     > +static struct alg_exp_node *

    >     > +expectation_lookup(struct hmap *alg_expectations,

    >     > +                   const struct conn_key *key,

    >     > +                   uint32_t basis)

    >

    >     OVS_REQ_RDLOCK(...) ?

    >

    > Yep, missed this one

    
    There might be others, worth keeping an eye out.

Yes, there are a few.
The reason I did not add them is that it requires an extra parameter for the lock
that is unused so the OVS_REQ can reference it; I might just add a comment and skip adding
the extra dummy parameter.
    
    >     > +{

    >     > +    struct conn_key check_key = *key;

    >     > +    check_key.src.port = ALG_WC_SRC_PORT;

    >     > +    struct alg_exp_node *alg_exp_node;

    >     > +    uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);

    >     > +    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,

    >     > +                             alg_exp_conn_key_hash,

    >     > +                             alg_expectations) {

    >     > +        if (!memcmp(&alg_exp_node->key, &check_key,

    >     > +                    sizeof alg_exp_node->key)) {

    >     > +            return alg_exp_node;

    >     > +        }

    >     > +    }

    >     > +

    >     > +    return NULL;

    >     > +}

    >     > +

    >     > +static void

    >     > +expectation_create(struct conntrack *ct,

    >     > +                   ovs_be16 dst_port,

    >     > +                   const long long now,

    >     > +                   enum ct_alg_mode mode,

    >     > +                   const struct conn *master_conn)

    >     > +{

    >     > +    struct ct_addr src_addr;

    >     > +    struct ct_addr dst_addr;

    >     > +    struct ct_addr alg_nat_repl_addr;

    >     > +

    >     > +    switch (mode) {

    >     > +    case CT_FTP_MODE_ACTIVE:

    >     > +        src_addr = master_conn->rev_key.src.addr;

    >     > +        dst_addr = master_conn->rev_key.dst.addr;

    >     > +        alg_nat_repl_addr = master_conn->key.src.addr;

    >     > +        break;

    >     > +    case CT_FTP_MODE_PASSIVE:

    >     > +        src_addr = master_conn->key.src.addr;

    >     > +        dst_addr = master_conn->key.dst.addr;

    >     > +        alg_nat_repl_addr = master_conn->rev_key.src.addr;

    >     > +        break;

    >     > +    default:

    >     > +        OVS_NOT_REACHED();

    >     > +    }

    >     > +

    >     > +    struct alg_exp_node *alg_exp_node =

    >     > +        xzalloc(sizeof *alg_exp_node);

    >     > +    alg_exp_node->key.dl_type = master_conn->key.dl_type;

    >     > +    alg_exp_node->key.nw_proto = master_conn->key.nw_proto;

    >     > +    alg_exp_node->key.zone = master_conn->key.zone;

    >     > +    alg_exp_node->key.src.addr = src_addr;

    >     > +    alg_exp_node->key.dst.addr = dst_addr;

    >     > +    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;

    >     > +    alg_exp_node->key.dst.port = dst_port;

    >     > +    alg_exp_node->master_mark = master_conn->mark;

    >     > +    alg_exp_node->master_label = master_conn->label;

    >     > +    alg_exp_node->master_key = master_conn->key;

    >     > +    ct_rwlock_rdlock(&ct->resources_lock);

    >     > +    struct alg_exp_node *alg_exp = expectation_lookup(

    >     > +        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);

    >     > +    ct_rwlock_unlock(&ct->resources_lock);

    >     > +    if (alg_exp) {

    >     > +        free(alg_exp_node);

    >     > +        return;

    >     > +    }

    >     > +

    >     > +    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;

    >     > +    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;

    >     > +    uint32_t alg_exp_conn_key_hash =

    >     > +        conn_key_hash(&alg_exp_node->key,

    >     > +                      ct->hash_basis);

    >     > +    ct_rwlock_wrlock(&ct->resources_lock);

    >     > +    hmap_insert(&ct->alg_expectations,

    >     > +                &alg_exp_node->node,

    >     > +                alg_exp_conn_key_hash);

    >     > +

    >     > +    alg_exp_init_expiration(ct, alg_exp_node, now);

    >     > +    ct_rwlock_unlock(&ct->resources_lock);

    >

    >     Is there a race where another thread inserts an equivalent expectation

    >     between grabbing the readlock above and the writelock here?

    >

    > This can’t happen practically because of how packets are assigned to threads

    > and how the protocols work.

    > It is theoretically possible with a very sophisticated exploit; but this would also be a weak exploit.

    > I had some other reasons to combine the critical sections however: the usual case is that the write

    > lock will be needed anyways; might as well take it even though this code only handles a tiny

    > subset of packets; also, readability is better.

    
    OK.
    
    >     >  int

    >     > @@ -2020,7 +2333,7 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)

    >     >          struct conn *conn, *next;

    >     >

    >     >          ct_lock_lock(&ct->buckets[i].lock);

    >     > -        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {

    >     > +        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {

    >     >              if ((!zone || *zone == conn->key.zone) &&

    >     >                  (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {

    >     >                  conn_clean(ct, conn, &ct->buckets[i]);

    >     > @@ -2028,5 +2341,550 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)

    >     >          }

    >     >          ct_lock_unlock(&ct->buckets[i].lock);

    >     >      }

    >     > +

    >     > +    ct_rwlock_wrlock(&ct->resources_lock);

    >

    >     Should this be nested within the bucket lock?

    >

    > No, expectations are protected by the resource lock. You

    > later correctly pointed out that I should update the comment about

    > the locking – I will do that.

    
    Right. So long as there's no chance for something to acquire
    ct_resources_lock then subsequently try to grab a bucket lock.
    
    Thanks.
Joe Stringer June 26, 2017, 8:22 p.m. UTC | #5
On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >     >          nc->rev_key = nc->key;
>     >     >          conn_key_reverse(&nc->rev_key);
>     >     >
>     >     > +        if (helper) {
>     >     > +            nc->alg = xstrdup(helper);
>     >     > +        }
>     >     > +
>     >     > +        if (alg_exp) {
>     >     > +            nc->alg_related = true;
>     >     > +            nc->mark = alg_exp->master_mark;
>     >     > +            nc->label = alg_exp->master_label;
>     >     > +            nc->master_key = alg_exp->master_key;
>     >     > +        }
>     >     > +
>     >     >          if (nat_action_info) {
>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     >     > -
>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     >     > -                                                  conn_for_un_nat_copy);
>     >     >
>     >     > -            if (!nat_res) {
>     >     > -                free(nc->nat_info);
>     >     > -                nc->nat_info = NULL;
>     >     > -                free (nc);
>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     >     > -                return NULL;
>     >     > -            }
>     >     > +            if (alg_exp) {
>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     > +                *conn_for_un_nat_copy = *nc;
>     >     > +            } else {
>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     >     > +                bool nat_res = nat_select_range_tuple(
>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>     >     > +
>     >     > +                if (!nat_res) {
>     >     > +                    free(nc->nat_info);
>     >     > +                    nc->nat_info = NULL;
>     >     > +                    free (nc);
>     >
>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>     >     delete_conn()?
>     >
>     > Good
>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>     > here, as everywhere.
>
>     OK.
>
>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     >     > +                    return NULL;
>     >     > +                }
>     >     >
>     >     > -            if (conn_for_un_nat_copy &&
>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >     >                  *nc = *conn_for_un_nat_copy;
>     >
>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>     >
>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>
>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>
> I see your question now.
> No, at this point, the copy gets the same pointers to the alg string and nat_info.
> Only nc needs them and the un_nat copy ptrs are nulled.
> There is only one allocation set.

Hmm. Maybe I'm just missing something, let me walk through it step by
step below and let's see where it goes.

       if (helper) {
           nc->alg = xstrdup(helper);
^ nc->alg is set
       }

       if (alg_exp) {
^ false; do not execute this block
           nc->alg_related = true;
           nc->mark = alg_exp->master_mark;
           nc->label = alg_exp->master_label;
           nc->master_key = alg_exp->master_key;
       }

       if (nat_action_info) {
^ true, execute this part
           nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);

           if (alg_exp) {
^ false; skip to else
               nc->rev_key.src.addr = alg_nat_repl_addr;
               nc->nat_info->nat_action = NAT_ACTION_DST;
               *conn_for_un_nat_copy = *nc;
           } else {
^ We go through this condition
               ct_rwlock_wrlock(&ct->resources_lock);
               bool nat_res = nat_select_range_tuple(
                                  ct, nc, conn_for_un_nat_copy);

               if (!nat_res) {
^ false; do not execute this block
                   free(nc->nat_info);
                   nc->nat_info = NULL;
                   free (nc);
                   ct_rwlock_unlock(&ct->resources_lock);
                   return NULL;
               }

               *nc = *conn_for_un_nat_copy;
^ Now:
nc->alg is overwritten by conn_for_un_nat_copy->alg
nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info

We don't free either of these.
Joe Stringer June 26, 2017, 11:49 p.m. UTC | #6
On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
> On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
>> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>>     >     >          nc->rev_key = nc->key;
>>     >     >          conn_key_reverse(&nc->rev_key);
>>     >     >
>>     >     > +        if (helper) {
>>     >     > +            nc->alg = xstrdup(helper);
>>     >     > +        }
>>     >     > +
>>     >     > +        if (alg_exp) {
>>     >     > +            nc->alg_related = true;
>>     >     > +            nc->mark = alg_exp->master_mark;
>>     >     > +            nc->label = alg_exp->master_label;
>>     >     > +            nc->master_key = alg_exp->master_key;
>>     >     > +        }
>>     >     > +
>>     >     >          if (nat_action_info) {
>>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>>     >     > -
>>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>>     >     > -                                                  conn_for_un_nat_copy);
>>     >     >
>>     >     > -            if (!nat_res) {
>>     >     > -                free(nc->nat_info);
>>     >     > -                nc->nat_info = NULL;
>>     >     > -                free (nc);
>>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>>     >     > -                return NULL;
>>     >     > -            }
>>     >     > +            if (alg_exp) {
>>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>>     >     > +                *conn_for_un_nat_copy = *nc;
>>     >     > +            } else {
>>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>>     >     > +                bool nat_res = nat_select_range_tuple(
>>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>>     >     > +
>>     >     > +                if (!nat_res) {
>>     >     > +                    free(nc->nat_info);
>>     >     > +                    nc->nat_info = NULL;
>>     >     > +                    free (nc);
>>     >
>>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>>     >     delete_conn()?
>>     >
>>     > Good
>>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>>     > here, as everywhere.
>>
>>     OK.
>>
>>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>>     >     > +                    return NULL;
>>     >     > +                }
>>     >     >
>>     >     > -            if (conn_for_un_nat_copy &&
>>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>>     >     >                  *nc = *conn_for_un_nat_copy;
>>     >
>>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>>     >
>>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>>
>>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>>
>> I see your question now.
>> No, at this point, the copy gets the same pointers to the alg string and nat_info.
>> Only nc needs them and the un_nat copy ptrs are nulled.
>> There is only one allocation set.
>
> Hmm. Maybe I'm just missing something, let me walk through it step by
> step below and let's see where it goes.
>
>        if (helper) {
>            nc->alg = xstrdup(helper);
> ^ nc->alg is set
>        }
>
>        if (alg_exp) {
> ^ false; do not execute this block
>            nc->alg_related = true;
>            nc->mark = alg_exp->master_mark;
>            nc->label = alg_exp->master_label;
>            nc->master_key = alg_exp->master_key;
>        }
>
>        if (nat_action_info) {
> ^ true, execute this part
>            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>
>            if (alg_exp) {
> ^ false; skip to else
>                nc->rev_key.src.addr = alg_nat_repl_addr;
>                nc->nat_info->nat_action = NAT_ACTION_DST;
>                *conn_for_un_nat_copy = *nc;
>            } else {
> ^ We go through this condition
>                ct_rwlock_wrlock(&ct->resources_lock);
>                bool nat_res = nat_select_range_tuple(
>                                   ct, nc, conn_for_un_nat_copy);
>
>                if (!nat_res) {
> ^ false; do not execute this block
>                    free(nc->nat_info);
>                    nc->nat_info = NULL;
>                    free (nc);
>                    ct_rwlock_unlock(&ct->resources_lock);
>                    return NULL;
>                }
>
>                *nc = *conn_for_un_nat_copy;
> ^ Now:
> nc->alg is overwritten by conn_for_un_nat_copy->alg
> nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
>
> We don't free either of these.

As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
nested inside nat_select_range_tuple() is very well hidden. This means
that the above is not a problem... but what if (!nat_res) ? Then
conn_for_un_nat_copy() has a reference to these alg/nat_info
parameters which are freed from 'nc' inside that block, then
'conn_for_un_nat_copy' is returned. Could there be a use-after-free
then?
Darrell Ball June 27, 2017, 1:19 a.m. UTC | #7
On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:

    On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
    > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
    >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
    >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >>     >     >          nc->rev_key = nc->key;
    >>     >     >          conn_key_reverse(&nc->rev_key);
    >>     >     >
    >>     >     > +        if (helper) {
    >>     >     > +            nc->alg = xstrdup(helper);
    >>     >     > +        }
    >>     >     > +
    >>     >     > +        if (alg_exp) {
    >>     >     > +            nc->alg_related = true;
    >>     >     > +            nc->mark = alg_exp->master_mark;
    >>     >     > +            nc->label = alg_exp->master_label;
    >>     >     > +            nc->master_key = alg_exp->master_key;
    >>     >     > +        }
    >>     >     > +
    >>     >     >          if (nat_action_info) {
    >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >>     >     > -
    >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
    >>     >     > -                                                  conn_for_un_nat_copy);
    >>     >     >
    >>     >     > -            if (!nat_res) {
    >>     >     > -                free(nc->nat_info);
    >>     >     > -                nc->nat_info = NULL;
    >>     >     > -                free (nc);
    >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
    >>     >     > -                return NULL;
    >>     >     > -            }
    >>     >     > +            if (alg_exp) {
    >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
    >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
    >>     >     > +                *conn_for_un_nat_copy = *nc;
    >>     >     > +            } else {
    >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
    >>     >     > +                bool nat_res = nat_select_range_tuple(
    >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
    >>     >     > +
    >>     >     > +                if (!nat_res) {
    >>     >     > +                    free(nc->nat_info);
    >>     >     > +                    nc->nat_info = NULL;
    >>     >     > +                    free (nc);
    >>     >
    >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
    >>     >     delete_conn()?
    >>     >
    >>     > Good
    >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
    >>     > here, as everywhere.
    >>
    >>     OK.
    >>
    >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
    >>     >     > +                    return NULL;
    >>     >     > +                }
    >>     >     >
    >>     >     > -            if (conn_for_un_nat_copy &&
    >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    >>     >     >                  *nc = *conn_for_un_nat_copy;
    >>     >
    >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
    >>     >
    >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
    >>
    >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
    >>
    >> I see your question now.
    >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
    >> Only nc needs them and the un_nat copy ptrs are nulled.
    >> There is only one allocation set.
    >
    > Hmm. Maybe I'm just missing something, let me walk through it step by
    > step below and let's see where it goes.
    >
    >        if (helper) {
    >            nc->alg = xstrdup(helper);
    > ^ nc->alg is set
    >        }
    >
    >        if (alg_exp) {
    > ^ false; do not execute this block
    >            nc->alg_related = true;
    >            nc->mark = alg_exp->master_mark;
    >            nc->label = alg_exp->master_label;
    >            nc->master_key = alg_exp->master_key;
    >        }
    >
    >        if (nat_action_info) {
    > ^ true, execute this part
    >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >
    >            if (alg_exp) {
    > ^ false; skip to else
    >                nc->rev_key.src.addr = alg_nat_repl_addr;
    >                nc->nat_info->nat_action = NAT_ACTION_DST;
    >                *conn_for_un_nat_copy = *nc;
    >            } else {
    > ^ We go through this condition
    >                ct_rwlock_wrlock(&ct->resources_lock);
    >                bool nat_res = nat_select_range_tuple(
    >                                   ct, nc, conn_for_un_nat_copy);
    >
    >                if (!nat_res) {
    > ^ false; do not execute this block
    >                    free(nc->nat_info);
    >                    nc->nat_info = NULL;
    >                    free (nc);
    >                    ct_rwlock_unlock(&ct->resources_lock);
    >                    return NULL;
    >                }
    >
    >                *nc = *conn_for_un_nat_copy;
    > ^ Now:
    > nc->alg is overwritten by conn_for_un_nat_copy->alg
    > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
    >
    > We don't free either of these.
    
    As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
    nested inside nat_select_range_tuple() is very well hidden. This means
    that the above is not a problem... but what if (!nat_res) ? Then
    conn_for_un_nat_copy() has a reference to these alg/nat_info
    parameters which are freed from 'nc' inside that block, then
    'conn_for_un_nat_copy' is returned. Could there be a use-after-free
    then?

Nope, because there is no un_nat conn.
Joe Stringer June 27, 2017, 5:40 p.m. UTC | #8
On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
>
>
> On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
>
>     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
>     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
>     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >>     >     >          nc->rev_key = nc->key;
>     >>     >     >          conn_key_reverse(&nc->rev_key);
>     >>     >     >
>     >>     >     > +        if (helper) {
>     >>     >     > +            nc->alg = xstrdup(helper);
>     >>     >     > +        }
>     >>     >     > +
>     >>     >     > +        if (alg_exp) {
>     >>     >     > +            nc->alg_related = true;
>     >>     >     > +            nc->mark = alg_exp->master_mark;
>     >>     >     > +            nc->label = alg_exp->master_label;
>     >>     >     > +            nc->master_key = alg_exp->master_key;
>     >>     >     > +        }
>     >>     >     > +
>     >>     >     >          if (nat_action_info) {
>     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     >>     >     > -
>     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     >>     >     > -                                                  conn_for_un_nat_copy);
>     >>     >     >
>     >>     >     > -            if (!nat_res) {
>     >>     >     > -                free(nc->nat_info);
>     >>     >     > -                nc->nat_info = NULL;
>     >>     >     > -                free (nc);
>     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     >>     >     > -                return NULL;
>     >>     >     > -            }
>     >>     >     > +            if (alg_exp) {
>     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >>     >     > +                *conn_for_un_nat_copy = *nc;
>     >>     >     > +            } else {
>     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     >>     >     > +                bool nat_res = nat_select_range_tuple(
>     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>     >>     >     > +
>     >>     >     > +                if (!nat_res) {
>     >>     >     > +                    free(nc->nat_info);
>     >>     >     > +                    nc->nat_info = NULL;
>     >>     >     > +                    free (nc);
>     >>     >
>     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>     >>     >     delete_conn()?
>     >>     >
>     >>     > Good
>     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>     >>     > here, as everywhere.
>     >>
>     >>     OK.
>     >>
>     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     >>     >     > +                    return NULL;
>     >>     >     > +                }
>     >>     >     >
>     >>     >     > -            if (conn_for_un_nat_copy &&
>     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >>     >     >                  *nc = *conn_for_un_nat_copy;
>     >>     >
>     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>     >>     >
>     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>     >>
>     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>     >>
>     >> I see your question now.
>     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
>     >> Only nc needs them and the un_nat copy ptrs are nulled.
>     >> There is only one allocation set.
>     >
>     > Hmm. Maybe I'm just missing something, let me walk through it step by
>     > step below and let's see where it goes.
>     >
>     >        if (helper) {
>     >            nc->alg = xstrdup(helper);
>     > ^ nc->alg is set
>     >        }
>     >
>     >        if (alg_exp) {
>     > ^ false; do not execute this block
>     >            nc->alg_related = true;
>     >            nc->mark = alg_exp->master_mark;
>     >            nc->label = alg_exp->master_label;
>     >            nc->master_key = alg_exp->master_key;
>     >        }
>     >
>     >        if (nat_action_info) {
>     > ^ true, execute this part
>     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >
>     >            if (alg_exp) {
>     > ^ false; skip to else
>     >                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >                *conn_for_un_nat_copy = *nc;
>     >            } else {
>     > ^ We go through this condition
>     >                ct_rwlock_wrlock(&ct->resources_lock);
>     >                bool nat_res = nat_select_range_tuple(
>     >                                   ct, nc, conn_for_un_nat_copy);
>     >
>     >                if (!nat_res) {
>     > ^ false; do not execute this block
>     >                    free(nc->nat_info);
>     >                    nc->nat_info = NULL;
>     >                    free (nc);
>     >                    ct_rwlock_unlock(&ct->resources_lock);
>     >                    return NULL;
>     >                }
>     >
>     >                *nc = *conn_for_un_nat_copy;
>     > ^ Now:
>     > nc->alg is overwritten by conn_for_un_nat_copy->alg
>     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
>     >
>     > We don't free either of these.
>
>     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
>     nested inside nat_select_range_tuple() is very well hidden. This means
>     that the above is not a problem... but what if (!nat_res) ? Then
>     conn_for_un_nat_copy() has a reference to these alg/nat_info
>     parameters which are freed from 'nc' inside that block, then
>     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
>     then?
>
> Nope, because there is no un_nat conn.

So you mean that dangling references are returned inside
conn_for_un_nat_copy but they're just not used?
Darrell Ball June 27, 2017, 5:42 p.m. UTC | #9
On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:

    On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
    >
    >
    > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
    >
    >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
    >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
    >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
    >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >     >>     >     >          nc->rev_key = nc->key;
    >     >>     >     >          conn_key_reverse(&nc->rev_key);
    >     >>     >     >
    >     >>     >     > +        if (helper) {
    >     >>     >     > +            nc->alg = xstrdup(helper);
    >     >>     >     > +        }
    >     >>     >     > +
    >     >>     >     > +        if (alg_exp) {
    >     >>     >     > +            nc->alg_related = true;
    >     >>     >     > +            nc->mark = alg_exp->master_mark;
    >     >>     >     > +            nc->label = alg_exp->master_label;
    >     >>     >     > +            nc->master_key = alg_exp->master_key;
    >     >>     >     > +        }
    >     >>     >     > +
    >     >>     >     >          if (nat_action_info) {
    >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >     >>     >     > -
    >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
    >     >>     >     > -                                                  conn_for_un_nat_copy);
    >     >>     >     >
    >     >>     >     > -            if (!nat_res) {
    >     >>     >     > -                free(nc->nat_info);
    >     >>     >     > -                nc->nat_info = NULL;
    >     >>     >     > -                free (nc);
    >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
    >     >>     >     > -                return NULL;
    >     >>     >     > -            }
    >     >>     >     > +            if (alg_exp) {
    >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >>     >     > +                *conn_for_un_nat_copy = *nc;
    >     >>     >     > +            } else {
    >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
    >     >>     >     > +                bool nat_res = nat_select_range_tuple(
    >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
    >     >>     >     > +
    >     >>     >     > +                if (!nat_res) {
    >     >>     >     > +                    free(nc->nat_info);
    >     >>     >     > +                    nc->nat_info = NULL;
    >     >>     >     > +                    free (nc);
    >     >>     >
    >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
    >     >>     >     delete_conn()?
    >     >>     >
    >     >>     > Good
    >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
    >     >>     > here, as everywhere.
    >     >>
    >     >>     OK.
    >     >>
    >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
    >     >>     >     > +                    return NULL;
    >     >>     >     > +                }
    >     >>     >     >
    >     >>     >     > -            if (conn_for_un_nat_copy &&
    >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    >     >>     >     >                  *nc = *conn_for_un_nat_copy;
    >     >>     >
    >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
    >     >>     >
    >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
    >     >>
    >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
    >     >>
    >     >> I see your question now.
    >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
    >     >> Only nc needs them and the un_nat copy ptrs are nulled.
    >     >> There is only one allocation set.
    >     >
    >     > Hmm. Maybe I'm just missing something, let me walk through it step by
    >     > step below and let's see where it goes.
    >     >
    >     >        if (helper) {
    >     >            nc->alg = xstrdup(helper);
    >     > ^ nc->alg is set
    >     >        }
    >     >
    >     >        if (alg_exp) {
    >     > ^ false; do not execute this block
    >     >            nc->alg_related = true;
    >     >            nc->mark = alg_exp->master_mark;
    >     >            nc->label = alg_exp->master_label;
    >     >            nc->master_key = alg_exp->master_key;
    >     >        }
    >     >
    >     >        if (nat_action_info) {
    >     > ^ true, execute this part
    >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >
    >     >            if (alg_exp) {
    >     > ^ false; skip to else
    >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >                *conn_for_un_nat_copy = *nc;
    >     >            } else {
    >     > ^ We go through this condition
    >     >                ct_rwlock_wrlock(&ct->resources_lock);
    >     >                bool nat_res = nat_select_range_tuple(
    >     >                                   ct, nc, conn_for_un_nat_copy);
    >     >
    >     >                if (!nat_res) {
    >     > ^ false; do not execute this block
    >     >                    free(nc->nat_info);
    >     >                    nc->nat_info = NULL;
    >     >                    free (nc);
    >     >                    ct_rwlock_unlock(&ct->resources_lock);
    >     >                    return NULL;
    >     >                }
    >     >
    >     >                *nc = *conn_for_un_nat_copy;
    >     > ^ Now:
    >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
    >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
    >     >
    >     > We don't free either of these.
    >
    >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
    >     nested inside nat_select_range_tuple() is very well hidden. This means
    >     that the above is not a problem... but what if (!nat_res) ? Then
    >     conn_for_un_nat_copy() has a reference to these alg/nat_info
    >     parameters which are freed from 'nc' inside that block, then
    >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
    >     then?
    >
    > Nope, because there is no un_nat conn.
    
    So you mean that dangling references are returned inside
    conn_for_un_nat_copy but they're just not used?

JTBC, nothing can be used, since there is not even a connection.
Joe Stringer June 27, 2017, 5:47 p.m. UTC | #10
On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
>
>
> On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
>
>     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
>     >
>     >
>     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
>     >
>     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
>     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
>     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >     >>     >     >          nc->rev_key = nc->key;
>     >     >>     >     >          conn_key_reverse(&nc->rev_key);
>     >     >>     >     >
>     >     >>     >     > +        if (helper) {
>     >     >>     >     > +            nc->alg = xstrdup(helper);
>     >     >>     >     > +        }
>     >     >>     >     > +
>     >     >>     >     > +        if (alg_exp) {
>     >     >>     >     > +            nc->alg_related = true;
>     >     >>     >     > +            nc->mark = alg_exp->master_mark;
>     >     >>     >     > +            nc->label = alg_exp->master_label;
>     >     >>     >     > +            nc->master_key = alg_exp->master_key;
>     >     >>     >     > +        }
>     >     >>     >     > +
>     >     >>     >     >          if (nat_action_info) {
>     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     >     >>     >     > -
>     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     >     >>     >     > -                                                  conn_for_un_nat_copy);
>     >     >>     >     >
>     >     >>     >     > -            if (!nat_res) {
>     >     >>     >     > -                free(nc->nat_info);
>     >     >>     >     > -                nc->nat_info = NULL;
>     >     >>     >     > -                free (nc);
>     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     >     >>     >     > -                return NULL;
>     >     >>     >     > -            }
>     >     >>     >     > +            if (alg_exp) {
>     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
>     >     >>     >     > +            } else {
>     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
>     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>     >     >>     >     > +
>     >     >>     >     > +                if (!nat_res) {
>     >     >>     >     > +                    free(nc->nat_info);
>     >     >>     >     > +                    nc->nat_info = NULL;
>     >     >>     >     > +                    free (nc);
>     >     >>     >
>     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>     >     >>     >     delete_conn()?
>     >     >>     >
>     >     >>     > Good
>     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>     >     >>     > here, as everywhere.
>     >     >>
>     >     >>     OK.
>     >     >>
>     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >>     >     > +                    return NULL;
>     >     >>     >     > +                }
>     >     >>     >     >
>     >     >>     >     > -            if (conn_for_un_nat_copy &&
>     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
>     >     >>     >
>     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>     >     >>     >
>     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>     >     >>
>     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>     >     >>
>     >     >> I see your question now.
>     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
>     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
>     >     >> There is only one allocation set.
>     >     >
>     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
>     >     > step below and let's see where it goes.
>     >     >
>     >     >        if (helper) {
>     >     >            nc->alg = xstrdup(helper);
>     >     > ^ nc->alg is set
>     >     >        }
>     >     >
>     >     >        if (alg_exp) {
>     >     > ^ false; do not execute this block
>     >     >            nc->alg_related = true;
>     >     >            nc->mark = alg_exp->master_mark;
>     >     >            nc->label = alg_exp->master_label;
>     >     >            nc->master_key = alg_exp->master_key;
>     >     >        }
>     >     >
>     >     >        if (nat_action_info) {
>     >     > ^ true, execute this part
>     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >
>     >     >            if (alg_exp) {
>     >     > ^ false; skip to else
>     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >                *conn_for_un_nat_copy = *nc;
>     >     >            } else {
>     >     > ^ We go through this condition
>     >     >                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >                bool nat_res = nat_select_range_tuple(
>     >     >                                   ct, nc, conn_for_un_nat_copy);
>     >     >
>     >     >                if (!nat_res) {
>     >     > ^ false; do not execute this block
>     >     >                    free(nc->nat_info);
>     >     >                    nc->nat_info = NULL;
>     >     >                    free (nc);
>     >     >                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >                    return NULL;
>     >     >                }
>     >     >
>     >     >                *nc = *conn_for_un_nat_copy;
>     >     > ^ Now:
>     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
>     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
>     >     >
>     >     > We don't free either of these.
>     >
>     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
>     >     nested inside nat_select_range_tuple() is very well hidden. This means
>     >     that the above is not a problem... but what if (!nat_res) ? Then
>     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
>     >     parameters which are freed from 'nc' inside that block, then
>     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
>     >     then?
>     >
>     > Nope, because there is no un_nat conn.
>
>     So you mean that dangling references are returned inside
>     conn_for_un_nat_copy but they're just not used?
>
> JTBC, nothing can be used, since there is not even a connection.

So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
*conn;" line which updates conn_for_un_nat_copy, then
nat_select_range_tuple() fails to select a tuple and returns false,
then the if (!nat_res) cleanup / return NULL path is called, then what
cleans up the dangling pointers that are now in conn_for_un_nat_copy?
Darrell Ball June 27, 2017, 5:51 p.m. UTC | #11
On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:

    On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
    >
    >
    > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >
    >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
    >     >
    >     >
    >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >
    >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
    >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
    >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
    >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >     >     >>     >     >          nc->rev_key = nc->key;
    >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
    >     >     >>     >     >
    >     >     >>     >     > +        if (helper) {
    >     >     >>     >     > +            nc->alg = xstrdup(helper);
    >     >     >>     >     > +        }
    >     >     >>     >     > +
    >     >     >>     >     > +        if (alg_exp) {
    >     >     >>     >     > +            nc->alg_related = true;
    >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
    >     >     >>     >     > +            nc->label = alg_exp->master_label;
    >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
    >     >     >>     >     > +        }
    >     >     >>     >     > +
    >     >     >>     >     >          if (nat_action_info) {
    >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >     >     >>     >     > -
    >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
    >     >     >>     >     > -                                                  conn_for_un_nat_copy);
    >     >     >>     >     >
    >     >     >>     >     > -            if (!nat_res) {
    >     >     >>     >     > -                free(nc->nat_info);
    >     >     >>     >     > -                nc->nat_info = NULL;
    >     >     >>     >     > -                free (nc);
    >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
    >     >     >>     >     > -                return NULL;
    >     >     >>     >     > -            }
    >     >     >>     >     > +            if (alg_exp) {
    >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
    >     >     >>     >     > +            } else {
    >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
    >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
    >     >     >>     >     > +
    >     >     >>     >     > +                if (!nat_res) {
    >     >     >>     >     > +                    free(nc->nat_info);
    >     >     >>     >     > +                    nc->nat_info = NULL;
    >     >     >>     >     > +                    free (nc);
    >     >     >>     >
    >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
    >     >     >>     >     delete_conn()?
    >     >     >>     >
    >     >     >>     > Good
    >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
    >     >     >>     > here, as everywhere.
    >     >     >>
    >     >     >>     OK.
    >     >     >>
    >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >>     >     > +                    return NULL;
    >     >     >>     >     > +                }
    >     >     >>     >     >
    >     >     >>     >     > -            if (conn_for_un_nat_copy &&
    >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
    >     >     >>     >
    >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
    >     >     >>     >
    >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
    >     >     >>
    >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
    >     >     >>
    >     >     >> I see your question now.
    >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
    >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
    >     >     >> There is only one allocation set.
    >     >     >
    >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
    >     >     > step below and let's see where it goes.
    >     >     >
    >     >     >        if (helper) {
    >     >     >            nc->alg = xstrdup(helper);
    >     >     > ^ nc->alg is set
    >     >     >        }
    >     >     >
    >     >     >        if (alg_exp) {
    >     >     > ^ false; do not execute this block
    >     >     >            nc->alg_related = true;
    >     >     >            nc->mark = alg_exp->master_mark;
    >     >     >            nc->label = alg_exp->master_label;
    >     >     >            nc->master_key = alg_exp->master_key;
    >     >     >        }
    >     >     >
    >     >     >        if (nat_action_info) {
    >     >     > ^ true, execute this part
    >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >
    >     >     >            if (alg_exp) {
    >     >     > ^ false; skip to else
    >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >                *conn_for_un_nat_copy = *nc;
    >     >     >            } else {
    >     >     > ^ We go through this condition
    >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >                bool nat_res = nat_select_range_tuple(
    >     >     >                                   ct, nc, conn_for_un_nat_copy);
    >     >     >
    >     >     >                if (!nat_res) {
    >     >     > ^ false; do not execute this block
    >     >     >                    free(nc->nat_info);
    >     >     >                    nc->nat_info = NULL;
    >     >     >                    free (nc);
    >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >                    return NULL;
    >     >     >                }
    >     >     >
    >     >     >                *nc = *conn_for_un_nat_copy;
    >     >     > ^ Now:
    >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
    >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
    >     >     >
    >     >     > We don't free either of these.
    >     >
    >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
    >     >     nested inside nat_select_range_tuple() is very well hidden. This means
    >     >     that the above is not a problem... but what if (!nat_res) ? Then
    >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
    >     >     parameters which are freed from 'nc' inside that block, then
    >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
    >     >     then?
    >     >
    >     > Nope, because there is no un_nat conn.
    >
    >     So you mean that dangling references are returned inside
    >     conn_for_un_nat_copy but they're just not used?
    >
    > JTBC, nothing can be used, since there is not even a connection.
    
    So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
    *conn;" line which updates conn_for_un_nat_copy, then
    nat_select_range_tuple() fails to select a tuple and returns false,
    then the if (!nat_res) cleanup / return NULL path is called, then what
    cleans up the dangling pointers that are now in conn_for_un_nat_copy?

conn_for_un_nat_copy is a local variable in the caller
Joe Stringer June 27, 2017, 6:04 p.m. UTC | #12
On 27 June 2017 at 10:51, Darrell Ball <dball@vmware.com> wrote:
>
>
> On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:
>
>     On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
>     >
>     >
>     > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >
>     >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
>     >     >
>     >     >
>     >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >
>     >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
>     >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>     >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >     >     >>     >     >          nc->rev_key = nc->key;
>     >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
>     >     >     >>     >     >
>     >     >     >>     >     > +        if (helper) {
>     >     >     >>     >     > +            nc->alg = xstrdup(helper);
>     >     >     >>     >     > +        }
>     >     >     >>     >     > +
>     >     >     >>     >     > +        if (alg_exp) {
>     >     >     >>     >     > +            nc->alg_related = true;
>     >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
>     >     >     >>     >     > +            nc->label = alg_exp->master_label;
>     >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
>     >     >     >>     >     > +        }
>     >     >     >>     >     > +
>     >     >     >>     >     >          if (nat_action_info) {
>     >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     >     >     >>     >     > -
>     >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     >     >     >>     >     > -                                                  conn_for_un_nat_copy);
>     >     >     >>     >     >
>     >     >     >>     >     > -            if (!nat_res) {
>     >     >     >>     >     > -                free(nc->nat_info);
>     >     >     >>     >     > -                nc->nat_info = NULL;
>     >     >     >>     >     > -                free (nc);
>     >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     >     >     >>     >     > -                return NULL;
>     >     >     >>     >     > -            }
>     >     >     >>     >     > +            if (alg_exp) {
>     >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
>     >     >     >>     >     > +            } else {
>     >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
>     >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>     >     >     >>     >     > +
>     >     >     >>     >     > +                if (!nat_res) {
>     >     >     >>     >     > +                    free(nc->nat_info);
>     >     >     >>     >     > +                    nc->nat_info = NULL;
>     >     >     >>     >     > +                    free (nc);
>     >     >     >>     >
>     >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>     >     >     >>     >     delete_conn()?
>     >     >     >>     >
>     >     >     >>     > Good
>     >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>     >     >     >>     > here, as everywhere.
>     >     >     >>
>     >     >     >>     OK.
>     >     >     >>
>     >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >     >>     >     > +                    return NULL;
>     >     >     >>     >     > +                }
>     >     >     >>     >     >
>     >     >     >>     >     > -            if (conn_for_un_nat_copy &&
>     >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
>     >     >     >>     >
>     >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>     >     >     >>     >
>     >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>     >     >     >>
>     >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>     >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>     >     >     >>
>     >     >     >> I see your question now.
>     >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
>     >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
>     >     >     >> There is only one allocation set.
>     >     >     >
>     >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
>     >     >     > step below and let's see where it goes.
>     >     >     >
>     >     >     >        if (helper) {
>     >     >     >            nc->alg = xstrdup(helper);
>     >     >     > ^ nc->alg is set
>     >     >     >        }
>     >     >     >
>     >     >     >        if (alg_exp) {
>     >     >     > ^ false; do not execute this block
>     >     >     >            nc->alg_related = true;
>     >     >     >            nc->mark = alg_exp->master_mark;
>     >     >     >            nc->label = alg_exp->master_label;
>     >     >     >            nc->master_key = alg_exp->master_key;
>     >     >     >        }
>     >     >     >
>     >     >     >        if (nat_action_info) {
>     >     >     > ^ true, execute this part
>     >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >     >
>     >     >     >            if (alg_exp) {
>     >     >     > ^ false; skip to else
>     >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >     >                *conn_for_un_nat_copy = *nc;
>     >     >     >            } else {
>     >     >     > ^ We go through this condition
>     >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >     >                bool nat_res = nat_select_range_tuple(
>     >     >     >                                   ct, nc, conn_for_un_nat_copy);
>     >     >     >
>     >     >     >                if (!nat_res) {
>     >     >     > ^ false; do not execute this block
>     >     >     >                    free(nc->nat_info);
>     >     >     >                    nc->nat_info = NULL;
>     >     >     >                    free (nc);
>     >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >     >                    return NULL;
>     >     >     >                }
>     >     >     >
>     >     >     >                *nc = *conn_for_un_nat_copy;
>     >     >     > ^ Now:
>     >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
>     >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
>     >     >     >
>     >     >     > We don't free either of these.
>     >     >
>     >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
>     >     >     nested inside nat_select_range_tuple() is very well hidden. This means
>     >     >     that the above is not a problem... but what if (!nat_res) ? Then
>     >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
>     >     >     parameters which are freed from 'nc' inside that block, then
>     >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
>     >     >     then?
>     >     >
>     >     > Nope, because there is no un_nat conn.
>     >
>     >     So you mean that dangling references are returned inside
>     >     conn_for_un_nat_copy but they're just not used?
>     >
>     > JTBC, nothing can be used, since there is not even a connection.
>
>     So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
>     *conn;" line which updates conn_for_un_nat_copy, then
>     nat_select_range_tuple() fails to select a tuple and returns false,
>     then the if (!nat_res) cleanup / return NULL path is called, then what
>     cleans up the dangling pointers that are now in conn_for_un_nat_copy?
>
> conn_for_un_nat_copy is a local variable in the caller

Which is then potentially passed to another function,
create_un_nat_conn(), which then does an xmemdup() to propagate those
dangling pointers further.. seems like it would be safest to clear out
the conn_for_un_nat_copy->alg and ->nat_info pointers before freeing
'nc' in that path.
Darrell Ball June 27, 2017, 6:19 p.m. UTC | #13
On 6/27/17, 11:04 AM, "Joe Stringer" <joe@ovn.org> wrote:

    On 27 June 2017 at 10:51, Darrell Ball <dball@vmware.com> wrote:
    >
    >
    > On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >
    >     On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
    >     >
    >     >
    >     > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >
    >     >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
    >     >     >
    >     >     >
    >     >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >
    >     >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
    >     >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
    >     >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    >     >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >     >     >     >>     >     >          nc->rev_key = nc->key;
    >     >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
    >     >     >     >>     >     >
    >     >     >     >>     >     > +        if (helper) {
    >     >     >     >>     >     > +            nc->alg = xstrdup(helper);
    >     >     >     >>     >     > +        }
    >     >     >     >>     >     > +
    >     >     >     >>     >     > +        if (alg_exp) {
    >     >     >     >>     >     > +            nc->alg_related = true;
    >     >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
    >     >     >     >>     >     > +            nc->label = alg_exp->master_label;
    >     >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
    >     >     >     >>     >     > +        }
    >     >     >     >>     >     > +
    >     >     >     >>     >     >          if (nat_action_info) {
    >     >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >     >     >     >>     >     > -
    >     >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
    >     >     >     >>     >     > -                                                  conn_for_un_nat_copy);
    >     >     >     >>     >     >
    >     >     >     >>     >     > -            if (!nat_res) {
    >     >     >     >>     >     > -                free(nc->nat_info);
    >     >     >     >>     >     > -                nc->nat_info = NULL;
    >     >     >     >>     >     > -                free (nc);
    >     >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
    >     >     >     >>     >     > -                return NULL;
    >     >     >     >>     >     > -            }
    >     >     >     >>     >     > +            if (alg_exp) {
    >     >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
    >     >     >     >>     >     > +            } else {
    >     >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
    >     >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
    >     >     >     >>     >     > +
    >     >     >     >>     >     > +                if (!nat_res) {
    >     >     >     >>     >     > +                    free(nc->nat_info);
    >     >     >     >>     >     > +                    nc->nat_info = NULL;
    >     >     >     >>     >     > +                    free (nc);
    >     >     >     >>     >
    >     >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
    >     >     >     >>     >     delete_conn()?
    >     >     >     >>     >
    >     >     >     >>     > Good
    >     >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
    >     >     >     >>     > here, as everywhere.
    >     >     >     >>
    >     >     >     >>     OK.
    >     >     >     >>
    >     >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >     >>     >     > +                    return NULL;
    >     >     >     >>     >     > +                }
    >     >     >     >>     >     >
    >     >     >     >>     >     > -            if (conn_for_un_nat_copy &&
    >     >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    >     >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
    >     >     >     >>     >
    >     >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
    >     >     >     >>     >
    >     >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
    >     >     >     >>
    >     >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    >     >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
    >     >     >     >>
    >     >     >     >> I see your question now.
    >     >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
    >     >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
    >     >     >     >> There is only one allocation set.
    >     >     >     >
    >     >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
    >     >     >     > step below and let's see where it goes.
    >     >     >     >
    >     >     >     >        if (helper) {
    >     >     >     >            nc->alg = xstrdup(helper);
    >     >     >     > ^ nc->alg is set
    >     >     >     >        }
    >     >     >     >
    >     >     >     >        if (alg_exp) {
    >     >     >     > ^ false; do not execute this block
    >     >     >     >            nc->alg_related = true;
    >     >     >     >            nc->mark = alg_exp->master_mark;
    >     >     >     >            nc->label = alg_exp->master_label;
    >     >     >     >            nc->master_key = alg_exp->master_key;
    >     >     >     >        }
    >     >     >     >
    >     >     >     >        if (nat_action_info) {
    >     >     >     > ^ true, execute this part
    >     >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >     >
    >     >     >     >            if (alg_exp) {
    >     >     >     > ^ false; skip to else
    >     >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >     >                *conn_for_un_nat_copy = *nc;
    >     >     >     >            } else {
    >     >     >     > ^ We go through this condition
    >     >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >     >                bool nat_res = nat_select_range_tuple(
    >     >     >     >                                   ct, nc, conn_for_un_nat_copy);
    >     >     >     >
    >     >     >     >                if (!nat_res) {
    >     >     >     > ^ false; do not execute this block
    >     >     >     >                    free(nc->nat_info);
    >     >     >     >                    nc->nat_info = NULL;
    >     >     >     >                    free (nc);
    >     >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >     >                    return NULL;
    >     >     >     >                }
    >     >     >     >
    >     >     >     >                *nc = *conn_for_un_nat_copy;
    >     >     >     > ^ Now:
    >     >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
    >     >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
    >     >     >     >
    >     >     >     > We don't free either of these.
    >     >     >
    >     >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
    >     >     >     nested inside nat_select_range_tuple() is very well hidden. This means
    >     >     >     that the above is not a problem... but what if (!nat_res) ? Then
    >     >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
    >     >     >     parameters which are freed from 'nc' inside that block, then
    >     >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
    >     >     >     then?
    >     >     >
    >     >     > Nope, because there is no un_nat conn.
    >     >
    >     >     So you mean that dangling references are returned inside
    >     >     conn_for_un_nat_copy but they're just not used?
    >     >
    >     > JTBC, nothing can be used, since there is not even a connection.
    >
    >     So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
    >     *conn;" line which updates conn_for_un_nat_copy, then
    >     nat_select_range_tuple() fails to select a tuple and returns false,
    >     then the if (!nat_res) cleanup / return NULL path is called, then what
    >     cleans up the dangling pointers that are now in conn_for_un_nat_copy?
    >
    > conn_for_un_nat_copy is a local variable in the caller
    
    Which is then potentially passed to another function,
    create_un_nat_conn(), which then does an xmemdup() to propagate those
    dangling pointers further.. seems like it would be safest to clear out
    the conn_for_un_nat_copy->alg and ->nat_info pointers before freeing
    'nc' in that path.

No, as I stated already, since the un_nat creation (and also conn) never completed
the conntype is not set and create_un_nat_conn() is not called.
Joe Stringer June 27, 2017, 8:20 p.m. UTC | #14
On 27 June 2017 at 11:19, Darrell Ball <dball@vmware.com> wrote:
>
>
> On 6/27/17, 11:04 AM, "Joe Stringer" <joe@ovn.org> wrote:
>
>     On 27 June 2017 at 10:51, Darrell Ball <dball@vmware.com> wrote:
>     >
>     >
>     > On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >
>     >     On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
>     >     >
>     >     >
>     >     > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >
>     >     >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >
>     >     >     >
>     >     >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >     >
>     >     >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
>     >     >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>     >     >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     >     >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >     >     >     >>     >     >          nc->rev_key = nc->key;
>     >     >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
>     >     >     >     >>     >     >
>     >     >     >     >>     >     > +        if (helper) {
>     >     >     >     >>     >     > +            nc->alg = xstrdup(helper);
>     >     >     >     >>     >     > +        }
>     >     >     >     >>     >     > +
>     >     >     >     >>     >     > +        if (alg_exp) {
>     >     >     >     >>     >     > +            nc->alg_related = true;
>     >     >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
>     >     >     >     >>     >     > +            nc->label = alg_exp->master_label;
>     >     >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
>     >     >     >     >>     >     > +        }
>     >     >     >     >>     >     > +
>     >     >     >     >>     >     >          if (nat_action_info) {
>     >     >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     >     >     >     >>     >     > -
>     >     >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     >     >     >     >>     >     > -                                                  conn_for_un_nat_copy);
>     >     >     >     >>     >     >
>     >     >     >     >>     >     > -            if (!nat_res) {
>     >     >     >     >>     >     > -                free(nc->nat_info);
>     >     >     >     >>     >     > -                nc->nat_info = NULL;
>     >     >     >     >>     >     > -                free (nc);
>     >     >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     >     >     >     >>     >     > -                return NULL;
>     >     >     >     >>     >     > -            }
>     >     >     >     >>     >     > +            if (alg_exp) {
>     >     >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
>     >     >     >     >>     >     > +            } else {
>     >     >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
>     >     >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>     >     >     >     >>     >     > +
>     >     >     >     >>     >     > +                if (!nat_res) {
>     >     >     >     >>     >     > +                    free(nc->nat_info);
>     >     >     >     >>     >     > +                    nc->nat_info = NULL;
>     >     >     >     >>     >     > +                    free (nc);
>     >     >     >     >>     >
>     >     >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>     >     >     >     >>     >     delete_conn()?
>     >     >     >     >>     >
>     >     >     >     >>     > Good
>     >     >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>     >     >     >     >>     > here, as everywhere.
>     >     >     >     >>
>     >     >     >     >>     OK.
>     >     >     >     >>
>     >     >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >     >     >>     >     > +                    return NULL;
>     >     >     >     >>     >     > +                }
>     >     >     >     >>     >     >
>     >     >     >     >>     >     > -            if (conn_for_un_nat_copy &&
>     >     >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >     >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
>     >     >     >     >>     >
>     >     >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>     >     >     >     >>     >
>     >     >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>     >     >     >     >>
>     >     >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>     >     >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>     >     >     >     >>
>     >     >     >     >> I see your question now.
>     >     >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
>     >     >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
>     >     >     >     >> There is only one allocation set.
>     >     >     >     >
>     >     >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
>     >     >     >     > step below and let's see where it goes.
>     >     >     >     >
>     >     >     >     >        if (helper) {
>     >     >     >     >            nc->alg = xstrdup(helper);
>     >     >     >     > ^ nc->alg is set
>     >     >     >     >        }
>     >     >     >     >
>     >     >     >     >        if (alg_exp) {
>     >     >     >     > ^ false; do not execute this block
>     >     >     >     >            nc->alg_related = true;
>     >     >     >     >            nc->mark = alg_exp->master_mark;
>     >     >     >     >            nc->label = alg_exp->master_label;
>     >     >     >     >            nc->master_key = alg_exp->master_key;
>     >     >     >     >        }
>     >     >     >     >
>     >     >     >     >        if (nat_action_info) {
>     >     >     >     > ^ true, execute this part
>     >     >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >     >     >
>     >     >     >     >            if (alg_exp) {
>     >     >     >     > ^ false; skip to else
>     >     >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >     >     >                *conn_for_un_nat_copy = *nc;
>     >     >     >     >            } else {
>     >     >     >     > ^ We go through this condition
>     >     >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >     >     >                bool nat_res = nat_select_range_tuple(
>     >     >     >     >                                   ct, nc, conn_for_un_nat_copy);
>     >     >     >     >
>     >     >     >     >                if (!nat_res) {
>     >     >     >     > ^ false; do not execute this block
>     >     >     >     >                    free(nc->nat_info);
>     >     >     >     >                    nc->nat_info = NULL;
>     >     >     >     >                    free (nc);
>     >     >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >     >     >                    return NULL;
>     >     >     >     >                }
>     >     >     >     >
>     >     >     >     >                *nc = *conn_for_un_nat_copy;
>     >     >     >     > ^ Now:
>     >     >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
>     >     >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
>     >     >     >     >
>     >     >     >     > We don't free either of these.
>     >     >     >
>     >     >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
>     >     >     >     nested inside nat_select_range_tuple() is very well hidden. This means
>     >     >     >     that the above is not a problem... but what if (!nat_res) ? Then
>     >     >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
>     >     >     >     parameters which are freed from 'nc' inside that block, then
>     >     >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
>     >     >     >     then?
>     >     >     >
>     >     >     > Nope, because there is no un_nat conn.
>     >     >
>     >     >     So you mean that dangling references are returned inside
>     >     >     conn_for_un_nat_copy but they're just not used?
>     >     >
>     >     > JTBC, nothing can be used, since there is not even a connection.
>     >
>     >     So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
>     >     *conn;" line which updates conn_for_un_nat_copy, then
>     >     nat_select_range_tuple() fails to select a tuple and returns false,
>     >     then the if (!nat_res) cleanup / return NULL path is called, then what
>     >     cleans up the dangling pointers that are now in conn_for_un_nat_copy?
>     >
>     > conn_for_un_nat_copy is a local variable in the caller
>
>     Which is then potentially passed to another function,
>     create_un_nat_conn(), which then does an xmemdup() to propagate those
>     dangling pointers further.. seems like it would be safest to clear out
>     the conn_for_un_nat_copy->alg and ->nat_info pointers before freeing
>     'nc' in that path.
>
> No, as I stated already, since the un_nat creation (and also conn) never completed
> the conntype is not set and create_un_nat_conn() is not called.

OK, now I see what you mean.

I think that clearing these pointers in that error path would make the
code more explicit about the status of one of the parameters modified
by this function, and would limit side-effects. I don't see a negative
to this proposed change.
Darrell Ball June 27, 2017, 8:40 p.m. UTC | #15
On 6/27/17, 1:20 PM, "Joe Stringer" <joe@ovn.org> wrote:

    On 27 June 2017 at 11:19, Darrell Ball <dball@vmware.com> wrote:
    >
    >
    > On 6/27/17, 11:04 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >
    >     On 27 June 2017 at 10:51, Darrell Ball <dball@vmware.com> wrote:
    >     >
    >     >
    >     > On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >
    >     >     On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
    >     >     >
    >     >     >
    >     >     > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >
    >     >     >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >
    >     >     >     >
    >     >     >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >     >
    >     >     >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
    >     >     >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
    >     >     >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    >     >     >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >     >     >     >     >>     >     >          nc->rev_key = nc->key;
    >     >     >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
    >     >     >     >     >>     >     >
    >     >     >     >     >>     >     > +        if (helper) {
    >     >     >     >     >>     >     > +            nc->alg = xstrdup(helper);
    >     >     >     >     >>     >     > +        }
    >     >     >     >     >>     >     > +
    >     >     >     >     >>     >     > +        if (alg_exp) {
    >     >     >     >     >>     >     > +            nc->alg_related = true;
    >     >     >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
    >     >     >     >     >>     >     > +            nc->label = alg_exp->master_label;
    >     >     >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
    >     >     >     >     >>     >     > +        }
    >     >     >     >     >>     >     > +
    >     >     >     >     >>     >     >          if (nat_action_info) {
    >     >     >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >     >     >     >     >>     >     > -
    >     >     >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
    >     >     >     >     >>     >     > -                                                  conn_for_un_nat_copy);
    >     >     >     >     >>     >     >
    >     >     >     >     >>     >     > -            if (!nat_res) {
    >     >     >     >     >>     >     > -                free(nc->nat_info);
    >     >     >     >     >>     >     > -                nc->nat_info = NULL;
    >     >     >     >     >>     >     > -                free (nc);
    >     >     >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
    >     >     >     >     >>     >     > -                return NULL;
    >     >     >     >     >>     >     > -            }
    >     >     >     >     >>     >     > +            if (alg_exp) {
    >     >     >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
    >     >     >     >     >>     >     > +            } else {
    >     >     >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
    >     >     >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
    >     >     >     >     >>     >     > +
    >     >     >     >     >>     >     > +                if (!nat_res) {
    >     >     >     >     >>     >     > +                    free(nc->nat_info);
    >     >     >     >     >>     >     > +                    nc->nat_info = NULL;
    >     >     >     >     >>     >     > +                    free (nc);
    >     >     >     >     >>     >
    >     >     >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
    >     >     >     >     >>     >     delete_conn()?
    >     >     >     >     >>     >
    >     >     >     >     >>     > Good
    >     >     >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
    >     >     >     >     >>     > here, as everywhere.
    >     >     >     >     >>
    >     >     >     >     >>     OK.
    >     >     >     >     >>
    >     >     >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >     >     >>     >     > +                    return NULL;
    >     >     >     >     >>     >     > +                }
    >     >     >     >     >>     >     >
    >     >     >     >     >>     >     > -            if (conn_for_un_nat_copy &&
    >     >     >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    >     >     >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
    >     >     >     >     >>     >
    >     >     >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
    >     >     >     >     >>     >
    >     >     >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
    >     >     >     >     >>
    >     >     >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    >     >     >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
    >     >     >     >     >>
    >     >     >     >     >> I see your question now.
    >     >     >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
    >     >     >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
    >     >     >     >     >> There is only one allocation set.
    >     >     >     >     >
    >     >     >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
    >     >     >     >     > step below and let's see where it goes.
    >     >     >     >     >
    >     >     >     >     >        if (helper) {
    >     >     >     >     >            nc->alg = xstrdup(helper);
    >     >     >     >     > ^ nc->alg is set
    >     >     >     >     >        }
    >     >     >     >     >
    >     >     >     >     >        if (alg_exp) {
    >     >     >     >     > ^ false; do not execute this block
    >     >     >     >     >            nc->alg_related = true;
    >     >     >     >     >            nc->mark = alg_exp->master_mark;
    >     >     >     >     >            nc->label = alg_exp->master_label;
    >     >     >     >     >            nc->master_key = alg_exp->master_key;
    >     >     >     >     >        }
    >     >     >     >     >
    >     >     >     >     >        if (nat_action_info) {
    >     >     >     >     > ^ true, execute this part
    >     >     >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >     >     >
    >     >     >     >     >            if (alg_exp) {
    >     >     >     >     > ^ false; skip to else
    >     >     >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >     >     >                *conn_for_un_nat_copy = *nc;
    >     >     >     >     >            } else {
    >     >     >     >     > ^ We go through this condition
    >     >     >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >     >     >                bool nat_res = nat_select_range_tuple(
    >     >     >     >     >                                   ct, nc, conn_for_un_nat_copy);
    >     >     >     >     >
    >     >     >     >     >                if (!nat_res) {
    >     >     >     >     > ^ false; do not execute this block
    >     >     >     >     >                    free(nc->nat_info);
    >     >     >     >     >                    nc->nat_info = NULL;
    >     >     >     >     >                    free (nc);
    >     >     >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >     >     >                    return NULL;
    >     >     >     >     >                }
    >     >     >     >     >
    >     >     >     >     >                *nc = *conn_for_un_nat_copy;
    >     >     >     >     > ^ Now:
    >     >     >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
    >     >     >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
    >     >     >     >     >
    >     >     >     >     > We don't free either of these.
    >     >     >     >
    >     >     >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
    >     >     >     >     nested inside nat_select_range_tuple() is very well hidden. This means
    >     >     >     >     that the above is not a problem... but what if (!nat_res) ? Then
    >     >     >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
    >     >     >     >     parameters which are freed from 'nc' inside that block, then
    >     >     >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
    >     >     >     >     then?
    >     >     >     >
    >     >     >     > Nope, because there is no un_nat conn.
    >     >     >
    >     >     >     So you mean that dangling references are returned inside
    >     >     >     conn_for_un_nat_copy but they're just not used?
    >     >     >
    >     >     > JTBC, nothing can be used, since there is not even a connection.
    >     >
    >     >     So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
    >     >     *conn;" line which updates conn_for_un_nat_copy, then
    >     >     nat_select_range_tuple() fails to select a tuple and returns false,
    >     >     then the if (!nat_res) cleanup / return NULL path is called, then what
    >     >     cleans up the dangling pointers that are now in conn_for_un_nat_copy?
    >     >
    >     > conn_for_un_nat_copy is a local variable in the caller
    >
    >     Which is then potentially passed to another function,
    >     create_un_nat_conn(), which then does an xmemdup() to propagate those
    >     dangling pointers further.. seems like it would be safest to clear out
    >     the conn_for_un_nat_copy->alg and ->nat_info pointers before freeing
    >     'nc' in that path.
    >
    > No, as I stated already, since the un_nat creation (and also conn) never completed
    > the conntype is not set and create_un_nat_conn() is not called.
    
    OK, now I see what you mean.
    
    I think that clearing these pointers in that error path would make the
    code more explicit about the status of one of the parameters modified
    by this function, and would limit side-effects. I don't see a negative
    to this proposed change.

Adding redundant clearing of pointers in the error path will not hurt
performance, so there is no problem there.
It feels a little weird to clear a couple pointers in a case where the whole connection
context is dead.  It feels like painting your house while it is burning down.
Maybe a comment would serve the same purpose.
Joe Stringer June 27, 2017, 10 p.m. UTC | #16
On 27 June 2017 at 13:40, Darrell Ball <dball@vmware.com> wrote:
>
>
> On 6/27/17, 1:20 PM, "Joe Stringer" <joe@ovn.org> wrote:
>
>     On 27 June 2017 at 11:19, Darrell Ball <dball@vmware.com> wrote:
>     >
>     >
>     > On 6/27/17, 11:04 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >
>     >     On 27 June 2017 at 10:51, Darrell Ball <dball@vmware.com> wrote:
>     >     >
>     >     >
>     >     > On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >
>     >     >     On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >
>     >     >     >
>     >     >     > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >     >
>     >     >     >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >     >
>     >     >     >     >
>     >     >     >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >     >     >
>     >     >     >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
>     >     >     >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
>     >     >     >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
>     >     >     >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
>     >     >     >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
>     >     >     >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>     >     >     >     >     >>     >     >          nc->rev_key = nc->key;
>     >     >     >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
>     >     >     >     >     >>     >     >
>     >     >     >     >     >>     >     > +        if (helper) {
>     >     >     >     >     >>     >     > +            nc->alg = xstrdup(helper);
>     >     >     >     >     >>     >     > +        }
>     >     >     >     >     >>     >     > +
>     >     >     >     >     >>     >     > +        if (alg_exp) {
>     >     >     >     >     >>     >     > +            nc->alg_related = true;
>     >     >     >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
>     >     >     >     >     >>     >     > +            nc->label = alg_exp->master_label;
>     >     >     >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
>     >     >     >     >     >>     >     > +        }
>     >     >     >     >     >>     >     > +
>     >     >     >     >     >>     >     >          if (nat_action_info) {
>     >     >     >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >     >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
>     >     >     >     >     >>     >     > -
>     >     >     >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
>     >     >     >     >     >>     >     > -                                                  conn_for_un_nat_copy);
>     >     >     >     >     >>     >     >
>     >     >     >     >     >>     >     > -            if (!nat_res) {
>     >     >     >     >     >>     >     > -                free(nc->nat_info);
>     >     >     >     >     >>     >     > -                nc->nat_info = NULL;
>     >     >     >     >     >>     >     > -                free (nc);
>     >     >     >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
>     >     >     >     >     >>     >     > -                return NULL;
>     >     >     >     >     >>     >     > -            }
>     >     >     >     >     >>     >     > +            if (alg_exp) {
>     >     >     >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >     >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >     >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
>     >     >     >     >     >>     >     > +            } else {
>     >     >     >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >     >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
>     >     >     >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
>     >     >     >     >     >>     >     > +
>     >     >     >     >     >>     >     > +                if (!nat_res) {
>     >     >     >     >     >>     >     > +                    free(nc->nat_info);
>     >     >     >     >     >>     >     > +                    nc->nat_info = NULL;
>     >     >     >     >     >>     >     > +                    free (nc);
>     >     >     >     >     >>     >
>     >     >     >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
>     >     >     >     >     >>     >     delete_conn()?
>     >     >     >     >     >>     >
>     >     >     >     >     >>     > Good
>     >     >     >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
>     >     >     >     >     >>     > here, as everywhere.
>     >     >     >     >     >>
>     >     >     >     >     >>     OK.
>     >     >     >     >     >>
>     >     >     >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >     >     >     >>     >     > +                    return NULL;
>     >     >     >     >     >>     >     > +                }
>     >     >     >     >     >>     >     >
>     >     >     >     >     >>     >     > -            if (conn_for_un_nat_copy &&
>     >     >     >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>     >     >     >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
>     >     >     >     >     >>     >
>     >     >     >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
>     >     >     >     >     >>     >
>     >     >     >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
>     >     >     >     >     >>
>     >     >     >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
>     >     >     >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
>     >     >     >     >     >>
>     >     >     >     >     >> I see your question now.
>     >     >     >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
>     >     >     >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
>     >     >     >     >     >> There is only one allocation set.
>     >     >     >     >     >
>     >     >     >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
>     >     >     >     >     > step below and let's see where it goes.
>     >     >     >     >     >
>     >     >     >     >     >        if (helper) {
>     >     >     >     >     >            nc->alg = xstrdup(helper);
>     >     >     >     >     > ^ nc->alg is set
>     >     >     >     >     >        }
>     >     >     >     >     >
>     >     >     >     >     >        if (alg_exp) {
>     >     >     >     >     > ^ false; do not execute this block
>     >     >     >     >     >            nc->alg_related = true;
>     >     >     >     >     >            nc->mark = alg_exp->master_mark;
>     >     >     >     >     >            nc->label = alg_exp->master_label;
>     >     >     >     >     >            nc->master_key = alg_exp->master_key;
>     >     >     >     >     >        }
>     >     >     >     >     >
>     >     >     >     >     >        if (nat_action_info) {
>     >     >     >     >     > ^ true, execute this part
>     >     >     >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
>     >     >     >     >     >
>     >     >     >     >     >            if (alg_exp) {
>     >     >     >     >     > ^ false; skip to else
>     >     >     >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
>     >     >     >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
>     >     >     >     >     >                *conn_for_un_nat_copy = *nc;
>     >     >     >     >     >            } else {
>     >     >     >     >     > ^ We go through this condition
>     >     >     >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
>     >     >     >     >     >                bool nat_res = nat_select_range_tuple(
>     >     >     >     >     >                                   ct, nc, conn_for_un_nat_copy);
>     >     >     >     >     >
>     >     >     >     >     >                if (!nat_res) {
>     >     >     >     >     > ^ false; do not execute this block
>     >     >     >     >     >                    free(nc->nat_info);
>     >     >     >     >     >                    nc->nat_info = NULL;
>     >     >     >     >     >                    free (nc);
>     >     >     >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
>     >     >     >     >     >                    return NULL;
>     >     >     >     >     >                }
>     >     >     >     >     >
>     >     >     >     >     >                *nc = *conn_for_un_nat_copy;
>     >     >     >     >     > ^ Now:
>     >     >     >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
>     >     >     >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
>     >     >     >     >     >
>     >     >     >     >     > We don't free either of these.
>     >     >     >     >
>     >     >     >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
>     >     >     >     >     nested inside nat_select_range_tuple() is very well hidden. This means
>     >     >     >     >     that the above is not a problem... but what if (!nat_res) ? Then
>     >     >     >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
>     >     >     >     >     parameters which are freed from 'nc' inside that block, then
>     >     >     >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
>     >     >     >     >     then?
>     >     >     >     >
>     >     >     >     > Nope, because there is no un_nat conn.
>     >     >     >
>     >     >     >     So you mean that dangling references are returned inside
>     >     >     >     conn_for_un_nat_copy but they're just not used?
>     >     >     >
>     >     >     > JTBC, nothing can be used, since there is not even a connection.
>     >     >
>     >     >     So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
>     >     >     *conn;" line which updates conn_for_un_nat_copy, then
>     >     >     nat_select_range_tuple() fails to select a tuple and returns false,
>     >     >     then the if (!nat_res) cleanup / return NULL path is called, then what
>     >     >     cleans up the dangling pointers that are now in conn_for_un_nat_copy?
>     >     >
>     >     > conn_for_un_nat_copy is a local variable in the caller
>     >
>     >     Which is then potentially passed to another function,
>     >     create_un_nat_conn(), which then does an xmemdup() to propagate those
>     >     dangling pointers further.. seems like it would be safest to clear out
>     >     the conn_for_un_nat_copy->alg and ->nat_info pointers before freeing
>     >     'nc' in that path.
>     >
>     > No, as I stated already, since the un_nat creation (and also conn) never completed
>     > the conntype is not set and create_un_nat_conn() is not called.
>
>     OK, now I see what you mean.
>
>     I think that clearing these pointers in that error path would make the
>     code more explicit about the status of one of the parameters modified
>     by this function, and would limit side-effects. I don't see a negative
>     to this proposed change.
>
> Adding redundant clearing of pointers in the error path will not hurt
> performance, so there is no problem there.

Right.

> It feels a little weird to clear a couple pointers in a case where the whole connection
> context is dead.  It feels like painting your house while it is burning down.
> Maybe a comment would serve the same purpose.

Comments could help to some degree, but I think that my core concerns
could be addressed with improvements to the code:
* nat_select_range_tuple() sounds like it will select a range. Not
only does it select the range, but it does full initialization of
conn_for_un_nat_copy as well (via shallow-copy)
* conn_not_found() allocates memory, then nat_select_range_tuple()
shares pointers to that memory around, and yet the pointers don't
share fate when conn_not_found() frees them. This makes it difficult
to read the code and balance alloc/free.
* nc seems to be the owner of these memory allocations, but since
they're propagated into conn_for_un_nat_copy, it's less clear which
conn is supposed to own it.
* The condition where these dangling pointers are shared up to the
caller is not the exact same condition check in the original caller
which prevents usage of those dangling pointers.

A bunch of the above concerns aren't specific to this patch, they
already exist in the upstream codebase and I'm probably just noticing
because it's really new code and it's my first real dive into this
area. But I do think there are improvements to be made in limiting the
scope of potential changes from each function, and localizing things
like initialization, memory alloc/free/ownership, etc. into ideally
the same function.
Darrell Ball June 28, 2017, 11:01 p.m. UTC | #17
On 6/27/17, 3:00 PM, "Joe Stringer" <joe@ovn.org> wrote:

    On 27 June 2017 at 13:40, Darrell Ball <dball@vmware.com> wrote:
    >
    >
    > On 6/27/17, 1:20 PM, "Joe Stringer" <joe@ovn.org> wrote:
    >
    >     On 27 June 2017 at 11:19, Darrell Ball <dball@vmware.com> wrote:
    >     >
    >     >
    >     > On 6/27/17, 11:04 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >
    >     >     On 27 June 2017 at 10:51, Darrell Ball <dball@vmware.com> wrote:
    >     >     >
    >     >     >
    >     >     > On 6/27/17, 10:47 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >
    >     >     >     On 27 June 2017 at 10:42, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >
    >     >     >     >
    >     >     >     > On 6/27/17, 10:40 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >     >
    >     >     >     >     On 26 June 2017 at 18:19, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >     >
    >     >     >     >     >
    >     >     >     >     > On 6/26/17, 4:49 PM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >     >     >
    >     >     >     >     >     On 26 June 2017 at 13:22, Joe Stringer <joe@ovn.org> wrote:
    >     >     >     >     >     > On 26 June 2017 at 11:45, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >     >     >> On 6/26/17, 10:22 AM, "Joe Stringer" <joe@ovn.org> wrote:
    >     >     >     >     >     >>     On 23 June 2017 at 18:57, Darrell Ball <dball@vmware.com> wrote:
    >     >     >     >     >     >>     > On 6/23/17, 4:08 PM, "ovs-dev-bounces@openvswitch.org on behalf of Joe Stringer" <ovs-dev-bounces@openvswitch.org on behalf of joe@ovn.org> wrote:
    >     >     >     >     >     >>     >     On 17 June 2017 at 15:53, Darrell Ball <dlu998@gmail.com> wrote:
    >     >     >     >     >     >>     >     > @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >     >     >     >     >     >>     >     >          nc->rev_key = nc->key;
    >     >     >     >     >     >>     >     >          conn_key_reverse(&nc->rev_key);
    >     >     >     >     >     >>     >     >
    >     >     >     >     >     >>     >     > +        if (helper) {
    >     >     >     >     >     >>     >     > +            nc->alg = xstrdup(helper);
    >     >     >     >     >     >>     >     > +        }
    >     >     >     >     >     >>     >     > +
    >     >     >     >     >     >>     >     > +        if (alg_exp) {
    >     >     >     >     >     >>     >     > +            nc->alg_related = true;
    >     >     >     >     >     >>     >     > +            nc->mark = alg_exp->master_mark;
    >     >     >     >     >     >>     >     > +            nc->label = alg_exp->master_label;
    >     >     >     >     >     >>     >     > +            nc->master_key = alg_exp->master_key;
    >     >     >     >     >     >>     >     > +        }
    >     >     >     >     >     >>     >     > +
    >     >     >     >     >     >>     >     >          if (nat_action_info) {
    >     >     >     >     >     >>     >     >              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >     >     >     >>     >     > -            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >     >     >     >     >     >>     >     > -
    >     >     >     >     >     >>     >     > -            bool nat_res = nat_select_range_tuple(ct, nc,
    >     >     >     >     >     >>     >     > -                                                  conn_for_un_nat_copy);
    >     >     >     >     >     >>     >     >
    >     >     >     >     >     >>     >     > -            if (!nat_res) {
    >     >     >     >     >     >>     >     > -                free(nc->nat_info);
    >     >     >     >     >     >>     >     > -                nc->nat_info = NULL;
    >     >     >     >     >     >>     >     > -                free (nc);
    >     >     >     >     >     >>     >     > -                ct_rwlock_unlock(&ct->nat_resources_lock);
    >     >     >     >     >     >>     >     > -                return NULL;
    >     >     >     >     >     >>     >     > -            }
    >     >     >     >     >     >>     >     > +            if (alg_exp) {
    >     >     >     >     >     >>     >     > +                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >     >     >     >>     >     > +                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >     >     >     >>     >     > +                *conn_for_un_nat_copy = *nc;
    >     >     >     >     >     >>     >     > +            } else {
    >     >     >     >     >     >>     >     > +                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >     >     >     >>     >     > +                bool nat_res = nat_select_range_tuple(
    >     >     >     >     >     >>     >     > +                                   ct, nc, conn_for_un_nat_copy);
    >     >     >     >     >     >>     >     > +
    >     >     >     >     >     >>     >     > +                if (!nat_res) {
    >     >     >     >     >     >>     >     > +                    free(nc->nat_info);
    >     >     >     >     >     >>     >     > +                    nc->nat_info = NULL;
    >     >     >     >     >     >>     >     > +                    free (nc);
    >     >     >     >     >     >>     >
    >     >     >     >     >     >>     >     I think that nc->alg may be leaked here? any reason it doesn't use
    >     >     >     >     >     >>     >     delete_conn()?
    >     >     >     >     >     >>     >
    >     >     >     >     >     >>     > Good
    >     >     >     >     >     >>     > Yes, alg will leak in this rare error case and yes, delete_conn() should be used
    >     >     >     >     >     >>     > here, as everywhere.
    >     >     >     >     >     >>
    >     >     >     >     >     >>     OK.
    >     >     >     >     >     >>
    >     >     >     >     >     >>     >     > +                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >     >     >     >>     >     > +                    return NULL;
    >     >     >     >     >     >>     >     > +                }
    >     >     >     >     >     >>     >     >
    >     >     >     >     >     >>     >     > -            if (conn_for_un_nat_copy &&
    >     >     >     >     >     >>     >     > -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    >     >     >     >     >     >>     >     >                  *nc = *conn_for_un_nat_copy;
    >     >     >     >     >     >>     >
    >     >     >     >     >     >>     >     Perhaps nc->alg and/or nc->nat_info may be leaked here?
    >     >     >     >     >     >>     >
    >     >     >     >     >     >>     > No, the un_nat conn has no such allocations, so there is nothing to leak.
    >     >     >     >     >     >>
    >     >     >     >     >     >>     I don't mean conn_for_un_nat_copy, I mean *nc which could have had an
    >     >     >     >     >     >>     xstrdup()'d 'alg' attached. Won't this overwrite all fields in 'nc'?
    >     >     >     >     >     >>
    >     >     >     >     >     >> I see your question now.
    >     >     >     >     >     >> No, at this point, the copy gets the same pointers to the alg string and nat_info.
    >     >     >     >     >     >> Only nc needs them and the un_nat copy ptrs are nulled.
    >     >     >     >     >     >> There is only one allocation set.
    >     >     >     >     >     >
    >     >     >     >     >     > Hmm. Maybe I'm just missing something, let me walk through it step by
    >     >     >     >     >     > step below and let's see where it goes.
    >     >     >     >     >     >
    >     >     >     >     >     >        if (helper) {
    >     >     >     >     >     >            nc->alg = xstrdup(helper);
    >     >     >     >     >     > ^ nc->alg is set
    >     >     >     >     >     >        }
    >     >     >     >     >     >
    >     >     >     >     >     >        if (alg_exp) {
    >     >     >     >     >     > ^ false; do not execute this block
    >     >     >     >     >     >            nc->alg_related = true;
    >     >     >     >     >     >            nc->mark = alg_exp->master_mark;
    >     >     >     >     >     >            nc->label = alg_exp->master_label;
    >     >     >     >     >     >            nc->master_key = alg_exp->master_key;
    >     >     >     >     >     >        }
    >     >     >     >     >     >
    >     >     >     >     >     >        if (nat_action_info) {
    >     >     >     >     >     > ^ true, execute this part
    >     >     >     >     >     >            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
    >     >     >     >     >     >
    >     >     >     >     >     >            if (alg_exp) {
    >     >     >     >     >     > ^ false; skip to else
    >     >     >     >     >     >                nc->rev_key.src.addr = alg_nat_repl_addr;
    >     >     >     >     >     >                nc->nat_info->nat_action = NAT_ACTION_DST;
    >     >     >     >     >     >                *conn_for_un_nat_copy = *nc;
    >     >     >     >     >     >            } else {
    >     >     >     >     >     > ^ We go through this condition
    >     >     >     >     >     >                ct_rwlock_wrlock(&ct->resources_lock);
    >     >     >     >     >     >                bool nat_res = nat_select_range_tuple(
    >     >     >     >     >     >                                   ct, nc, conn_for_un_nat_copy);
    >     >     >     >     >     >
    >     >     >     >     >     >                if (!nat_res) {
    >     >     >     >     >     > ^ false; do not execute this block
    >     >     >     >     >     >                    free(nc->nat_info);
    >     >     >     >     >     >                    nc->nat_info = NULL;
    >     >     >     >     >     >                    free (nc);
    >     >     >     >     >     >                    ct_rwlock_unlock(&ct->resources_lock);
    >     >     >     >     >     >                    return NULL;
    >     >     >     >     >     >                }
    >     >     >     >     >     >
    >     >     >     >     >     >                *nc = *conn_for_un_nat_copy;
    >     >     >     >     >     > ^ Now:
    >     >     >     >     >     > nc->alg is overwritten by conn_for_un_nat_copy->alg
    >     >     >     >     >     > nc->nat_info is overwritten by conn_for_un_nat_copy->nat_info
    >     >     >     >     >     >
    >     >     >     >     >     > We don't free either of these.
    >     >     >     >     >
    >     >     >     >     >     As discussed offline, the copy of '*nc' into '*conn_for_un_nat_copy'
    >     >     >     >     >     nested inside nat_select_range_tuple() is very well hidden. This means
    >     >     >     >     >     that the above is not a problem... but what if (!nat_res) ? Then
    >     >     >     >     >     conn_for_un_nat_copy() has a reference to these alg/nat_info
    >     >     >     >     >     parameters which are freed from 'nc' inside that block, then
    >     >     >     >     >     'conn_for_un_nat_copy' is returned. Could there be a use-after-free
    >     >     >     >     >     then?
    >     >     >     >     >
    >     >     >     >     > Nope, because there is no un_nat conn.
    >     >     >     >
    >     >     >     >     So you mean that dangling references are returned inside
    >     >     >     >     conn_for_un_nat_copy but they're just not used?
    >     >     >     >
    >     >     >     > JTBC, nothing can be used, since there is not even a connection.
    >     >     >
    >     >     >     So if nat_select_range_tuple() is called, and it runs the "*nat_conn =
    >     >     >     *conn;" line which updates conn_for_un_nat_copy, then
    >     >     >     nat_select_range_tuple() fails to select a tuple and returns false,
    >     >     >     then the if (!nat_res) cleanup / return NULL path is called, then what
    >     >     >     cleans up the dangling pointers that are now in conn_for_un_nat_copy?
    >     >     >
    >     >     > conn_for_un_nat_copy is a local variable in the caller
    >     >
    >     >     Which is then potentially passed to another function,
    >     >     create_un_nat_conn(), which then does an xmemdup() to propagate those
    >     >     dangling pointers further.. seems like it would be safest to clear out
    >     >     the conn_for_un_nat_copy->alg and ->nat_info pointers before freeing
    >     >     'nc' in that path.
    >     >
    >     > No, as I stated already, since the un_nat creation (and also conn) never completed
    >     > the conntype is not set and create_un_nat_conn() is not called.
    >
    >     OK, now I see what you mean.
    >
    >     I think that clearing these pointers in that error path would make the
    >     code more explicit about the status of one of the parameters modified
    >     by this function, and would limit side-effects. I don't see a negative
    >     to this proposed change.
    >
    > Adding redundant clearing of pointers in the error path will not hurt
    > performance, so there is no problem there.
    
    Right.
    
    > It feels a little weird to clear a couple pointers in a case where the whole connection
    > context is dead.  It feels like painting your house while it is burning down.
    > Maybe a comment would serve the same purpose.
    
    Comments could help to some degree, but I think that my core concerns
    could be addressed with improvements to the code:
    * nat_select_range_tuple() sounds like it will select a range. Not
    only does it select the range, but it does full initialization of
    conn_for_un_nat_copy as well (via shallow-copy)
    * conn_not_found() allocates memory, then nat_select_range_tuple()
    shares pointers to that memory around, and yet the pointers don't
    share fate when conn_not_found() frees them. This makes it difficult
    to read the code and balance alloc/free.

I looked and noticed that the conn copy in nat_select_range_tuple() was never
needed there; I moved it to conn_not_found(). I agree it was not clear.

    * nc seems to be the owner of these memory allocations, but since
    they're propagated into conn_for_un_nat_copy, it's less clear which
    conn is supposed to own it.
    * The condition where these dangling pointers are shared up to the
    caller is not the exact same condition check in the original caller
    which prevents usage of those dangling pointers.

In the error case, I added a memset zero (with comment) of the
conn_for_un_nat_copy structure to document that the structure will
not be further used.

In the non-error case, those pointers are presently explicitly nulled.
conn_for_un_nat_copy->nat_info = NULL;
conn_for_un_nat_copy->alg = NULL;
I think this makes it clear that conn_for_un_nat_copy is not the owner 
of these pointers.

    A bunch of the above concerns aren't specific to this patch, they
    already exist in the upstream codebase and I'm probably just noticing
    because it's really new code and it's my first real dive into this
    area. But I do think there are improvements to be made in limiting the
    scope of potential changes from each function, and localizing things
    like initialization, memory alloc/free/ownership, etc. into ideally
    the same function.
diff mbox

Patch

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index 55084d3..993af33 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -62,17 +62,34 @@  struct nat_conn_key_node {
     struct conn_key value;
 };
 
+struct alg_exp_node {
+    struct hmap_node node;
+    struct ovs_list exp_node;
+    long long expiration;
+    struct conn_key key;
+    struct conn_key master_key;
+    struct ct_addr alg_nat_repl_addr;
+    ovs_u128 master_label;
+    uint32_t master_mark;
+};
+
 struct conn {
     struct conn_key key;
     struct conn_key rev_key;
+    /* Only used for orig_tuple support. */
+    struct conn_key master_key;
     long long expiration;
     struct ovs_list exp_node;
     struct hmap_node node;
     ovs_u128 label;
     /* XXX: consider flattening. */
     struct nat_action_info_t *nat_info;
+    char *alg;
+    int seq_skew;
     uint32_t mark;
     uint8_t conn_type;
+    uint8_t seq_skew_dir;
+    uint8_t alg_related;
 };
 
 enum ct_update_res {
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 90b154a..1f54fe3 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -1,5 +1,5 @@ 
 /*
- * Copyright (c) 2015, 2016 Nicira, Inc.
+ * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ 
 #include <sys/types.h>
 #include <netinet/in.h>
 #include <netinet/icmp6.h>
+#include <ctype.h>
 
 #include "bitmap.h"
 #include "conntrack-private.h"
@@ -39,7 +40,6 @@ 
 #include "random.h"
 #include "timeval.h"
 
-
 VLOG_DEFINE_THIS_MODULE(conntrack);
 
 COVERAGE_DEFINE(conntrack_full);
@@ -50,9 +50,21 @@  struct conn_lookup_ctx {
     struct conn *conn;
     uint32_t hash;
     bool reply;
+    /* XXX: Only used by ICMP; change name. */
     bool related;
 };
 
+enum ftp_ctl_pkt {
+    CT_FTP_CTL_INTEREST,
+    CT_FTP_CTL_OTHER,
+    CT_FTP_CTL_INVALID,
+};
+
+enum ct_alg_mode {
+    CT_FTP_MODE_ACTIVE,
+    CT_FTP_MODE_PASSIVE,
+};
+
 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                              ovs_be16 dl_type, struct conn_lookup_ctx *,
                              uint16_t zone);
@@ -102,6 +114,35 @@  static inline bool
 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                 const char **new_data);
 
+static struct alg_exp_node *
+expectation_lookup(struct hmap *alg_expectations,
+                   const struct conn_key *key,
+                   uint32_t basis);
+
+static int
+repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
+                 char *ftp_data_v4_start,
+                 size_t addr_offset_from_ftp_data_start);
+
+static enum ftp_ctl_pkt
+process_ftp_ctl_v4(struct conntrack *ct,
+                   struct conn_lookup_ctx *ctx,
+                   struct dp_packet *pkt,
+                   const struct conn *conn_for_expectation,
+                   long long now, ovs_be32 *v4_addr_rep,
+                   char **ftp_data_v4_start,
+                   size_t *addr_offset_from_ftp_data_start);
+
+static enum ftp_ctl_pkt
+detect_ftp_ctl_mode(struct conn_lookup_ctx *ctx, struct dp_packet *pkt,
+                    char *ftp_msg);
+
+static void
+handle_ftp_ctl(struct conntrack *ct, struct conn_lookup_ctx *ctx,
+               struct dp_packet *pkt,
+               const struct conn *conn_for_expectation,
+               long long now, enum ftp_ctl_pkt ftp_ctl, bool nat);
+
 static struct ct_l4_proto *l4_protos[] = {
     [IPPROTO_TCP] = &ct_proto_tcp,
     [IPPROTO_UDP] = &ct_proto_other,
@@ -115,6 +156,25 @@  long long ct_timeout_val[] = {
 #undef CT_TIMEOUT
 };
 
+#define FTP_MAX_PORT 65535
+#define CT_ALG_EXP_TIMEOUT (30 * 1000)
+#define LARGEST_FTP_MSG_OF_INTEREST 128
+#define FTP_PORT_CMD "PORT"
+#define FTP_PORT_CMD_SIZE 4
+#define FTP_PASV_REPLY_CODE "227"
+#define FTP_PASV_REPLY_CODE_SIZE 3
+#define MAX_FTP_DGTS 3
+
+#define FTP_EPRT_CMD "EPRT"
+#define FTP_EPRT_CMD_SIZE 4
+/* XXX: This may be a bit inflexible. */
+#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
+#define FTP_EPSV_REPLY_SIZE 16
+#define MAX_EXT_FTP_DGTS 5
+#define FTP_AF_V6 '2'
+
+#define ALG_WC_SRC_PORT 0
+
 /* If the total number of connections goes above this value, no new connections
  * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
 #define DEFAULT_N_CONN_LIMIT 3000000
@@ -127,10 +187,12 @@  conntrack_init(struct conntrack *ct)
     unsigned i, j;
     long long now = time_msec();
 
-    ct_rwlock_init(&ct->nat_resources_lock);
-    ct_rwlock_wrlock(&ct->nat_resources_lock);
+    ct_rwlock_init(&ct->resources_lock);
+    ct_rwlock_wrlock(&ct->resources_lock);
     hmap_init(&ct->nat_conn_keys);
-    ct_rwlock_unlock(&ct->nat_resources_lock);
+    hmap_init(&ct->alg_expectations);
+    ovs_list_init(&ct->alg_exp_list);
+    ct_rwlock_unlock(&ct->resources_lock);
 
     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
         struct conntrack_bucket *ctb = &ct->buckets[i];
@@ -169,7 +231,7 @@  conntrack_destroy(struct conntrack *ct)
 
         ovs_mutex_destroy(&ctb->cleanup_mutex);
         ct_lock_lock(&ctb->lock);
-        HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
+        HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
             if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                 atomic_count_dec(&ct->n_conn);
             }
@@ -179,14 +241,20 @@  conntrack_destroy(struct conntrack *ct)
         ct_lock_unlock(&ctb->lock);
         ct_lock_destroy(&ctb->lock);
     }
-    ct_rwlock_wrlock(&ct->nat_resources_lock);
+    ct_rwlock_wrlock(&ct->resources_lock);
     struct nat_conn_key_node *nat_conn_key_node;
     HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
         free(nat_conn_key_node);
     }
     hmap_destroy(&ct->nat_conn_keys);
-    ct_rwlock_unlock(&ct->nat_resources_lock);
-    ct_rwlock_destroy(&ct->nat_resources_lock);
+
+    struct alg_exp_node *alg_exp_node;
+    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
+        free(alg_exp_node);
+    }
+    hmap_destroy(&ct->alg_expectations);
+    ct_rwlock_unlock(&ct->resources_lock);
+    ct_rwlock_destroy(&ct->resources_lock);
 }
 
 static unsigned hash_to_bucket(uint32_t hash)
@@ -200,7 +268,7 @@  static unsigned hash_to_bucket(uint32_t hash)
 
 static void
 write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
-            const struct conn_key *key)
+            const struct conn_key *key, const struct alg_exp_node *alg_exp)
 {
     pkt->md.ct_state |= CS_TRACKED;
     pkt->md.ct_zone = zone;
@@ -209,11 +277,20 @@  write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
 
     /* Use the original direction tuple if we have it. */
     if (conn) {
-        key = &conn->key;
+        if (conn->alg_related) {
+            key = &conn->master_key;
+        } else {
+            key = &conn->key;
+        }
+    } else if (alg_exp) {
+        pkt->md.ct_mark = alg_exp->master_mark;
+        pkt->md.ct_label = alg_exp->master_label;
+        key = &alg_exp->master_key;
     }
     pkt->md.ct_orig_tuple_ipv6 = false;
     if (key) {
         if (key->dl_type == htons(ETH_TYPE_IP)) {
+
             pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
                 key->src.addr.ipv4_aligned,
                 key->dst.addr.ipv4_aligned,
@@ -238,7 +315,36 @@  write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
     } else {
         memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
     }
+}
+
+static bool
+is_ftp_ctl(const struct dp_packet *pkt)
+{
+    uint8_t ip_proto;
+    struct eth_header *l2 = dp_packet_eth(pkt);
+    if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
+        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
+        ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
+    } else {
+        struct ip_header *l3_hdr = dp_packet_l3(pkt);
+        ip_proto = l3_hdr->ip_proto;
+    }
+
+    struct tcp_header *th = dp_packet_l4(pkt);
+
+    return (ip_proto == IPPROTO_TCP &&
+            (ntohs(th->tcp_src) == IPPORT_FTP ||
+             ntohs(th->tcp_dst) == IPPORT_FTP));
+}
 
+static void
+alg_exp_init_expiration(struct conntrack *ct,
+                        struct alg_exp_node *alg_exp_node,
+                        long long now)
+    OVS_REQ_WRLOCK(ct->resources_lock)
+{
+    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
+    ovs_list_push_back(&ct->alg_exp_list, &alg_exp_node->exp_node);
 }
 
 static void
@@ -363,8 +469,8 @@  reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
         struct ip_header *nh = dp_packet_l3(pkt);
         struct icmp_header *icmp = dp_packet_l4(pkt);
         struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
-        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
-                        -pad, &inner_l4, false);
+        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) -pad,
+                        &inner_l4, false);
 
         pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
         pkt->l4_ofs += inner_l4 - (char *) icmp;
@@ -459,7 +565,7 @@  un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
  * and a hash would have already been needed. Hence, this function
  * is just intended for code clarity. */
 static struct conn *
-conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
+conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
 {
     struct conn_lookup_ctx ctx;
     ctx.conn = NULL;
@@ -471,21 +577,36 @@  conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
 }
 
 static void
+conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
+                  long long now, int seq_skew, bool seq_skew_dir)
+{
+    uint32_t hash = conn_key_hash(key, ct->hash_basis);
+    unsigned bucket = hash_to_bucket(hash);
+    ct_lock_lock(&ct->buckets[bucket].lock);
+    struct conn *conn = conn_lookup(ct, key, now);
+    if (conn && seq_skew) {
+        conn->seq_skew = seq_skew;
+        conn->seq_skew_dir = seq_skew_dir;
+    }
+    ct_lock_unlock(&ct->buckets[bucket].lock);
+}
+
+static void
 nat_clean(struct conntrack *ct, struct conn *conn,
           struct conntrack_bucket *ctb)
     OVS_REQUIRES(ctb->lock)
 {
     long long now = time_msec();
-    ct_rwlock_wrlock(&ct->nat_resources_lock);
+    ct_rwlock_wrlock(&ct->resources_lock);
     nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
-    ct_rwlock_unlock(&ct->nat_resources_lock);
+    ct_rwlock_unlock(&ct->resources_lock);
     ct_lock_unlock(&ctb->lock);
 
     uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
     unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
 
     ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
-    ct_rwlock_wrlock(&ct->nat_resources_lock);
+    ct_rwlock_wrlock(&ct->resources_lock);
 
     struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
 
@@ -504,7 +625,7 @@  nat_clean(struct conntrack *ct, struct conn *conn,
     }
     delete_conn(conn);
 
-    ct_rwlock_unlock(&ct->nat_resources_lock);
+    ct_rwlock_unlock(&ct->resources_lock);
     ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
     ct_lock_lock(&ctb->lock);
 }
@@ -528,7 +649,10 @@  static struct conn *
 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
                struct conn_lookup_ctx *ctx, bool commit, long long now,
                const struct nat_action_info_t *nat_action_info,
-               struct conn *conn_for_un_nat_copy)
+               struct conn *conn_for_un_nat_copy,
+               const char *helper,
+               const struct alg_exp_node *alg_exp,
+               struct ct_addr alg_nat_repl_addr)
 {
     unsigned bucket = hash_to_bucket(ctx->hash);
     struct conn *nc = NULL;
@@ -538,6 +662,9 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
         return nc;
     }
     pkt->md.ct_state = CS_NEW;
+    if (alg_exp) {
+        pkt->md.ct_state |= CS_RELATED;
+    }
 
     if (commit) {
         unsigned int n_conn_limit;
@@ -554,34 +681,50 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
         nc->rev_key = nc->key;
         conn_key_reverse(&nc->rev_key);
 
+        if (helper) {
+            nc->alg = xstrdup(helper);
+        }
+
+        if (alg_exp) {
+            nc->alg_related = true;
+            nc->mark = alg_exp->master_mark;
+            nc->label = alg_exp->master_label;
+            nc->master_key = alg_exp->master_key;
+        }
+
         if (nat_action_info) {
             nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
-            ct_rwlock_wrlock(&ct->nat_resources_lock);
-
-            bool nat_res = nat_select_range_tuple(ct, nc,
-                                                  conn_for_un_nat_copy);
 
-            if (!nat_res) {
-                free(nc->nat_info);
-                nc->nat_info = NULL;
-                free (nc);
-                ct_rwlock_unlock(&ct->nat_resources_lock);
-                return NULL;
-            }
+            if (alg_exp) {
+                nc->rev_key.src.addr = alg_nat_repl_addr;
+                nc->nat_info->nat_action = NAT_ACTION_DST;
+                *conn_for_un_nat_copy = *nc;
+            } else {
+                ct_rwlock_wrlock(&ct->resources_lock);
+                bool nat_res = nat_select_range_tuple(
+                                   ct, nc, conn_for_un_nat_copy);
+
+                if (!nat_res) {
+                    free(nc->nat_info);
+                    nc->nat_info = NULL;
+                    free (nc);
+                    ct_rwlock_unlock(&ct->resources_lock);
+                    return NULL;
+                }
 
-            if (conn_for_un_nat_copy &&
-                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
                 *nc = *conn_for_un_nat_copy;
-                conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
-                conn_for_un_nat_copy->nat_info = NULL;
+                ct_rwlock_unlock(&ct->resources_lock);
             }
-            ct_rwlock_unlock(&ct->nat_resources_lock);
-
-            nat_packet(pkt, nc, ctx->related);
+            conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
+            conn_for_un_nat_copy->nat_info = NULL;
+            conn_for_un_nat_copy->alg = NULL;
+            nat_packet(pkt, nc, ctx->related ||
+                       pkt->md.ct_state & CS_RELATED);
         }
         hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
         atomic_count_inc(&ct->n_conn);
     }
+
     return nc;
 }
 
@@ -599,6 +742,9 @@  conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
             pkt->md.ct_state |= CS_REPLY_DIR;
         }
     } else {
+        if ((*conn)->alg_related) {
+            pkt->md.ct_state |= CS_RELATED;
+        }
         enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
                                              pkt, ctx->reply, now);
 
@@ -626,7 +772,7 @@  conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
 
 static void
 create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
-                   long long now)
+                   long long now, bool alg_un_nat)
 {
     struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
     nc->key = conn_for_un_nat_copy->rev_key;
@@ -634,22 +780,26 @@  create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
     uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
     unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
     ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
-    ct_rwlock_rdlock(&ct->nat_resources_lock);
-
     struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
 
-    struct nat_conn_key_node *nat_conn_key_node =
-        nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
-    if (nat_conn_key_node
-        && !memcmp(&nat_conn_key_node->value, &nc->rev_key,
-                   sizeof nat_conn_key_node->value)
-        && !rev_conn) {
+    if (alg_un_nat) {
         hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
                     &nc->node, un_nat_hash);
     } else {
-        free(nc);
+        ct_rwlock_rdlock(&ct->resources_lock);
+
+        struct nat_conn_key_node *nat_conn_key_node =
+            nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
+        if (nat_conn_key_node && !memcmp(&nat_conn_key_node->value,
+            &nc->rev_key, sizeof nat_conn_key_node->value) && !rev_conn) {
+
+            hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
+                        &nc->node, un_nat_hash);
+        } else {
+            free(nc);
+        }
+        ct_rwlock_unlock(&ct->resources_lock);
     }
-    ct_rwlock_unlock(&ct->nat_resources_lock);
     ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
 }
 
@@ -661,6 +811,7 @@  handle_nat(struct dp_packet *pkt, struct conn *conn,
         (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
           (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
            zone != pkt->md.ct_zone))) {
+
         if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
             pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
         }
@@ -742,7 +893,8 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
             struct conn_lookup_ctx *ctx, uint16_t zone,
             bool force, bool commit, long long now, const uint32_t *setmark,
             const struct ovs_key_ct_labels *setlabel,
-            const struct nat_action_info_t *nat_action_info)
+            const struct nat_action_info_t *nat_action_info,
+            const char *helper)
 {
     struct conn *conn;
     unsigned bucket = hash_to_bucket(ctx->hash);
@@ -789,27 +941,64 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
     struct conn conn_for_un_nat_copy;
     conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
     if (OVS_LIKELY(conn)) {
-        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
+        if (is_ftp_ctl(pkt)) {
+            if (ctx->reply != conn->seq_skew_dir) {
+                handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
+                               !!nat_action_info);
+                create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
+                                                    bucket);
+            } else {
+                create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
+                                                    bucket);
+                handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
+                               !!nat_action_info);
+            }
+        } else {
+            create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
+                                                bucket);
+        }
         if (nat_action_info && !create_new_conn) {
             handle_nat(pkt, conn, zone, ctx->reply, ctx->related);
         }
-    } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
-                                nat_action_info)) {
-        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
+
+    }else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
+                               nat_action_info)) {
+        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
+                                            bucket);
     } else {
         if (ctx->related) {
+            /* An icmp related conn should always be found; no new
+               connection is created based on an icmp related packet. */
             pkt->md.ct_state = CS_INVALID;
         } else {
             create_new_conn = true;
         }
     }
 
+    const struct alg_exp_node *alg_exp = NULL;
     if (OVS_UNLIKELY(create_new_conn)) {
+        struct ct_addr alg_nat_repl_addr;
+        memset(&alg_nat_repl_addr, 0, sizeof alg_nat_repl_addr);
+        struct alg_exp_node alg_exp_entry;
+        if (!is_ftp_ctl(pkt)) {
+           ct_rwlock_rdlock(&ct->resources_lock);
+           alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
+                                        ct->hash_basis);
+           if (alg_exp) {
+               alg_nat_repl_addr = alg_exp->alg_nat_repl_addr;
+               memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
+               alg_exp = &alg_exp_entry;
+           }
+           ct_rwlock_unlock(&ct->resources_lock);
+        }
+
         conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
-                              &conn_for_un_nat_copy);
+                              &conn_for_un_nat_copy, helper, alg_exp,
+                              alg_nat_repl_addr);
     }
 
-    write_ct_md(pkt, zone, conn, &ctx->key);
+    write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
+
     if (conn && setmark) {
         set_mark(pkt, conn, setmark[0], setmark[1]);
     }
@@ -818,10 +1007,21 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
         set_label(pkt, conn, &setlabel[0], &setlabel[1]);
     }
 
+    struct conn conn_for_expectation;
+    if (conn && (is_ftp_ctl(pkt))) {
+        conn_for_expectation = *conn;
+    }
+
     ct_lock_unlock(&ct->buckets[bucket].lock);
 
     if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
-        create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
+        create_un_nat_conn(ct, &conn_for_un_nat_copy, now, !!alg_exp);
+    }
+
+    /* FTP control packet handling with expectation creation. */
+    if (OVS_UNLIKELY(conn && is_ftp_ctl(pkt))) {
+        handle_ftp_ctl(ct, ctx, pkt, &conn_for_expectation,
+                       now, CT_FTP_CTL_INTEREST, !!nat_action_info);
     }
 }
 
@@ -841,26 +1041,21 @@  conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                   const char *helper,
                   const struct nat_action_info_t *nat_action_info)
 {
+
     struct dp_packet **pkts = pkt_batch->packets;
     size_t cnt = pkt_batch->count;
-    long long now = time_msec();
     struct conn_lookup_ctx ctx;
+    long long now = time_msec();
+    size_t i = 0;
 
-    if (helper) {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
-
-        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
-        /* Continue without the helper */
-    }
-
-    for (size_t i = 0; i < cnt; i++) {
+    for (i = 0; i < cnt; i++) {
         if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
             pkts[i]->md.ct_state = CS_INVALID;
-            write_ct_md(pkts[i], zone, NULL, NULL);
+            write_ct_md(pkts[i], zone, NULL, NULL, NULL);
             continue;
         }
         process_one(ct, pkts[i], &ctx, zone, force, commit,
-                    now, setmark, setlabel, nat_action_info);
+                    now, setmark, setlabel, nat_action_info, helper);
     }
 
     return 0;
@@ -869,8 +1064,12 @@  conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
 static void
 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
 {
-    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
-    conn->mark = pkt->md.ct_mark;
+    if (conn->alg_related) {
+        pkt->md.ct_mark = conn->mark;
+    } else {
+        pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
+        conn->mark = pkt->md.ct_mark;
+    }
 }
 
 static void
@@ -878,16 +1077,20 @@  set_label(struct dp_packet *pkt, struct conn *conn,
           const struct ovs_key_ct_labels *val,
           const struct ovs_key_ct_labels *mask)
 {
-    ovs_u128 v, m;
+    if (conn->alg_related) {
+        pkt->md.ct_label = conn->label;
+    } else {
+        ovs_u128 v, m;
 
-    memcpy(&v, val, sizeof v);
-    memcpy(&m, mask, sizeof m);
+        memcpy(&v, val, sizeof v);
+        memcpy(&m, mask, sizeof m);
 
-    pkt->md.ct_label.u64.lo = v.u64.lo
+        pkt->md.ct_label.u64.lo = v.u64.lo
                               | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
-    pkt->md.ct_label.u64.hi = v.u64.hi
+        pkt->md.ct_label.u64.hi = v.u64.hi
                               | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
-    conn->label = pkt->md.ct_label;
+        conn->label = pkt->md.ct_label;
+    }
 }
 
 
@@ -896,8 +1099,8 @@  set_label(struct dp_packet *pkt, struct conn *conn,
  * LLONG_MAX if 'ctb' is empty.  The return value might be smaller than 'now',
  * if 'limit' is reached */
 static long long
-sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
-             size_t limit)
+sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
+             long long now, size_t limit)
     OVS_REQUIRES(ctb->lock)
 {
     struct conn *conn, *next;
@@ -923,6 +1126,28 @@  sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
         }
     }
 
+#define MAX_ALG_EXP_TO_EXPIRE 1000
+    size_t alg_exp_count = hmap_count(&ct->alg_expectations);
+    /* XXX: revisit this. */
+    size_t max_to_expire =
+        MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);
+    count = 0;
+    ct_rwlock_wrlock(&ct->resources_lock);
+    struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
+    LIST_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
+                        exp_node, &ct->alg_exp_list) {
+        if (now < alg_exp_node->expiration ||
+            count >= max_to_expire) {
+            min_expiration = MIN(min_expiration, alg_exp_node->expiration);
+            break;
+        }
+        ovs_list_remove(&alg_exp_node->exp_node);
+        hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
+        free(alg_exp_node);
+        count++;
+    }
+    ct_rwlock_unlock(&ct->resources_lock);
+
     return min_expiration;
 }
 
@@ -1802,7 +2027,8 @@  nat_conn_keys_lookup(struct hmap *nat_conn_keys,
 }
 
 static void
-nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
+nat_conn_keys_remove(struct hmap *nat_conn_keys,
+                     const struct conn_key *key,
                      uint32_t basis)
 {
     struct nat_conn_key_node *nat_conn_key_node;
@@ -1819,6 +2045,88 @@  nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
     }
 }
 
+static struct alg_exp_node *
+expectation_lookup(struct hmap *alg_expectations,
+                   const struct conn_key *key,
+                   uint32_t basis)
+{
+    struct conn_key check_key = *key;
+    check_key.src.port = ALG_WC_SRC_PORT;
+    struct alg_exp_node *alg_exp_node;
+    uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);
+    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
+                             alg_exp_conn_key_hash,
+                             alg_expectations) {
+        if (!memcmp(&alg_exp_node->key, &check_key,
+                    sizeof alg_exp_node->key)) {
+            return alg_exp_node;
+        }
+    }
+
+    return NULL;
+}
+
+static void
+expectation_create(struct conntrack *ct,
+                   ovs_be16 dst_port,
+                   const long long now,
+                   enum ct_alg_mode mode,
+                   const struct conn *master_conn)
+{
+    struct ct_addr src_addr;
+    struct ct_addr dst_addr;
+    struct ct_addr alg_nat_repl_addr;
+
+    switch (mode) {
+    case CT_FTP_MODE_ACTIVE:
+        src_addr = master_conn->rev_key.src.addr;
+        dst_addr = master_conn->rev_key.dst.addr;
+        alg_nat_repl_addr = master_conn->key.src.addr;
+        break;
+    case CT_FTP_MODE_PASSIVE:
+        src_addr = master_conn->key.src.addr;
+        dst_addr = master_conn->key.dst.addr;
+        alg_nat_repl_addr = master_conn->rev_key.src.addr;
+        break;
+    default:
+        OVS_NOT_REACHED();
+    }
+
+    struct alg_exp_node *alg_exp_node =
+        xzalloc(sizeof *alg_exp_node);
+    alg_exp_node->key.dl_type = master_conn->key.dl_type;
+    alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
+    alg_exp_node->key.zone = master_conn->key.zone;
+    alg_exp_node->key.src.addr = src_addr;
+    alg_exp_node->key.dst.addr = dst_addr;
+    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
+    alg_exp_node->key.dst.port = dst_port;
+    alg_exp_node->master_mark = master_conn->mark;
+    alg_exp_node->master_label = master_conn->label;
+    alg_exp_node->master_key = master_conn->key;
+    ct_rwlock_rdlock(&ct->resources_lock);
+    struct alg_exp_node *alg_exp = expectation_lookup(
+        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);
+    ct_rwlock_unlock(&ct->resources_lock);
+    if (alg_exp) {
+        free(alg_exp_node);
+        return;
+    }
+
+    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
+    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
+    uint32_t alg_exp_conn_key_hash =
+        conn_key_hash(&alg_exp_node->key,
+                      ct->hash_basis);
+    ct_rwlock_wrlock(&ct->resources_lock);
+    hmap_insert(&ct->alg_expectations,
+                &alg_exp_node->node,
+                alg_exp_conn_key_hash);
+
+    alg_exp_init_expiration(ct, alg_exp_node, now);
+    ct_rwlock_unlock(&ct->resources_lock);
+}
+
 static void
 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
                 long long now)
@@ -1886,6 +2194,7 @@  static void
 delete_conn(struct conn *conn)
 {
     free(conn->nat_info);
+    free(conn->alg);
     free(conn);
 }
 
@@ -1950,6 +2259,10 @@  conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
     if (class->conn_get_protoinfo) {
         class->conn_get_protoinfo(conn, &entry->protoinfo);
     }
+
+    if (conn->alg) {
+        entry->helper.name = xstrdup(conn->alg);
+    }
 }
 
 int
@@ -2020,7 +2333,7 @@  conntrack_flush(struct conntrack *ct, const uint16_t *zone)
         struct conn *conn, *next;
 
         ct_lock_lock(&ct->buckets[i].lock);
-        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
+        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
             if ((!zone || *zone == conn->key.zone) &&
                 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
                 conn_clean(ct, conn, &ct->buckets[i]);
@@ -2028,5 +2341,550 @@  conntrack_flush(struct conntrack *ct, const uint16_t *zone)
         }
         ct_lock_unlock(&ct->buckets[i].lock);
     }
+
+    ct_rwlock_wrlock(&ct->resources_lock);
+    struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
+    HMAP_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
+                       node, &ct->alg_expectations) {
+        if (!zone || *zone == alg_exp_node->key.zone) {
+            ovs_list_remove(&alg_exp_node->exp_node);
+            hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
+            free(alg_exp_node);
+        }
+    }
+    ct_rwlock_unlock(&ct->resources_lock);
     return 0;
 }
+
+static uint8_t
+get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
+{
+    return v4_addr >> (index * 8) & 0xff;
+}
+
+static void
+replace_substring(char *substr, uint8_t substr_size,
+                  uint8_t total_size, char *rep_str,
+                  uint8_t rep_str_size)
+{
+    char delta = rep_str_size - substr_size;
+    size_t move_size = total_size - substr_size;
+    char *remain_substring = substr + substr_size;
+    memmove(remain_substring + delta, remain_substring,
+            move_size);
+    memcpy(substr, rep_str, rep_str_size);
+}
+
+static int
+repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
+                 char *ftp_data_start,
+                 size_t addr_offset_from_ftp_data_start)
+{
+#define MAX_FTP_V4_NAT_DELTA 8
+    int overall_delta = 0;
+    char *byte_str = ftp_data_start + addr_offset_from_ftp_data_start;
+    char *next_delim;
+    size_t substr_size;
+    uint8_t rep_byte;
+    char rep_str[4];
+    size_t replace_size;
+    uint8_t i;
+    int rc;
+    uint32_t orig_used_size = dp_packet_size(pkt);
+    uint16_t allocated_size = dp_packet_get_allocated(pkt);
+
+    /* Do conservative check for pathological MTU usage. */
+    if (orig_used_size + MAX_FTP_V4_NAT_DELTA > allocated_size) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
+                     allocated_size);
+        return 0;
+    }
+
+    size_t remain_size = tcp_payload_length(pkt) -
+                             addr_offset_from_ftp_data_start;
+
+    for (i = 0; i < 4; i++) {
+        memset(rep_str, 0 , sizeof rep_str);
+        next_delim = memchr(byte_str,',',4);
+        ovs_assert(next_delim);
+        substr_size = next_delim - byte_str;
+        remain_size -= substr_size;
+        rep_byte = get_v4_byte_be(v4_addr_rep, i);
+        rc = sprintf(rep_str, "%d", rep_byte);
+        ovs_assert(rc > 0 && rc <= 3);
+        replace_size = strlen(rep_str);
+        replace_substring(byte_str, substr_size, remain_size,
+                          rep_str, replace_size);
+
+        overall_delta += (int) strlen(rep_str) - (int) substr_size;
+        byte_str += replace_size + 1;
+    }
+
+    dp_packet_set_size(pkt, MAX(orig_used_size + overall_delta, 64));
+    return overall_delta;
+}
+
+static char *
+skip_non_digits(char *str)
+{
+    while ((!isdigit(*str)) && (*str != 0)) {
+        str++;
+    }
+    return str;
+}
+
+static char *
+delinate_number(char *str, uint8_t max_digits)
+{
+    uint8_t digits_found = 0;
+    while (isdigit(*str) && digits_found <= max_digits) {
+        str++;
+        digits_found++;
+    }
+
+    *str = 0;
+    return str;
+}
+
+static enum ftp_ctl_pkt
+detect_ftp_ctl_mode(struct conn_lookup_ctx *ctx, struct dp_packet *pkt,
+                    char *ftp_msg)
+{
+    struct tcp_header *th = dp_packet_l4(pkt);
+    char *tcp_hdr = (char *) th;
+    uint32_t tcp_payload_len = tcp_payload_length(pkt);
+    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
+
+    size_t tcp_payload_of_interest = MIN(tcp_payload_len,
+                                         LARGEST_FTP_MSG_OF_INTEREST);
+
+    ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
+                tcp_payload_of_interest);
+
+    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+        if (strncasecmp(ftp_msg, FTP_EPRT_CMD, FTP_EPRT_CMD_SIZE) &&
+            strncasecmp(ftp_msg, FTP_EPSV_REPLY, FTP_EPSV_REPLY_SIZE)) {
+            return CT_FTP_CTL_OTHER;
+        }
+    } else {
+        if (strncasecmp(ftp_msg, FTP_PORT_CMD, FTP_PORT_CMD_SIZE) &&
+            strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
+                        FTP_PASV_REPLY_CODE_SIZE)) {
+            return CT_FTP_CTL_OTHER;
+        }
+    }
+
+    return CT_FTP_CTL_INTEREST;
+}
+
+static enum ftp_ctl_pkt
+process_ftp_ctl_v4(struct conntrack *ct,
+                   struct conn_lookup_ctx *ctx,
+                   struct dp_packet *pkt,
+                   const struct conn *conn_for_expectation,
+                   long long now, ovs_be32 *v4_addr_rep,
+                   char **ftp_data_v4_start,
+                   size_t *addr_offset_from_ftp_data_start)
+{
+
+    struct tcp_header *th = dp_packet_l4(pkt);
+    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
+    char *tcp_hdr = (char *) th;
+    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
+    char *ftp = ftp_msg;
+    enum ct_alg_mode mode;
+
+     detect_ftp_ctl_mode(ctx, pkt, ftp_msg);
+
+    *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
+
+    if (!strncasecmp(ftp_msg, FTP_PORT_CMD, FTP_PORT_CMD_SIZE)) {
+        ftp = ftp_msg + FTP_PORT_CMD_SIZE;
+        mode = CT_FTP_MODE_ACTIVE;
+    } else {
+        ftp = ftp_msg + FTP_PASV_REPLY_CODE_SIZE;
+        mode = CT_FTP_MODE_PASSIVE;
+    }
+
+    /* Find first space. */
+    while ((*ftp != ' ') && (*ftp != 0)) {
+        ftp++;
+    }
+    if (*ftp != ' ') {
+        return CT_FTP_CTL_INVALID;
+    }
+
+    /* Find the first digit, after space. */
+    ftp = skip_non_digits(ftp);
+    if (*ftp == 0) {
+        return CT_FTP_CTL_INVALID;
+    }
+
+    char *ip_addr_start = ftp;
+    *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
+    uint8_t comma_count = 0;
+
+    while ((comma_count < 4) && (*ftp != 0)) {
+        if (*ftp == ',') {
+            comma_count ++;
+            if (comma_count == 4) {
+                *ftp = 0;
+            } else {
+                *ftp = '.';
+            }
+        }
+        ftp++;
+    }
+    if (comma_count != 4) {
+        return CT_FTP_CTL_INVALID;
+    }
+
+    struct in_addr ip_addr;
+    int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
+    if (rc2 != 1) {
+        return CT_FTP_CTL_INVALID;
+    }
+
+    char *save_ftp = ftp;
+    ftp = delinate_number(ftp, MAX_FTP_DGTS);
+    if (!ftp) {
+        return CT_FTP_CTL_INVALID;
+    }
+    int value;
+    if (!str_to_int(save_ftp, 10, &value)) {
+        return CT_FTP_CTL_INVALID;
+    }
+
+    uint16_t port_hs = (uint16_t) value;
+    /* This is derived from the L4 port maximum is 65535. */
+    if (value > 255) {
+        return CT_FTP_CTL_INVALID;
+    }
+    port_hs <<= 8;
+
+    /* Skip over comma. */
+    ftp++;
+    save_ftp = ftp;
+    bool digit_found = false;
+    while (isdigit(*ftp)) {
+        ftp++;
+        digit_found = true;
+    }
+    if (!digit_found) {
+        return CT_FTP_CTL_INVALID;
+    }
+    *ftp = 0;
+    if (!str_to_int(save_ftp, 10, &value)) {
+        return CT_FTP_CTL_INVALID;
+    }
+    uint16_t port_lo_hs = (uint16_t) value;
+    if (65535 - port_hs < port_lo_hs) {
+        return CT_FTP_CTL_INVALID;
+    }
+    port_hs |= port_lo_hs;
+    ovs_be16 port = (OVS_FORCE ovs_be16) htons(port_hs);
+    ovs_be32 conn_ipv4_addr;
+
+    switch (mode) {
+    case CT_FTP_MODE_ACTIVE:
+        *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4_aligned;
+        conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4_aligned;
+        break;
+    case CT_FTP_MODE_PASSIVE:
+        *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4_aligned;
+        conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4_aligned;
+        break;
+    default:
+        OVS_NOT_REACHED();
+    }
+
+    ovs_be32 ftp_ipv4_addr;
+    memcpy(&ftp_ipv4_addr, &ip_addr.s_addr, sizeof ftp_ipv4_addr);
+    /* Although most servers will block this exploit, there may be some
+     * less well managed. */
+    if ((ftp_ipv4_addr != conn_ipv4_addr) &&
+        (ftp_ipv4_addr != *v4_addr_rep)) {
+        return CT_FTP_CTL_INVALID;
+    }
+
+    expectation_create(ct, port, now, mode, conn_for_expectation);
+    return CT_FTP_CTL_INTEREST;
+}
+
+static char *
+skip_ipv6_digits(char *str)
+{
+    while ((isxdigit(*str)) || (*str == ':')) {
+        str++;
+    }
+    return str;
+}
+
+static enum ftp_ctl_pkt
+process_ftp_ctl_v6(struct conntrack *ct,
+                   struct conn_lookup_ctx *ctx,
+                   struct dp_packet *pkt,
+                   const struct conn *conn_for_expectation,
+                   long long now,
+                   struct ct_addr *v6_addr_rep,
+                   char **ftp_data_start,
+                   size_t *addr_offset_from_ftp_data_start,
+                   size_t *addr_size, enum ct_alg_mode *mode)
+{
+    struct tcp_header *th = dp_packet_l4(pkt);
+    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
+    char *tcp_hdr = (char *) th;
+    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
+    char *ftp = ftp_msg;
+    struct in6_addr ip6_addr;
+
+     detect_ftp_ctl_mode(ctx, pkt, ftp_msg);
+
+    *ftp_data_start = tcp_hdr + tcp_hdr_len;
+
+    if (!strncasecmp(ftp_msg, FTP_EPRT_CMD, FTP_EPRT_CMD_SIZE)) {
+        ftp = ftp_msg + FTP_EPRT_CMD_SIZE;
+        ftp = skip_non_digits(ftp);
+        if ((*ftp != FTP_AF_V6) || isdigit(*(ftp + 1))) {
+            return CT_FTP_CTL_INVALID;
+        }
+        /* Jump over delimiter. */
+        ftp += 2;
+
+        char *ip_addr_start = ftp;
+        memset(&ip6_addr, 0, sizeof ip6_addr);
+        *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
+        ftp = skip_ipv6_digits(ftp);
+        *ftp = 0;
+        *addr_size = ftp - ip_addr_start;
+        int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
+        if (rc2 != 1) {
+            return CT_FTP_CTL_INVALID;
+        }
+        ftp++;
+        *mode = CT_FTP_MODE_ACTIVE;
+    } else {
+        ftp = ftp_msg + strcspn(ftp_msg, "(");
+        ftp = skip_non_digits(ftp);
+        if (!isdigit(*ftp)) {
+            return CT_FTP_CTL_INVALID;
+        }
+
+        /* Not used for passive mode. */
+        *addr_offset_from_ftp_data_start = 0;
+        *addr_size = 0;
+
+        *mode = CT_FTP_MODE_PASSIVE;
+    }
+
+    char *save_ftp = ftp;
+    ftp = delinate_number(ftp , MAX_EXT_FTP_DGTS);
+    if (!ftp) {
+        return CT_FTP_CTL_INVALID;
+    }
+    int value;
+    if (!str_to_int(save_ftp, 10, &value)) {
+        return CT_FTP_CTL_INVALID;
+    }
+    if (value > FTP_MAX_PORT) {
+        return CT_FTP_CTL_INVALID;
+    }
+    uint16_t port_hs = (uint16_t) value;
+    ovs_be16 port = (OVS_FORCE ovs_be16) htons(port_hs);
+
+    switch (*mode) {
+    case CT_FTP_MODE_ACTIVE:
+        *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
+        /* Although most servers will block this exploit, there may be some
+         * less well managed. */
+        if (memcmp(&ip6_addr, &v6_addr_rep->ipv6_aligned, sizeof ip6_addr) &&
+            memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6_aligned,
+                   sizeof ip6_addr)) {
+            return CT_FTP_CTL_INVALID;
+        }
+        break;
+    case CT_FTP_MODE_PASSIVE:
+        *v6_addr_rep = conn_for_expectation->key.dst.addr;
+        break;
+    default:
+        OVS_NOT_REACHED();
+    }
+
+    expectation_create(ct, port, now, *mode, conn_for_expectation);
+    return CT_FTP_CTL_INTEREST;
+}
+
+static int
+repl_ftp_v6_addr(struct dp_packet *pkt, struct ct_addr v6_addr_rep,
+                 char *ftp_data_start,
+                 size_t addr_offset_from_ftp_data_start,
+                 size_t addr_size, enum ct_alg_mode mode)
+{
+/* This is slightly bigger than really possible. */
+#define MAX_FTP_V6_NAT_DELTA 45
+    int overall_delta = 0;
+    char *pkt_addr_str = ftp_data_start + addr_offset_from_ftp_data_start;
+    size_t replace_addr_size;
+    uint32_t orig_used_size = dp_packet_size(pkt);
+    uint16_t allocated_size = dp_packet_get_allocated(pkt);
+    char v6_addr_str[IPV6_SCAN_LEN] = {0};
+    const char *rc;
+
+    if (mode == CT_FTP_MODE_PASSIVE) {
+        return 0;
+    }
+
+    /* Do conservative check for pathological MTU usage. */
+    if (orig_used_size + MAX_FTP_V6_NAT_DELTA > allocated_size) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
+                     allocated_size);
+        return 0;
+    }
+
+    rc = inet_ntop(AF_INET6, &v6_addr_rep.ipv6_aligned, v6_addr_str,
+              IPV6_SCAN_LEN - 1);
+    ovs_assert(rc != NULL);
+
+    replace_addr_size = strlen(v6_addr_str);
+
+    size_t remain_size = tcp_payload_length(pkt) -
+                             addr_offset_from_ftp_data_start;
+
+    replace_substring(pkt_addr_str, addr_size, remain_size,
+                      v6_addr_str, replace_addr_size);
+
+    overall_delta += (int) replace_addr_size - (int) addr_size;
+
+    dp_packet_set_size(pkt, MAX(orig_used_size + overall_delta, 64));
+    return overall_delta;
+}
+
+static void
+handle_ftp_ctl(struct conntrack *ct, struct conn_lookup_ctx *ctx,
+               struct dp_packet *pkt,
+               const struct conn *conn_for_expectation,
+               long long now, enum ftp_ctl_pkt ftp_ctl, bool nat)
+{
+    struct ip_header *l3_hdr = dp_packet_l3(pkt);
+    ovs_be32 v4_addr_rep = 0;
+    struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
+    struct tcp_header *th = dp_packet_l4(pkt);
+    struct ct_addr v6_addr_rep;
+    size_t addr_offset_from_ftp_data_start;
+    int64_t seq_skew = 0;
+    bool seq_skew_dir;
+    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
+    size_t addr_size = 0;
+    char *ftp_data_start;
+    bool do_seq_skew_adj = true;
+    enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
+
+    if (detect_ftp_ctl_mode(ctx, pkt, ftp_msg) != ftp_ctl) {
+        return;
+    }
+
+    if (!nat || ftp_ctl == CT_FTP_CTL_INTEREST) {
+        do_seq_skew_adj = false;
+    }
+
+    if (ftp_ctl == CT_FTP_CTL_OTHER) {
+        seq_skew = conn_for_expectation->seq_skew;
+        seq_skew_dir = conn_for_expectation->seq_skew_dir;
+    } else if (ftp_ctl == CT_FTP_CTL_INTEREST) {
+        enum ftp_ctl_pkt rc;
+        if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+            rc = process_ftp_ctl_v6(ct, ctx, pkt, conn_for_expectation,
+                                    now, &v6_addr_rep, &ftp_data_start,
+                                    &addr_offset_from_ftp_data_start,
+                                    &addr_size, &mode);
+        } else {
+            rc = process_ftp_ctl_v4(ct, ctx, pkt, conn_for_expectation,
+                                    now, &v4_addr_rep, &ftp_data_start,
+                                    &addr_offset_from_ftp_data_start);
+        }
+        if (rc == CT_FTP_CTL_INVALID) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+            VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
+            pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
+            return;
+        } else if (rc == CT_FTP_CTL_INTEREST) {
+            uint16_t ip_len;
+            if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+                seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep, ftp_data_start,
+                                            addr_offset_from_ftp_data_start,
+                                            addr_size, mode);
+                seq_skew_dir = ctx->reply;
+                if (seq_skew) {
+                    ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen);
+                    ip_len += seq_skew;
+                    nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
+                    conn_seq_skew_set(ct, &conn_for_expectation->key, now,
+                                      seq_skew, seq_skew_dir);
+                }
+            } else {
+                seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep, ftp_data_start,
+                                            addr_offset_from_ftp_data_start);
+                seq_skew_dir = ctx->reply;
+                ip_len = ntohs(l3_hdr->ip_tot_len);
+                if (seq_skew) {
+                    ip_len += seq_skew;
+                    l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
+                                          l3_hdr->ip_tot_len, htons(ip_len));
+                    l3_hdr->ip_tot_len = htons(ip_len);
+                    conn_seq_skew_set(ct, &conn_for_expectation->key, now,
+                                      seq_skew, seq_skew_dir);
+                }
+            }
+        } else {
+            OVS_NOT_REACHED();
+        }
+    } else {
+        OVS_NOT_REACHED();
+    }
+
+    if (do_seq_skew_adj && seq_skew != 0) {
+        if (ctx->reply != conn_for_expectation->seq_skew_dir) {
+
+            uint32_t tcp_ack = ntohl(get_16aligned_be32(&th->tcp_ack));
+
+            if ((seq_skew > 0) && (tcp_ack < seq_skew)) {
+                /* Should not be possible; will be marked invalid. */
+                tcp_ack = 0;
+            } else if ((seq_skew < 0) && (UINT32_MAX - tcp_ack < -seq_skew)) {
+                tcp_ack = (-seq_skew) - (UINT32_MAX - tcp_ack);
+            } else {
+                tcp_ack -= seq_skew;
+            }
+            ovs_be32 new_tcp_ack = (OVS_FORCE ovs_be32) (htonl(tcp_ack));
+            put_16aligned_be32(&th->tcp_ack, new_tcp_ack);
+        } else {
+            uint32_t tcp_seq = ntohl(get_16aligned_be32(&th->tcp_seq));
+            if ((seq_skew > 0) && (UINT32_MAX - tcp_seq < seq_skew)) {
+                tcp_seq = seq_skew - (UINT32_MAX - tcp_seq);
+            } else if ((seq_skew < 0) && (tcp_seq < -seq_skew)) {
+                /* Should not be possible; will be marked invalid. */
+                tcp_seq = 0;
+            } else {
+                tcp_seq += seq_skew;
+            }
+            ovs_be32 new_tcp_seq = (OVS_FORCE ovs_be32) (htonl(tcp_seq));
+            put_16aligned_be32(&th->tcp_seq, new_tcp_seq);
+        }
+    }
+
+    const char *tail = dp_packet_tail(pkt);
+    uint8_t pad = dp_packet_l2_pad_size(pkt);
+    th->tcp_csum = 0;
+    uint32_t tcp_csum;
+    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+        tcp_csum = packet_csum_pseudoheader6(nh6);
+    } else {
+        tcp_csum = packet_csum_pseudoheader(l3_hdr);
+    }
+    th->tcp_csum = csum_finish(
+        csum_continue(tcp_csum, th, tail - (char *) th - pad));
+    return;
+}
+
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 243aebb..0b110c7 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -267,11 +267,13 @@  struct conntrack {
     /* The following resources are referenced during nat connection
      * creation and deletion. */
     struct hmap nat_conn_keys OVS_GUARDED;
+    struct hmap alg_expectations OVS_GUARDED;
+    struct ovs_list alg_exp_list OVS_GUARDED;
     /* This lock is used during NAT connection creation and deletion;
      * it is taken after a bucket lock and given back before that
      * bucket unlock.
      */
-    struct ct_rwlock nat_resources_lock;
+    struct ct_rwlock resources_lock;
 
 };