diff mbox

[ovs-dev,v2,2/3] conntrack: Return NEW for IPv6 ND packets without tracking.

Message ID 20161223023619.130243-2-diproiettod@vmware.com
State Superseded
Headers show

Commit Message

Daniele Di Proietto Dec. 23, 2016, 2:36 a.m. UTC
The userspace connection tracker treats Neighbor Discovery packets
as invalid, because they're not checked against any connection.

This in inconsistent with the kernel connection tracker which always
returns 'CS_NEW'.

Therefore, this commit makes the userspace connection tracker conforming
with the kernel.  ND packets still do not create or read any state, but
they're treated as NEW.

To support this, the key extraction functions can now return
KEY_NO_TRACK, meaning that the packet is ok, but it should be treated
statelessly.

We also have to remove a test that explicitly checked that neighbor
discovery was treated as invalid.

Reported-by: Sridhar Gaddam <sgaddam@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
---
v2: Update comment to reflect that we do not do special validation with
    the packet.
---
 lib/conntrack.c         | 134 ++++++++++++++++++++++++++++++++----------------
 tests/ofproto-dpif.at   |  32 ------------
 tests/system-traffic.at |  35 +++++++++++++
 3 files changed, 125 insertions(+), 76 deletions(-)

Comments

Darrell Ball Dec. 23, 2016, 5:20 a.m. UTC | #1
Some comments inline

On 12/22/16, 6:36 PM, "Daniele Di Proietto" <diproiettod@vmware.com> wrote:

    The userspace connection tracker treats Neighbor Discovery packets
    as invalid, because they're not checked against any connection.
    
    This in inconsistent with the kernel connection tracker which always
    returns 'CS_NEW'.
    
    Therefore, this commit makes the userspace connection tracker conforming
    with the kernel.  ND packets still do not create or read any state, but
    they're treated as NEW.
    
    To support this, the key extraction functions can now return
    KEY_NO_TRACK, meaning that the packet is ok, but it should be treated
    statelessly.


s/    To support this, the key extraction functions can now return
    KEY_NO_TRACK, meaning that the packet is ok, but it should be treated
    statelessly.
/
    To support this, the key extraction functions can now return
    KEY_NO_TRACK, meaning the packet should be treated statelessly
    and not be sent to the connection tracker.
/


    We also have to remove a test that explicitly checked that neighbor
    discovery was treated as invalid.
    
    Reported-by: Sridhar Gaddam <sgaddam@redhat.com>
    Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
    ---
    v2: Update comment to reflect that we do not do special validation with
        the packet.
    ---
     lib/conntrack.c         | 134 ++++++++++++++++++++++++++++++++----------------
     tests/ofproto-dpif.at   |  32 ------------
     tests/system-traffic.at |  35 +++++++++++++
     3 files changed, 125 insertions(+), 76 deletions(-)
    
    diff --git a/lib/conntrack.c b/lib/conntrack.c
    index 9bea3d9..86228d6 100644
    --- a/lib/conntrack.c
    +++ b/lib/conntrack.c
    @@ -52,9 +52,17 @@ struct conn_lookup_ctx {
         bool related;
     };
     
    -static bool conn_key_extract(struct conntrack *, struct dp_packet *,
    -                             ovs_be16 dl_type, struct conn_lookup_ctx *,
    -                             uint16_t zone);
    +enum key_status {
    +    KEY_INVALID,   /* Could not extract the connection key: invalid. */
    +    KEY_OK,        /* Connection key is ok. */
    +    KEY_NO_TRACK,  /* Connection key is ok, but it should not be tracked. */


KEY_NO_TRACK,  /* Connection key should not be tracked. */


    +};
    +
    +static enum key_status conn_key_extract(struct conntrack *,
    +                                        struct dp_packet *,
    +                                        ovs_be16 dl_type,
    +                                        struct conn_lookup_ctx *,
    +                                        uint16_t zone);
     static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
     static void conn_key_reverse(struct conn_key *);
     static void conn_key_lookup(struct conntrack_bucket *ctb,
    @@ -157,6 +165,20 @@ static unsigned hash_to_bucket(uint32_t hash)
         return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
     }
     
    +static uint16_t
    +key_status_to_cs(enum key_status s)
    +{
    +    switch (s) {
    +    case KEY_INVALID:
    +        return CS_INVALID;
    +    case KEY_OK:
    +    case KEY_NO_TRACK:
    +        return CS_NEW;
    +    default:
    +        OVS_NOT_REACHED();
    +    }
    +}
    +
     static void
     write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
                 uint32_t mark, ovs_u128 label)
    @@ -303,10 +325,13 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
     
         memset(bucket_list, INT8_C(-1), sizeof bucket_list);
         for (i = 0; i < cnt; i++) {
    +        enum key_status extract_res;
             unsigned bucket;
     
    -        if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {
    -            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
    +        extract_res = conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone);
    +        if (extract_res != KEY_OK) {
    +            write_ct_md(pkts[i], key_status_to_cs(extract_res), zone, 0,
    +                        OVS_U128_ZERO);
                 continue;
             }
     
    @@ -693,8 +718,11 @@ extract_l4_udp(struct conn_key *key, const void *data, size_t size)
         return key->src.port && key->dst.port;
     }
     
    -static inline bool extract_l4(struct conn_key *key, const void *data,
    -                              size_t size, bool *related, const void *l3);
    +static inline enum key_status extract_l4(struct conn_key *key,
    +                                         const void *data,
    +                                         size_t size,
    +                                         bool *related,
    +                                         const void *l3);
     
     static uint8_t
     reverse_icmp_type(uint8_t type)
    @@ -724,14 +752,14 @@ reverse_icmp_type(uint8_t type)
      * instead and set *related to true.  If 'related' is NULL we're
      * already processing a nested header and no such recursion is
      * possible */
    -static inline int
    +static inline enum key_status
     extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                     bool *related)
     {
         const struct icmp_header *icmp = data;
     
         if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
    -        return false;
    +        return KEY_INVALID;
         }
     
         switch (icmp->icmp_type) {
    @@ -742,13 +770,15 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
         case ICMP4_INFOREQUEST:
         case ICMP4_INFOREPLY:
             if (icmp->icmp_code != 0) {
    -            return false;
    +            return KEY_INVALID;
             }
             /* Separate ICMP connection: identified using id */
             key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
             key->src.icmp_type = icmp->icmp_type;
             key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
    -        break;
    +
    +        return KEY_OK;
    +
         case ICMP4_DST_UNREACH:
         case ICMP4_TIME_EXCEEDED:
         case ICMP4_PARAM_PROB:
    @@ -760,41 +790,40 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
             const char *l3 = (const char *) (icmp + 1);
             const char *tail = (const char *) data + size;
             const char *l4;
    +        enum key_status res;
             bool ok;
     
             if (!related) {
    -            return false;
    +            return KEY_INVALID;
             }
     
             memset(&inner_key, 0, sizeof inner_key);
             inner_key.dl_type = htons(ETH_TYPE_IP);
             ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
             if (!ok) {
    -            return false;
    +            return KEY_INVALID;
             }
     
             /* pf doesn't do this, but it seems a good idea */
             if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
                 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
    -            return false;
    +            return KEY_INVALID;
             }
     
             key->src = inner_key.src;
             key->dst = inner_key.dst;
             key->nw_proto = inner_key.nw_proto;
     
    -        ok = extract_l4(key, l4, tail - l4, NULL, l3);
    -        if (ok) {
    +        res = extract_l4(key, l4, tail - l4, NULL, l3);
    +        if (res != KEY_INVALID) {
                 conn_key_reverse(key);
                 *related = true;
             }
    -        return ok;
    +        return res;
         }
         default:
    -        return false;
    +        return KEY_INVALID;
         }
    -
    -    return true;
     }
     
     static uint8_t
    @@ -815,7 +844,7 @@ reverse_icmp6_type(uint8_t type)
      * instead and set *related to true.  If 'related' is NULL we're
      * already processing a nested header and no such recursion is
      * possible */
    -static inline bool
    +static inline enum key_status
     extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                      bool *related)
     {
    @@ -824,20 +853,22 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
         /* All the messages that we support need at least 4 bytes after
          * the header */
         if (size < sizeof *icmp6 + 4) {
    -        return false;
    +        return KEY_INVALID;
         }
     
         switch (icmp6->icmp6_type) {
         case ICMP6_ECHO_REQUEST:
         case ICMP6_ECHO_REPLY:
             if (icmp6->icmp6_code != 0) {
    -            return false;
    +            return KEY_INVALID;
             }
             /* Separate ICMP connection: identified using id */
             key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
             key->src.icmp_type = icmp6->icmp6_type;
             key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
    -        break;
    +
    +        return KEY_OK;
    +
         case ICMP6_DST_UNREACH:
         case ICMP6_PACKET_TOO_BIG:
         case ICMP6_TIME_EXCEEDED:
    @@ -848,17 +879,18 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
             const char *l3 = (const char *) icmp6 + 8;
             const char *tail = (const char *) data + size;
             const char *l4 = NULL;
    +        enum key_status res;
             bool ok;
     
             if (!related) {
    -            return false;
    +            return KEY_INVALID;
             }
     
             memset(&inner_key, 0, sizeof inner_key);
             inner_key.dl_type = htons(ETH_TYPE_IPV6);
             ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
             if (!ok) {
    -            return false;
    +            return KEY_INVALID;
             }
     
             /* pf doesn't do this, but it seems a good idea */
    @@ -866,25 +898,32 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                                   &key->dst.addr.ipv6_aligned)
                 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                      &key->src.addr.ipv6_aligned)) {
    -            return false;
    +            return KEY_INVALID;
             }
     
             key->src = inner_key.src;
             key->dst = inner_key.dst;
             key->nw_proto = inner_key.nw_proto;
     
    -        ok = extract_l4(key, l4, tail - l4, NULL, l3);
    -        if (ok) {
    +        res = extract_l4(key, l4, tail - l4, NULL, l3);
    +        if (res != KEY_INVALID) {
                 conn_key_reverse(key);
                 *related = true;
             }
    -        return ok;
    +        return res;
         }
    +    case ND_NEIGHBOR_SOLICIT:
    +    case ND_NEIGHBOR_ADVERT:
    +    case ND_ROUTER_SOLICIT:
    +    case ND_ROUTER_ADVERT:

case ND_REDIRECT:

    +        if (icmp6->icmp6_code != 0) {
    +            return KEY_INVALID;
    +        }
    +        /* This packet is not going to be tracked. */
    +        return KEY_NO_TRACK;
         default:
    -        return false;
    +        return KEY_INVALID;
         }
    -
    -    return true;
     }
     
     /* Extract l4 fields into 'key', which must already contain valid l3
    @@ -896,30 +935,34 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
      *
      * If 'related' is NULL, it means that we're already parsing a header nested
      * in an ICMP error.  In this case, we skip checksum and length validation. */
    -static inline bool
    +static inline enum key_status
     extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
                const void *l3)
     {
         if (key->nw_proto == IPPROTO_TCP) {
             return (!related || check_l4_tcp(key, data, size, l3))
    -               && extract_l4_tcp(key, data, size);
    +               ? extract_l4_tcp(key, data, size)
    +               : KEY_INVALID;
         } else if (key->nw_proto == IPPROTO_UDP) {
             return (!related || check_l4_udp(key, data, size, l3))
    -               && extract_l4_udp(key, data, size);
    +               ? extract_l4_udp(key, data, size)
    +               : KEY_INVALID;
         } else if (key->dl_type == htons(ETH_TYPE_IP)
                    && key->nw_proto == IPPROTO_ICMP) {
             return (!related || check_l4_icmp(data, size))
    -               && extract_l4_icmp(key, data, size, related);
    +               ? extract_l4_icmp(key, data, size, related)
    +               : KEY_INVALID;
         } else if (key->dl_type == htons(ETH_TYPE_IPV6)
                    && key->nw_proto == IPPROTO_ICMPV6) {
             return (!related || check_l4_icmp6(key, data, size, l3))
    -               && extract_l4_icmp6(key, data, size, related);
    +               ? extract_l4_icmp6(key, data, size, related)
    +               : KEY_INVALID;
         } else {
    -        return false;
    +        return KEY_INVALID;
         }
     }
     
    -static bool
    +static enum key_status
     conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
                      struct conn_lookup_ctx *ctx, uint16_t zone)
     {
    @@ -932,7 +975,7 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
         memset(ctx, 0, sizeof *ctx);
     
         if (!l2 || !l3 || !l4) {
    -        return false;
    +        return KEY_INVALID;
         }
     
         ctx->key.zone = zone;
    @@ -979,13 +1022,16 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
         }
     
         if (ok) {
    -        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
    +        enum key_status res;
    +
    +        res = extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3);
    +        if (res == KEY_OK) {
                 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
    -            return true;
             }
    +        return res;
         }
     
    -    return false;
    +    return KEY_INVALID;
     }
     ?
     /* Symmetric */
    diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
    index 5f594be..5b70032 100644
    --- a/tests/ofproto-dpif.at
    +++ b/tests/ofproto-dpif.at
    @@ -8626,38 +8626,6 @@ OFPT_ECHO_REQUEST (xid=0x0): 0 bytes of payload
     OVS_VSWITCHD_STOP
     AT_CLEANUP
     
    -AT_SETUP([ofproto-dpif - conntrack - untrackable traffic])
    -OVS_VSWITCHD_START
    -
    -add_of_ports br0 1 2
    -
    -AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info])
    -
    -AT_DATA([flows.txt], [dnl
    -ipv6,ct_state=-trk,action=ct(table=0,zone=0)
    -ct_state=+trk,action=controller
    -])
    -
    -AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
    -
    -AT_CAPTURE_FILE([ofctl_monitor.log])
    -AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach --no-chdir --pidfile 2> ofctl_monitor.log])
    -
    -AT_CHECK([ovs-appctl time/stop])
    -
    -AT_CHECK([ovs-appctl netdev-dummy/receive p2 '0060970769ea0000860580da86dd6000000000203afffe80000000000000020086fffe0580dafe80000000000000026097fffe0769ea870068bd00000000fe80000000000000026097fffe0769ea01010000860580da'])
    -
    -OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 1])
    -OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
    -
    -AT_CHECK([cat ofctl_monitor.log], [0], [dnl
    -NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=86 ct_state=inv|trk,in_port=2 (via action) data_len=86 (unbuffered)
    -icmp6,vlan_tci=0x0000,dl_src=00:00:86:05:80:da,dl_dst=00:60:97:07:69:ea,ipv6_src=fe80::200:86ff:fe05:80da,ipv6_dst=fe80::260:97ff:fe07:69ea,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fe80::260:97ff:fe07:69ea,nd_sll=00:00:86:05:80:da,nd_tll=00:00:00:00:00:00 icmp6_csum:68bd
    -])
    -
    -OVS_VSWITCHD_STOP
    -AT_CLEANUP
    -
     AT_SETUP([ofproto-dpif - conntrack - zones])
     OVS_VSWITCHD_START
     
    diff --git a/tests/system-traffic.at b/tests/system-traffic.at
    index a5023d3..7d86c81 100644
    --- a/tests/system-traffic.at
    +++ b/tests/system-traffic.at
    @@ -740,6 +740,41 @@ icmpv6,orig=(src=fc00::1,dst=fc00::2,id=<cleared>,type=128,code=0),reply=(src=fc
     OVS_TRAFFIC_VSWITCHD_STOP
     AT_CLEANUP
     
    +AT_SETUP([conntrack - neighbor discovery])
    +CHECK_CONNTRACK()
    +OVS_TRAFFIC_VSWITCHD_START()

Maybe create 2 ns and veths and then send packets b/w them
Maybe dump-flows to check the stats as well


    +
    +dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0.
    +AT_DATA([flows.txt], [dnl
    +table=0,priority=1,action=drop
    +table=0,priority=100,ip6,action=ct(table=1)
    +dnl
    +table=1,priority=100,ip6,action=controller
    +])
    +
    +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
    +
    +AT_CAPTURE_FILE([ofctl_monitor.log])
    +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
    +
    +dnl e6:6f:38:26:34:c4 > 33:33:ff:00:00:02, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::1 > ff02::1:ff00:2: [icmp6 sum ok] ICMP6, neighbor solicitation, length 32, who has fc00::2
    +dnl     source link-address option (1), length 8 (1): e6:6f:38:26:34:c4
    +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' '3333ff000002e66f382634c486dd6000000000203afffc000000000000000000000000000001ff0200000000000000000001ff00000287002e3e00000000fc0000000000000000000000000000020101e66f382634c4'])
    +
    +dnl 56:5b:b3:68:8f:09 > e6:6f:38:26:34:c4, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::2 > fc00::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 32, tgt is fc00::2, Flags [solicited, override]
    +dnl     destination link-address option (2), length 8 (1): 56:5b:b3:68:8f:09
    +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' 'e66f382634c4565bb3688f0986dd6000000000203afffc000000000000000000000000000002fc000000000000000000000000000001880088ce60000000fc0000000000000000000000000000020201565bb3688f09'])
    +
    +AT_CHECK([cat ofctl_monitor.log], [0], [dnl
    +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action)
     data_len=86 (unbuffered)

+icmp6,vlan_tci=0x0000,dl_src=e6:6f:38:26:34:c4,dl_dst=33:33:ff:00:00:02,ipv6_src=fc00::1,ipv6_dst=ff02::1:ff00:2,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fc00::2,nd_sll=e6:6f:38:26:34:c4,nd_tll=00:00:00:00:00:00 icmp6_csum:2e3e
    +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action) data_len=86 (unbuffered)
+icmp6,vlan_tci=0x0000,dl_src=56:5b:b3:68:8f:09,dl_dst=e6:6f:38:26:34:c4,ipv6_src=fc00::2,ipv6_dst=fc00::1,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=136,icmp_code=0,nd_target=fc00::2,nd_sll=00:00:00:00:00:00,nd_tll=56:5b:b3:68:8f:09 icmp6_csum:88ce
    +])
    +
    +OVS_TRAFFIC_VSWITCHD_STOP
    +AT_CLEANUP
    +
     AT_SETUP([conntrack - preserve registers])
     CHECK_CONNTRACK()
     OVS_TRAFFIC_VSWITCHD_START()
    -- 
    2.10.2
Daniele Di Proietto Dec. 24, 2016, 1:31 a.m. UTC | #2
On 22/12/2016 21:20, "Darrell Ball" <dball@vmware.com> wrote:

>Some comments inline

Thanks for the review, I've sent a v3

>
>On 12/22/16, 6:36 PM, "Daniele Di Proietto" <diproiettod@vmware.com> wrote:
>
>    The userspace connection tracker treats Neighbor Discovery packets
>    as invalid, because they're not checked against any connection.
>    
>    This in inconsistent with the kernel connection tracker which always
>    returns 'CS_NEW'.
>    
>    Therefore, this commit makes the userspace connection tracker conforming
>    with the kernel.  ND packets still do not create or read any state, but
>    they're treated as NEW.
>    
>    To support this, the key extraction functions can now return
>    KEY_NO_TRACK, meaning that the packet is ok, but it should be treated
>    statelessly.
>
>
>s/    To support this, the key extraction functions can now return
>    KEY_NO_TRACK, meaning that the packet is ok, but it should be treated
>    statelessly.
>/
>    To support this, the key extraction functions can now return
>    KEY_NO_TRACK, meaning the packet should be treated statelessly
>    and not be sent to the connection tracker.
>/

ok, changed

>
>
>    We also have to remove a test that explicitly checked that neighbor
>    discovery was treated as invalid.
>    
>    Reported-by: Sridhar Gaddam <sgaddam@redhat.com>
>    Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
>    ---
>    v2: Update comment to reflect that we do not do special validation with
>        the packet.
>    ---
>     lib/conntrack.c         | 134 ++++++++++++++++++++++++++++++++----------------
>     tests/ofproto-dpif.at   |  32 ------------
>     tests/system-traffic.at |  35 +++++++++++++
>     3 files changed, 125 insertions(+), 76 deletions(-)
>    
>    diff --git a/lib/conntrack.c b/lib/conntrack.c
>    index 9bea3d9..86228d6 100644
>    --- a/lib/conntrack.c
>    +++ b/lib/conntrack.c
>    @@ -52,9 +52,17 @@ struct conn_lookup_ctx {
>         bool related;
>     };
>     
>    -static bool conn_key_extract(struct conntrack *, struct dp_packet *,
>    -                             ovs_be16 dl_type, struct conn_lookup_ctx *,
>    -                             uint16_t zone);
>    +enum key_status {
>    +    KEY_INVALID,   /* Could not extract the connection key: invalid. */
>    +    KEY_OK,        /* Connection key is ok. */
>    +    KEY_NO_TRACK,  /* Connection key is ok, but it should not be tracked. */
>
>
>KEY_NO_TRACK,  /* Connection key should not be tracked. */

ok

>
>
>    +};
>    +
>    +static enum key_status conn_key_extract(struct conntrack *,
>    +                                        struct dp_packet *,
>    +                                        ovs_be16 dl_type,
>    +                                        struct conn_lookup_ctx *,
>    +                                        uint16_t zone);
>     static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
>     static void conn_key_reverse(struct conn_key *);
>     static void conn_key_lookup(struct conntrack_bucket *ctb,
>    @@ -157,6 +165,20 @@ static unsigned hash_to_bucket(uint32_t hash)
>         return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
>     }
>     
>    +static uint16_t
>    +key_status_to_cs(enum key_status s)
>    +{
>    +    switch (s) {
>    +    case KEY_INVALID:
>    +        return CS_INVALID;
>    +    case KEY_OK:
>    +    case KEY_NO_TRACK:
>    +        return CS_NEW;
>    +    default:
>    +        OVS_NOT_REACHED();
>    +    }
>    +}
>    +
>     static void
>     write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
>                 uint32_t mark, ovs_u128 label)
>    @@ -303,10 +325,13 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
>     
>         memset(bucket_list, INT8_C(-1), sizeof bucket_list);
>         for (i = 0; i < cnt; i++) {
>    +        enum key_status extract_res;
>             unsigned bucket;
>     
>    -        if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {
>    -            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
>    +        extract_res = conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone);
>    +        if (extract_res != KEY_OK) {
>    +            write_ct_md(pkts[i], key_status_to_cs(extract_res), zone, 0,
>    +                        OVS_U128_ZERO);
>                 continue;
>             }
>     
>    @@ -693,8 +718,11 @@ extract_l4_udp(struct conn_key *key, const void *data, size_t size)
>         return key->src.port && key->dst.port;
>     }
>     
>    -static inline bool extract_l4(struct conn_key *key, const void *data,
>    -                              size_t size, bool *related, const void *l3);
>    +static inline enum key_status extract_l4(struct conn_key *key,
>    +                                         const void *data,
>    +                                         size_t size,
>    +                                         bool *related,
>    +                                         const void *l3);
>     
>     static uint8_t
>     reverse_icmp_type(uint8_t type)
>    @@ -724,14 +752,14 @@ reverse_icmp_type(uint8_t type)
>      * instead and set *related to true.  If 'related' is NULL we're
>      * already processing a nested header and no such recursion is
>      * possible */
>    -static inline int
>    +static inline enum key_status
>     extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
>                     bool *related)
>     {
>         const struct icmp_header *icmp = data;
>     
>         if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
>    -        return false;
>    +        return KEY_INVALID;
>         }
>     
>         switch (icmp->icmp_type) {
>    @@ -742,13 +770,15 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
>         case ICMP4_INFOREQUEST:
>         case ICMP4_INFOREPLY:
>             if (icmp->icmp_code != 0) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>             /* Separate ICMP connection: identified using id */
>             key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
>             key->src.icmp_type = icmp->icmp_type;
>             key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
>    -        break;
>    +
>    +        return KEY_OK;
>    +
>         case ICMP4_DST_UNREACH:
>         case ICMP4_TIME_EXCEEDED:
>         case ICMP4_PARAM_PROB:
>    @@ -760,41 +790,40 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
>             const char *l3 = (const char *) (icmp + 1);
>             const char *tail = (const char *) data + size;
>             const char *l4;
>    +        enum key_status res;
>             bool ok;
>     
>             if (!related) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>     
>             memset(&inner_key, 0, sizeof inner_key);
>             inner_key.dl_type = htons(ETH_TYPE_IP);
>             ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
>             if (!ok) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>     
>             /* pf doesn't do this, but it seems a good idea */
>             if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
>                 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>     
>             key->src = inner_key.src;
>             key->dst = inner_key.dst;
>             key->nw_proto = inner_key.nw_proto;
>     
>    -        ok = extract_l4(key, l4, tail - l4, NULL, l3);
>    -        if (ok) {
>    +        res = extract_l4(key, l4, tail - l4, NULL, l3);
>    +        if (res != KEY_INVALID) {
>                 conn_key_reverse(key);
>                 *related = true;
>             }
>    -        return ok;
>    +        return res;
>         }
>         default:
>    -        return false;
>    +        return KEY_INVALID;
>         }
>    -
>    -    return true;
>     }
>     
>     static uint8_t
>    @@ -815,7 +844,7 @@ reverse_icmp6_type(uint8_t type)
>      * instead and set *related to true.  If 'related' is NULL we're
>      * already processing a nested header and no such recursion is
>      * possible */
>    -static inline bool
>    +static inline enum key_status
>     extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
>                      bool *related)
>     {
>    @@ -824,20 +853,22 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
>         /* All the messages that we support need at least 4 bytes after
>          * the header */
>         if (size < sizeof *icmp6 + 4) {
>    -        return false;
>    +        return KEY_INVALID;
>         }
>     
>         switch (icmp6->icmp6_type) {
>         case ICMP6_ECHO_REQUEST:
>         case ICMP6_ECHO_REPLY:
>             if (icmp6->icmp6_code != 0) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>             /* Separate ICMP connection: identified using id */
>             key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
>             key->src.icmp_type = icmp6->icmp6_type;
>             key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
>    -        break;
>    +
>    +        return KEY_OK;
>    +
>         case ICMP6_DST_UNREACH:
>         case ICMP6_PACKET_TOO_BIG:
>         case ICMP6_TIME_EXCEEDED:
>    @@ -848,17 +879,18 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
>             const char *l3 = (const char *) icmp6 + 8;
>             const char *tail = (const char *) data + size;
>             const char *l4 = NULL;
>    +        enum key_status res;
>             bool ok;
>     
>             if (!related) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>     
>             memset(&inner_key, 0, sizeof inner_key);
>             inner_key.dl_type = htons(ETH_TYPE_IPV6);
>             ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
>             if (!ok) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>     
>             /* pf doesn't do this, but it seems a good idea */
>    @@ -866,25 +898,32 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
>                                   &key->dst.addr.ipv6_aligned)
>                 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
>                                      &key->src.addr.ipv6_aligned)) {
>    -            return false;
>    +            return KEY_INVALID;
>             }
>     
>             key->src = inner_key.src;
>             key->dst = inner_key.dst;
>             key->nw_proto = inner_key.nw_proto;
>     
>    -        ok = extract_l4(key, l4, tail - l4, NULL, l3);
>    -        if (ok) {
>    +        res = extract_l4(key, l4, tail - l4, NULL, l3);
>    +        if (res != KEY_INVALID) {
>                 conn_key_reverse(key);
>                 *related = true;
>             }
>    -        return ok;
>    +        return res;
>         }
>    +    case ND_NEIGHBOR_SOLICIT:
>    +    case ND_NEIGHBOR_ADVERT:
>    +    case ND_ROUTER_SOLICIT:
>    +    case ND_ROUTER_ADVERT:
>
>case ND_REDIRECT:

I created a testcase with some ND REDIRECTS and apparently the kernel connection tracker rejects them.

Here's the testcase:

---8<---
AT_SETUP([datapath - ipv6 router with redirect])
CHECK_CONNTRACK()
CHECK_CONNTRACK_FRAG()
OVS_TRAFFIC_VSWITCHD_START()

ADD_NAMESPACES(at_ns1, at_ns2, at_ns3)

ADD_VETH(p1, at_ns1, br0, "fc00::1/96")
ADD_VETH(p2, at_ns2, br0, "fc00::2/96")
ADD_VETH(p3, at_ns3, br0, "fc00::3/96")

dnl Sending ping through conntrack
AT_DATA([flows.txt], [dnl
table=0,priority=100,ip,action=ct(table=1)
table=0,priority=100,ip6,action=ct(table=1)
table=0,priority=10,action=resubmit(,1)
dnl
table=1,priority=100,action=normal,resubmit(,2)
dnl
table=2,priority=100,icmp6,icmp_type=137,action=controller
])

AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])

dnl Linux seems to take a little time to get its IPv6 stack in order. Without
dnl waiting, we get occasional failures due to the following error:
dnl "connect: Cannot assign requested address"
OVS_WAIT_UNTIL([ip netns exec at_ns1 ping6 -c 1 fc00::2])
OVS_WAIT_UNTIL([ip netns exec at_ns2 ping6 -c 1 fc00::3])

rm p?.pcap
tcpdump -U -i ovs-p1 -w p1.pcap &
tcpdump -U -i ovs-p2 -w p2.pcap &
tcpdump -U -i ovs-p3 -w p3.pcap &

AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])

NS_CHECK_EXEC([at_ns2], [sysctl -w net.ipv6.conf.all.forwarding=1], [0], [dnl
net.ipv6.conf.all.forwarding = 1
])
NS_CHECK_EXEC([at_ns1], [ip -6 route add fc00::3/128 via fc00::2 metric 1])
NS_EXEC([at_ns1], [ip -6 route show])

NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl
3 packets transmitted, 3 received, 0% packet loss, time 0ms
])

sleep 2

OVS_TRAFFIC_VSWITCHD_STOP
AT_CLEANUP

---8<---



When I execute it I find a ND_REDIRECT packet in ofctl_monitor.log marked as invalid.

You can also see what happens by looking at the pcap files.


>
>    +        if (icmp6->icmp6_code != 0) {
>    +            return KEY_INVALID;
>    +        }
>    +        /* This packet is not going to be tracked. */
>    +        return KEY_NO_TRACK;
>         default:
>    -        return false;
>    +        return KEY_INVALID;
>         }
>    -
>    -    return true;
>     }
>     
>     /* Extract l4 fields into 'key', which must already contain valid l3
>    @@ -896,30 +935,34 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
>      *
>      * If 'related' is NULL, it means that we're already parsing a header nested
>      * in an ICMP error.  In this case, we skip checksum and length validation. */
>    -static inline bool
>    +static inline enum key_status
>     extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
>                const void *l3)
>     {
>         if (key->nw_proto == IPPROTO_TCP) {
>             return (!related || check_l4_tcp(key, data, size, l3))
>    -               && extract_l4_tcp(key, data, size);
>    +               ? extract_l4_tcp(key, data, size)
>    +               : KEY_INVALID;
>         } else if (key->nw_proto == IPPROTO_UDP) {
>             return (!related || check_l4_udp(key, data, size, l3))
>    -               && extract_l4_udp(key, data, size);
>    +               ? extract_l4_udp(key, data, size)
>    +               : KEY_INVALID;
>         } else if (key->dl_type == htons(ETH_TYPE_IP)
>                    && key->nw_proto == IPPROTO_ICMP) {
>             return (!related || check_l4_icmp(data, size))
>    -               && extract_l4_icmp(key, data, size, related);
>    +               ? extract_l4_icmp(key, data, size, related)
>    +               : KEY_INVALID;
>         } else if (key->dl_type == htons(ETH_TYPE_IPV6)
>                    && key->nw_proto == IPPROTO_ICMPV6) {
>             return (!related || check_l4_icmp6(key, data, size, l3))
>    -               && extract_l4_icmp6(key, data, size, related);
>    +               ? extract_l4_icmp6(key, data, size, related)
>    +               : KEY_INVALID;
>         } else {
>    -        return false;
>    +        return KEY_INVALID;
>         }
>     }
>     
>    -static bool
>    +static enum key_status
>     conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
>                      struct conn_lookup_ctx *ctx, uint16_t zone)
>     {
>    @@ -932,7 +975,7 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
>         memset(ctx, 0, sizeof *ctx);
>     
>         if (!l2 || !l3 || !l4) {
>    -        return false;
>    +        return KEY_INVALID;
>         }
>     
>         ctx->key.zone = zone;
>    @@ -979,13 +1022,16 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
>         }
>     
>         if (ok) {
>    -        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
>    +        enum key_status res;
>    +
>    +        res = extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3);
>    +        if (res == KEY_OK) {
>                 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
>    -            return true;
>             }
>    +        return res;
>         }
>     
>    -    return false;
>    +    return KEY_INVALID;
>     }
>     ?
>     /* Symmetric */
>    diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
>    index 5f594be..5b70032 100644
>    --- a/tests/ofproto-dpif.at
>    +++ b/tests/ofproto-dpif.at
>    @@ -8626,38 +8626,6 @@ OFPT_ECHO_REQUEST (xid=0x0): 0 bytes of payload
>     OVS_VSWITCHD_STOP
>     AT_CLEANUP
>     
>    -AT_SETUP([ofproto-dpif - conntrack - untrackable traffic])
>    -OVS_VSWITCHD_START
>    -
>    -add_of_ports br0 1 2
>    -
>    -AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info])
>    -
>    -AT_DATA([flows.txt], [dnl
>    -ipv6,ct_state=-trk,action=ct(table=0,zone=0)
>    -ct_state=+trk,action=controller
>    -])
>    -
>    -AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
>    -
>    -AT_CAPTURE_FILE([ofctl_monitor.log])
>    -AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach --no-chdir --pidfile 2> ofctl_monitor.log])
>    -
>    -AT_CHECK([ovs-appctl time/stop])
>    -
>    -AT_CHECK([ovs-appctl netdev-dummy/receive p2 '0060970769ea0000860580da86dd6000000000203afffe80000000000000020086fffe0580dafe80000000000000026097fffe0769ea870068bd00000000fe80000000000000026097fffe0769ea01010000860580da'])
>    -
>    -OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 1])
>    -OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
>    -
>    -AT_CHECK([cat ofctl_monitor.log], [0], [dnl
>    -NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=86 ct_state=inv|trk,in_port=2 (via action) data_len=86 (unbuffered)
>    -icmp6,vlan_tci=0x0000,dl_src=00:00:86:05:80:da,dl_dst=00:60:97:07:69:ea,ipv6_src=fe80::200:86ff:fe05:80da,ipv6_dst=fe80::260:97ff:fe07:69ea,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fe80::260:97ff:fe07:69ea,nd_sll=00:00:86:05:80:da,nd_tll=00:00:00:00:00:00 icmp6_csum:68bd
>    -])
>    -
>    -OVS_VSWITCHD_STOP
>    -AT_CLEANUP
>    -
>     AT_SETUP([ofproto-dpif - conntrack - zones])
>     OVS_VSWITCHD_START
>     
>    diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>    index a5023d3..7d86c81 100644
>    --- a/tests/system-traffic.at
>    +++ b/tests/system-traffic.at
>    @@ -740,6 +740,41 @@ icmpv6,orig=(src=fc00::1,dst=fc00::2,id=<cleared>,type=128,code=0),reply=(src=fc
>     OVS_TRAFFIC_VSWITCHD_STOP
>     AT_CLEANUP
>     
>    +AT_SETUP([conntrack - neighbor discovery])
>    +CHECK_CONNTRACK()
>    +OVS_TRAFFIC_VSWITCHD_START()
>
>Maybe create 2 ns and veths and then send packets b/w them

I was reluctant to do that because it's more complicated, but since you think it's a good idea I've added a second part to the testcase where we send traffic between veth pairs.

>Maybe dump-flows to check the stats as well

With veth pairs it's going to be hard to check the stats, there are too many things not under out control.

Without veth pairs we only send two packets, they all hit the same flows and we check all of them at the controller, so I'm not sure it really adds value. 

>
>
>    +
>    +dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0.
>    +AT_DATA([flows.txt], [dnl
>    +table=0,priority=1,action=drop
>    +table=0,priority=100,ip6,action=ct(table=1)
>    +dnl
>    +table=1,priority=100,ip6,action=controller
>    +])
>    +
>    +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
>    +
>    +AT_CAPTURE_FILE([ofctl_monitor.log])
>    +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
>    +
>    +dnl e6:6f:38:26:34:c4 > 33:33:ff:00:00:02, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::1 > ff02::1:ff00:2: [icmp6 sum ok] ICMP6, neighbor solicitation, length 32, who has fc00::2
>    +dnl     source link-address option (1), length 8 (1): e6:6f:38:26:34:c4
>    +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' '3333ff000002e66f382634c486dd6000000000203afffc000000000000000000000000000001ff0200000000000000000001ff00000287002e3e00000000fc0000000000000000000000000000020101e66f382634c4'])
>    +
>    +dnl 56:5b:b3:68:8f:09 > e6:6f:38:26:34:c4, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::2 > fc00::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 32, tgt is fc00::2, Flags [solicited, override]
>    +dnl     destination link-address option (2), length 8 (1): 56:5b:b3:68:8f:09
>    +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' 'e66f382634c4565bb3688f0986dd6000000000203afffc000000000000000000000000000002fc000000000000000000000000000001880088ce60000000fc0000000000000000000000000000020201565bb3688f09'])
>    +
>    +AT_CHECK([cat ofctl_monitor.log], [0], [dnl
>    +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action)
>     data_len=86 (unbuffered)
>
>+icmp6,vlan_tci=0x0000,dl_src=e6:6f:38:26:34:c4,dl_dst=33:33:ff:00:00:02,ipv6_src=fc00::1,ipv6_dst=ff02::1:ff00:2,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fc00::2,nd_sll=e6:6f:38:26:34:c4,nd_tll=00:00:00:00:00:00 icmp6_csum:2e3e
>    +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action) data_len=86 (unbuffered)
>+icmp6,vlan_tci=0x0000,dl_src=56:5b:b3:68:8f:09,dl_dst=e6:6f:38:26:34:c4,ipv6_src=fc00::2,ipv6_dst=fc00::1,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=136,icmp_code=0,nd_target=fc00::2,nd_sll=00:00:00:00:00:00,nd_tll=56:5b:b3:68:8f:09 icmp6_csum:88ce
>    +])
>    +
>    +OVS_TRAFFIC_VSWITCHD_STOP
>    +AT_CLEANUP
>    +
>     AT_SETUP([conntrack - preserve registers])
>     CHECK_CONNTRACK()
>     OVS_TRAFFIC_VSWITCHD_START()
>    -- 
>    2.10.2
>    
>    
>
Darrell Ball Jan. 3, 2017, 10:10 p.m. UTC | #3
On 12/23/16, 5:31 PM, "Daniele Di Proietto" <diproiettod@vmware.com> wrote:

    
    
    
    
    
    On 22/12/2016 21:20, "Darrell Ball" <dball@vmware.com> wrote:
    
    >Some comments inline

    
    Thanks for the review, I've sent a v3
    
    >

    >On 12/22/16, 6:36 PM, "Daniele Di Proietto" <diproiettod@vmware.com> wrote:

    >

    >    The userspace connection tracker treats Neighbor Discovery packets

    >    as invalid, because they're not checked against any connection.

    >    

    >    This in inconsistent with the kernel connection tracker which always

    >    returns 'CS_NEW'.

    >    

    >    Therefore, this commit makes the userspace connection tracker conforming

    >    with the kernel.  ND packets still do not create or read any state, but

    >    they're treated as NEW.

    >    

    >    To support this, the key extraction functions can now return

    >    KEY_NO_TRACK, meaning that the packet is ok, but it should be treated

    >    statelessly.

    >

    >

    >s/    To support this, the key extraction functions can now return

    >    KEY_NO_TRACK, meaning that the packet is ok, but it should be treated

    >    statelessly.

    >/

    >    To support this, the key extraction functions can now return

    >    KEY_NO_TRACK, meaning the packet should be treated statelessly

    >    and not be sent to the connection tracker.

    >/

    
    ok, changed
    
    >

    >

    >    We also have to remove a test that explicitly checked that neighbor

    >    discovery was treated as invalid.

    >    

    >    Reported-by: Sridhar Gaddam <sgaddam@redhat.com>

    >    Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

    >    ---

    >    v2: Update comment to reflect that we do not do special validation with

    >        the packet.

    >    ---

    >     lib/conntrack.c         | 134 ++++++++++++++++++++++++++++++++----------------

    >     tests/ofproto-dpif.at   |  32 ------------

    >     tests/system-traffic.at |  35 +++++++++++++

    >     3 files changed, 125 insertions(+), 76 deletions(-)

    >    

    >    diff --git a/lib/conntrack.c b/lib/conntrack.c

    >    index 9bea3d9..86228d6 100644

    >    --- a/lib/conntrack.c

    >    +++ b/lib/conntrack.c

    >    @@ -52,9 +52,17 @@ struct conn_lookup_ctx {

    >         bool related;

    >     };

    >     

    >    -static bool conn_key_extract(struct conntrack *, struct dp_packet *,

    >    -                             ovs_be16 dl_type, struct conn_lookup_ctx *,

    >    -                             uint16_t zone);

    >    +enum key_status {

    >    +    KEY_INVALID,   /* Could not extract the connection key: invalid. */

    >    +    KEY_OK,        /* Connection key is ok. */

    >    +    KEY_NO_TRACK,  /* Connection key is ok, but it should not be tracked. */

    >

    >

    >KEY_NO_TRACK,  /* Connection key should not be tracked. */

    
    ok
    
    >

    >

    >    +};

    >    +

    >    +static enum key_status conn_key_extract(struct conntrack *,

    >    +                                        struct dp_packet *,

    >    +                                        ovs_be16 dl_type,

    >    +                                        struct conn_lookup_ctx *,

    >    +                                        uint16_t zone);

    >     static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);

    >     static void conn_key_reverse(struct conn_key *);

    >     static void conn_key_lookup(struct conntrack_bucket *ctb,

    >    @@ -157,6 +165,20 @@ static unsigned hash_to_bucket(uint32_t hash)

    >         return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;

    >     }

    >     

    >    +static uint16_t

    >    +key_status_to_cs(enum key_status s)

    >    +{

    >    +    switch (s) {

    >    +    case KEY_INVALID:

    >    +        return CS_INVALID;

    >    +    case KEY_OK:

    >    +    case KEY_NO_TRACK:

    >    +        return CS_NEW;

    >    +    default:

    >    +        OVS_NOT_REACHED();

    >    +    }

    >    +}

    >    +

    >     static void

    >     write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,

    >                 uint32_t mark, ovs_u128 label)

    >    @@ -303,10 +325,13 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,

    >     

    >         memset(bucket_list, INT8_C(-1), sizeof bucket_list);

    >         for (i = 0; i < cnt; i++) {

    >    +        enum key_status extract_res;

    >             unsigned bucket;

    >     

    >    -        if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {

    >    -            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);

    >    +        extract_res = conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone);

    >    +        if (extract_res != KEY_OK) {

    >    +            write_ct_md(pkts[i], key_status_to_cs(extract_res), zone, 0,

    >    +                        OVS_U128_ZERO);

    >                 continue;

    >             }

    >     

    >    @@ -693,8 +718,11 @@ extract_l4_udp(struct conn_key *key, const void *data, size_t size)

    >         return key->src.port && key->dst.port;

    >     }

    >     

    >    -static inline bool extract_l4(struct conn_key *key, const void *data,

    >    -                              size_t size, bool *related, const void *l3);

    >    +static inline enum key_status extract_l4(struct conn_key *key,

    >    +                                         const void *data,

    >    +                                         size_t size,

    >    +                                         bool *related,

    >    +                                         const void *l3);

    >     

    >     static uint8_t

    >     reverse_icmp_type(uint8_t type)

    >    @@ -724,14 +752,14 @@ reverse_icmp_type(uint8_t type)

    >      * instead and set *related to true.  If 'related' is NULL we're

    >      * already processing a nested header and no such recursion is

    >      * possible */

    >    -static inline int

    >    +static inline enum key_status

    >     extract_l4_icmp(struct conn_key *key, const void *data, size_t size,

    >                     bool *related)

    >     {

    >         const struct icmp_header *icmp = data;

    >     

    >         if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {

    >    -        return false;

    >    +        return KEY_INVALID;

    >         }

    >     

    >         switch (icmp->icmp_type) {

    >    @@ -742,13 +770,15 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,

    >         case ICMP4_INFOREQUEST:

    >         case ICMP4_INFOREPLY:

    >             if (icmp->icmp_code != 0) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >             /* Separate ICMP connection: identified using id */

    >             key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;

    >             key->src.icmp_type = icmp->icmp_type;

    >             key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);

    >    -        break;

    >    +

    >    +        return KEY_OK;

    >    +

    >         case ICMP4_DST_UNREACH:

    >         case ICMP4_TIME_EXCEEDED:

    >         case ICMP4_PARAM_PROB:

    >    @@ -760,41 +790,40 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,

    >             const char *l3 = (const char *) (icmp + 1);

    >             const char *tail = (const char *) data + size;

    >             const char *l4;

    >    +        enum key_status res;

    >             bool ok;

    >     

    >             if (!related) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >     

    >             memset(&inner_key, 0, sizeof inner_key);

    >             inner_key.dl_type = htons(ETH_TYPE_IP);

    >             ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);

    >             if (!ok) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >     

    >             /* pf doesn't do this, but it seems a good idea */

    >             if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned

    >                 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >     

    >             key->src = inner_key.src;

    >             key->dst = inner_key.dst;

    >             key->nw_proto = inner_key.nw_proto;

    >     

    >    -        ok = extract_l4(key, l4, tail - l4, NULL, l3);

    >    -        if (ok) {

    >    +        res = extract_l4(key, l4, tail - l4, NULL, l3);

    >    +        if (res != KEY_INVALID) {

    >                 conn_key_reverse(key);

    >                 *related = true;

    >             }

    >    -        return ok;

    >    +        return res;

    >         }

    >         default:

    >    -        return false;

    >    +        return KEY_INVALID;

    >         }

    >    -

    >    -    return true;

    >     }

    >     

    >     static uint8_t

    >    @@ -815,7 +844,7 @@ reverse_icmp6_type(uint8_t type)

    >      * instead and set *related to true.  If 'related' is NULL we're

    >      * already processing a nested header and no such recursion is

    >      * possible */

    >    -static inline bool

    >    +static inline enum key_status

    >     extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

    >                      bool *related)

    >     {

    >    @@ -824,20 +853,22 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

    >         /* All the messages that we support need at least 4 bytes after

    >          * the header */

    >         if (size < sizeof *icmp6 + 4) {

    >    -        return false;

    >    +        return KEY_INVALID;

    >         }

    >     

    >         switch (icmp6->icmp6_type) {

    >         case ICMP6_ECHO_REQUEST:

    >         case ICMP6_ECHO_REPLY:

    >             if (icmp6->icmp6_code != 0) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >             /* Separate ICMP connection: identified using id */

    >             key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);

    >             key->src.icmp_type = icmp6->icmp6_type;

    >             key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);

    >    -        break;

    >    +

    >    +        return KEY_OK;

    >    +

    >         case ICMP6_DST_UNREACH:

    >         case ICMP6_PACKET_TOO_BIG:

    >         case ICMP6_TIME_EXCEEDED:

    >    @@ -848,17 +879,18 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

    >             const char *l3 = (const char *) icmp6 + 8;

    >             const char *tail = (const char *) data + size;

    >             const char *l4 = NULL;

    >    +        enum key_status res;

    >             bool ok;

    >     

    >             if (!related) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >     

    >             memset(&inner_key, 0, sizeof inner_key);

    >             inner_key.dl_type = htons(ETH_TYPE_IPV6);

    >             ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);

    >             if (!ok) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >     

    >             /* pf doesn't do this, but it seems a good idea */

    >    @@ -866,25 +898,32 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

    >                                   &key->dst.addr.ipv6_aligned)

    >                 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,

    >                                      &key->src.addr.ipv6_aligned)) {

    >    -            return false;

    >    +            return KEY_INVALID;

    >             }

    >     

    >             key->src = inner_key.src;

    >             key->dst = inner_key.dst;

    >             key->nw_proto = inner_key.nw_proto;

    >     

    >    -        ok = extract_l4(key, l4, tail - l4, NULL, l3);

    >    -        if (ok) {

    >    +        res = extract_l4(key, l4, tail - l4, NULL, l3);

    >    +        if (res != KEY_INVALID) {

    >                 conn_key_reverse(key);

    >                 *related = true;

    >             }

    >    -        return ok;

    >    +        return res;

    >         }

    >    +    case ND_NEIGHBOR_SOLICIT:

    >    +    case ND_NEIGHBOR_ADVERT:

    >    +    case ND_ROUTER_SOLICIT:

    >    +    case ND_ROUTER_ADVERT:

    >

    >case ND_REDIRECT:

    
    I created a testcase with some ND REDIRECTS and apparently the kernel connection tracker rejects them.
    
    Here's the testcase:
    
    ---8<---
    AT_SETUP([datapath - ipv6 router with redirect])
    CHECK_CONNTRACK()
    CHECK_CONNTRACK_FRAG()
    OVS_TRAFFIC_VSWITCHD_START()
    
    ADD_NAMESPACES(at_ns1, at_ns2, at_ns3)
    
    ADD_VETH(p1, at_ns1, br0, "fc00::1/96")
    ADD_VETH(p2, at_ns2, br0, "fc00::2/96")
    ADD_VETH(p3, at_ns3, br0, "fc00::3/96")
    
    dnl Sending ping through conntrack
    AT_DATA([flows.txt], [dnl
    table=0,priority=100,ip,action=ct(table=1)
    table=0,priority=100,ip6,action=ct(table=1)
    table=0,priority=10,action=resubmit(,1)
    dnl
    table=1,priority=100,action=normal,resubmit(,2)
    dnl
    table=2,priority=100,icmp6,icmp_type=137,action=controller
    ])
    
    AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
    
    dnl Linux seems to take a little time to get its IPv6 stack in order. Without
    dnl waiting, we get occasional failures due to the following error:
    dnl "connect: Cannot assign requested address"
    OVS_WAIT_UNTIL([ip netns exec at_ns1 ping6 -c 1 fc00::2])
    OVS_WAIT_UNTIL([ip netns exec at_ns2 ping6 -c 1 fc00::3])
    
    rm p?.pcap
    tcpdump -U -i ovs-p1 -w p1.pcap &
    tcpdump -U -i ovs-p2 -w p2.pcap &
    tcpdump -U -i ovs-p3 -w p3.pcap &
    
    AT_CAPTURE_FILE([ofctl_monitor.log])
    AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
    
    NS_CHECK_EXEC([at_ns2], [sysctl -w net.ipv6.conf.all.forwarding=1], [0], [dnl
    net.ipv6.conf.all.forwarding = 1
    ])
    NS_CHECK_EXEC([at_ns1], [ip -6 route add fc00::3/128 via fc00::2 metric 1])
    NS_EXEC([at_ns1], [ip -6 route show])
    
    NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl
    3 packets transmitted, 3 received, 0% packet loss, time 0ms
    ])
    
    sleep 2
    
    OVS_TRAFFIC_VSWITCHD_STOP
    AT_CLEANUP
    
    ---8<---
    
    
    
    When I execute it I find a ND_REDIRECT packet in ofctl_monitor.log marked as invalid.
    
    You can also see what happens by looking at the pcap files.


The new test is fine; I think we need a test for redirect here to delineate the
baseline behavior,

I tried some other tests; one test, albeit a contrived one, is below.
The baseline redirect association is not being handled and the
redirect is labelled as INV.

However, redirects have additional exploit potential and there would be a
need to disable their acceptance, which can be done using stateless 
protection.
Maybe we could add an enhancement later to handle the redirect
association to be used in combination with stateless protection.

AT_SETUP([datapath - ipv6 router with redirect 1])
CHECK_CONNTRACK()
CHECK_CONNTRACK_FRAG()
OVS_TRAFFIC_VSWITCHD_START()

ADD_NAMESPACES(at_ns1, at_ns2, at_ns3)

ADD_VETH(p1, at_ns1, br0, "fe80::1/10")
ADD_VETH(p2, at_ns2, br0, "fe80::2/10")
ADD_VETH(p3, at_ns3, br0, "fe80::3/10")

dnl Sending ping through conntrack
AT_DATA([flows.txt], [dnl
table=0,priority=100,ip6,action=ct(commit, table=1)
dnl
table=1,priority=100,action=normal,resubmit(,2)
dnl
table=2,priority=100,icmp6,icmp_type=137,action=controller
dnl
table=2,priority=100,icmp6,icmp_type=128,action=controller
dnl
table=2,priority=100,icmp6,icmp_type=129,action=controller
])

AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])

NS_CHECK_EXEC([at_ns1], [sysctl -w net.ipv6.conf.all.forwarding=1], [0], [dnl
net.ipv6.conf.all.forwarding = 1
])
NS_CHECK_EXEC([at_ns2], [sysctl -w net.ipv6.conf.all.forwarding=1], [0], [dnl
net.ipv6.conf.all.forwarding = 1
])
NS_CHECK_EXEC([at_ns3], [sysctl -w net.ipv6.conf.all.forwarding=1], [0], [dnl
net.ipv6.conf.all.forwarding = 1
])

OVS_WAIT_UNTIL([ip netns exec at_ns1 ping6 -c 1 -I p1 fe80::2])

AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor br0 65534 --detach --no-chdir --pidfile 2> ofctl_monitor.log])

rm p2.pcap
tcpdump -i ovs-p2 -w p2.pcap &

NS_CHECK_EXEC([at_ns1], [ip -6 route add fe80::3 via fe80::2 dev p1 metric 1])

NS_CHECK_EXEC([at_ns1], [ping6 -q -c 1 -i 0.3 -w 2 -I p1 fe80::3 | FORMAT_PING], [0], [dnl
1 packets transmitted, 1 received, 0% packet loss, time 0ms
])

sleep 2

OVS_TRAFFIC_VSWITCHD_STOP
AT_CLEANUP 


    
    
    >

    >    +        if (icmp6->icmp6_code != 0) {

    >    +            return KEY_INVALID;

    >    +        }

    >    +        /* This packet is not going to be tracked. */

    >    +        return KEY_NO_TRACK;

    >         default:

    >    -        return false;

    >    +        return KEY_INVALID;

    >         }

    >    -

    >    -    return true;

    >     }

    >     

    >     /* Extract l4 fields into 'key', which must already contain valid l3

    >    @@ -896,30 +935,34 @@ extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,

    >      *

    >      * If 'related' is NULL, it means that we're already parsing a header nested

    >      * in an ICMP error.  In this case, we skip checksum and length validation. */

    >    -static inline bool

    >    +static inline enum key_status

    >     extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,

    >                const void *l3)

    >     {

    >         if (key->nw_proto == IPPROTO_TCP) {

    >             return (!related || check_l4_tcp(key, data, size, l3))

    >    -               && extract_l4_tcp(key, data, size);

    >    +               ? extract_l4_tcp(key, data, size)

    >    +               : KEY_INVALID;

    >         } else if (key->nw_proto == IPPROTO_UDP) {

    >             return (!related || check_l4_udp(key, data, size, l3))

    >    -               && extract_l4_udp(key, data, size);

    >    +               ? extract_l4_udp(key, data, size)

    >    +               : KEY_INVALID;

    >         } else if (key->dl_type == htons(ETH_TYPE_IP)

    >                    && key->nw_proto == IPPROTO_ICMP) {

    >             return (!related || check_l4_icmp(data, size))

    >    -               && extract_l4_icmp(key, data, size, related);

    >    +               ? extract_l4_icmp(key, data, size, related)

    >    +               : KEY_INVALID;

    >         } else if (key->dl_type == htons(ETH_TYPE_IPV6)

    >                    && key->nw_proto == IPPROTO_ICMPV6) {

    >             return (!related || check_l4_icmp6(key, data, size, l3))

    >    -               && extract_l4_icmp6(key, data, size, related);

    >    +               ? extract_l4_icmp6(key, data, size, related)

    >    +               : KEY_INVALID;

    >         } else {

    >    -        return false;

    >    +        return KEY_INVALID;

    >         }

    >     }

    >     

    >    -static bool

    >    +static enum key_status

    >     conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,

    >                      struct conn_lookup_ctx *ctx, uint16_t zone)

    >     {

    >    @@ -932,7 +975,7 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,

    >         memset(ctx, 0, sizeof *ctx);

    >     

    >         if (!l2 || !l3 || !l4) {

    >    -        return false;

    >    +        return KEY_INVALID;

    >         }

    >     

    >         ctx->key.zone = zone;

    >    @@ -979,13 +1022,16 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,

    >         }

    >     

    >         if (ok) {

    >    -        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {

    >    +        enum key_status res;

    >    +

    >    +        res = extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3);

    >    +        if (res == KEY_OK) {

    >                 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);

    >    -            return true;

    >             }

    >    +        return res;

    >         }

    >     

    >    -    return false;

    >    +    return KEY_INVALID;

    >     }

    >     ?

    >     /* Symmetric */

    >    diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at

    >    index 5f594be..5b70032 100644

    >    --- a/tests/ofproto-dpif.at

    >    +++ b/tests/ofproto-dpif.at

    >    @@ -8626,38 +8626,6 @@ OFPT_ECHO_REQUEST (xid=0x0): 0 bytes of payload

    >     OVS_VSWITCHD_STOP

    >     AT_CLEANUP

    >     

    >    -AT_SETUP([ofproto-dpif - conntrack - untrackable traffic])

    >    -OVS_VSWITCHD_START

    >    -

    >    -add_of_ports br0 1 2

    >    -

    >    -AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info])

    >    -

    >    -AT_DATA([flows.txt], [dnl

    >    -ipv6,ct_state=-trk,action=ct(table=0,zone=0)

    >    -ct_state=+trk,action=controller

    >    -])

    >    -

    >    -AT_CHECK([ovs-ofctl add-flows br0 flows.txt])

    >    -

    >    -AT_CAPTURE_FILE([ofctl_monitor.log])

    >    -AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach --no-chdir --pidfile 2> ofctl_monitor.log])

    >    -

    >    -AT_CHECK([ovs-appctl time/stop])

    >    -

    >    -AT_CHECK([ovs-appctl netdev-dummy/receive p2 '0060970769ea0000860580da86dd6000000000203afffe80000000000000020086fffe0580dafe80000000000000026097fffe0769ea870068bd00000000fe80000000000000026097fffe0769ea01010000860580da'])

    >    -

    >    -OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 1])

    >    -OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])

    >    -

    >    -AT_CHECK([cat ofctl_monitor.log], [0], [dnl

    >    -NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=86 ct_state=inv|trk,in_port=2 (via action) data_len=86 (unbuffered)

    >    -icmp6,vlan_tci=0x0000,dl_src=00:00:86:05:80:da,dl_dst=00:60:97:07:69:ea,ipv6_src=fe80::200:86ff:fe05:80da,ipv6_dst=fe80::260:97ff:fe07:69ea,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fe80::260:97ff:fe07:69ea,nd_sll=00:00:86:05:80:da,nd_tll=00:00:00:00:00:00 icmp6_csum:68bd

    >    -])

    >    -

    >    -OVS_VSWITCHD_STOP

    >    -AT_CLEANUP

    >    -

    >     AT_SETUP([ofproto-dpif - conntrack - zones])

    >     OVS_VSWITCHD_START

    >     

    >    diff --git a/tests/system-traffic.at b/tests/system-traffic.at

    >    index a5023d3..7d86c81 100644

    >    --- a/tests/system-traffic.at

    >    +++ b/tests/system-traffic.at

    >    @@ -740,6 +740,41 @@ icmpv6,orig=(src=fc00::1,dst=fc00::2,id=<cleared>,type=128,code=0),reply=(src=fc

    >     OVS_TRAFFIC_VSWITCHD_STOP

    >     AT_CLEANUP

    >     

    >    +AT_SETUP([conntrack - neighbor discovery])

    >    +CHECK_CONNTRACK()

    >    +OVS_TRAFFIC_VSWITCHD_START()

    >

    >Maybe create 2 ns and veths and then send packets b/w them

    
    I was reluctant to do that because it's more complicated, but since you think it's a good idea I've added a
     second
    part to the testcase where we send traffic between veth pairs.

    
    >Maybe dump-flows to check the stats as well

    
    With veth pairs it's going to be hard to check the stats, there are too many things not under out control.
    
    Without veth pairs we only send two packets, they all hit the same flows and we check all of them at the
     controller,
     so I'm not sure it really adds value. 

That’s fine.
    
    >

    >

    >    +

    >    +dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0.

    >    +AT_DATA([flows.txt], [dnl

    >    +table=0,priority=1,action=drop

    >    +table=0,priority=100,ip6,action=ct(table=1)

    >    +dnl

    >    +table=1,priority=100,ip6,action=controller

    >    +])

    >    +

    >    +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])

    >    +

    >    +AT_CAPTURE_FILE([ofctl_monitor.log])

    >    +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])

    >    +

    >    +dnl e6:6f:38:26:34:c4 > 33:33:ff:00:00:02, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::1 > ff02::1:ff00:2: [icmp6 sum ok] ICMP6, neighbor solicitation, length 32, who has fc00::2

    >    +dnl     source link-address option (1), length 8 (1): e6:6f:38:26:34:c4

    >    +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' '3333ff000002e66f382634c486dd6000000000203afffc000000000000000000000000000001ff0200000000000000000001ff00000287002e3e00000000fc0000000000000000000000000000020101e66f382634c4'])

    >    +

    >    +dnl 56:5b:b3:68:8f:09 > e6:6f:38:26:34:c4, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::2 > fc00::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 32, tgt is fc00::2, Flags [solicited, override]

    >    +dnl     destination link-address option (2), length 8 (1): 56:5b:b3:68:8f:09

    >    +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' 'e66f382634c4565bb3688f0986dd6000000000203afffc000000000000000000000000000002fc000000000000000000000000000001880088ce60000000fc0000000000000000000000000000020201565bb3688f09'])

    >    +

    >    +AT_CHECK([cat ofctl_monitor.log], [0], [dnl

    >    +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action)

    >     data_len=86 (unbuffered)

    >

    

>+icmp6,vlan_tci=0x0000,dl_src=e6:6f:38:26:34:c4,dl_dst=33:33:ff:00:00:02,ipv6_src=fc00::1,ipv6_dst=ff02::1:ff00:2,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fc00::2,nd_sll=e6:6f:38:26:34:c4,nd_tll=00:00:00:00:00:00 icmp6_csum:2e3e

    >    +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action) data_len=86 (unbuffered)


The baseline behavior looks “non-ideal”.
As discussed offline, ct_state=new|trk seems non-specific.
Ideally, there should be another user level state like no_trk or passthru., for examples,
so the user is made aware of the behavior and can use a stateless approach.

    >+icmp6,vlan_tci=0x0000,dl_src=56:5b:b3:68:8f:09,dl_dst=e6:6f:38:26:34:c4,ipv6_src=fc00::2,ipv6_dst=fc00::1,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=136,icmp_code=0,nd_target=fc00::2,nd_sll=00:00:00:00:00:00,nd_tll=56:5b:b3:68:8f:09 icmp6_csum:88ce

    >    +])

    >    +

    >    +OVS_TRAFFIC_VSWITCHD_STOP

    >    +AT_CLEANUP

    >    +

    >     AT_SETUP([conntrack - preserve registers])

    >     CHECK_CONNTRACK()

    >     OVS_TRAFFIC_VSWITCHD_START()

    >    -- 

    >    2.10.2

    >    

    >    

    >
diff mbox

Patch

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 9bea3d9..86228d6 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -52,9 +52,17 @@  struct conn_lookup_ctx {
     bool related;
 };
 
-static bool conn_key_extract(struct conntrack *, struct dp_packet *,
-                             ovs_be16 dl_type, struct conn_lookup_ctx *,
-                             uint16_t zone);
+enum key_status {
+    KEY_INVALID,   /* Could not extract the connection key: invalid. */
+    KEY_OK,        /* Connection key is ok. */
+    KEY_NO_TRACK,  /* Connection key is ok, but it should not be tracked. */
+};
+
+static enum key_status conn_key_extract(struct conntrack *,
+                                        struct dp_packet *,
+                                        ovs_be16 dl_type,
+                                        struct conn_lookup_ctx *,
+                                        uint16_t zone);
 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
 static void conn_key_reverse(struct conn_key *);
 static void conn_key_lookup(struct conntrack_bucket *ctb,
@@ -157,6 +165,20 @@  static unsigned hash_to_bucket(uint32_t hash)
     return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
 }
 
+static uint16_t
+key_status_to_cs(enum key_status s)
+{
+    switch (s) {
+    case KEY_INVALID:
+        return CS_INVALID;
+    case KEY_OK:
+    case KEY_NO_TRACK:
+        return CS_NEW;
+    default:
+        OVS_NOT_REACHED();
+    }
+}
+
 static void
 write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
             uint32_t mark, ovs_u128 label)
@@ -303,10 +325,13 @@  conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
 
     memset(bucket_list, INT8_C(-1), sizeof bucket_list);
     for (i = 0; i < cnt; i++) {
+        enum key_status extract_res;
         unsigned bucket;
 
-        if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {
-            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
+        extract_res = conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone);
+        if (extract_res != KEY_OK) {
+            write_ct_md(pkts[i], key_status_to_cs(extract_res), zone, 0,
+                        OVS_U128_ZERO);
             continue;
         }
 
@@ -693,8 +718,11 @@  extract_l4_udp(struct conn_key *key, const void *data, size_t size)
     return key->src.port && key->dst.port;
 }
 
-static inline bool extract_l4(struct conn_key *key, const void *data,
-                              size_t size, bool *related, const void *l3);
+static inline enum key_status extract_l4(struct conn_key *key,
+                                         const void *data,
+                                         size_t size,
+                                         bool *related,
+                                         const void *l3);
 
 static uint8_t
 reverse_icmp_type(uint8_t type)
@@ -724,14 +752,14 @@  reverse_icmp_type(uint8_t type)
  * instead and set *related to true.  If 'related' is NULL we're
  * already processing a nested header and no such recursion is
  * possible */
-static inline int
+static inline enum key_status
 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                 bool *related)
 {
     const struct icmp_header *icmp = data;
 
     if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
-        return false;
+        return KEY_INVALID;
     }
 
     switch (icmp->icmp_type) {
@@ -742,13 +770,15 @@  extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
     case ICMP4_INFOREQUEST:
     case ICMP4_INFOREPLY:
         if (icmp->icmp_code != 0) {
-            return false;
+            return KEY_INVALID;
         }
         /* Separate ICMP connection: identified using id */
         key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
         key->src.icmp_type = icmp->icmp_type;
         key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
-        break;
+
+        return KEY_OK;
+
     case ICMP4_DST_UNREACH:
     case ICMP4_TIME_EXCEEDED:
     case ICMP4_PARAM_PROB:
@@ -760,41 +790,40 @@  extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
         const char *l3 = (const char *) (icmp + 1);
         const char *tail = (const char *) data + size;
         const char *l4;
+        enum key_status res;
         bool ok;
 
         if (!related) {
-            return false;
+            return KEY_INVALID;
         }
 
         memset(&inner_key, 0, sizeof inner_key);
         inner_key.dl_type = htons(ETH_TYPE_IP);
         ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
         if (!ok) {
-            return false;
+            return KEY_INVALID;
         }
 
         /* pf doesn't do this, but it seems a good idea */
         if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
             || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
-            return false;
+            return KEY_INVALID;
         }
 
         key->src = inner_key.src;
         key->dst = inner_key.dst;
         key->nw_proto = inner_key.nw_proto;
 
-        ok = extract_l4(key, l4, tail - l4, NULL, l3);
-        if (ok) {
+        res = extract_l4(key, l4, tail - l4, NULL, l3);
+        if (res != KEY_INVALID) {
             conn_key_reverse(key);
             *related = true;
         }
-        return ok;
+        return res;
     }
     default:
-        return false;
+        return KEY_INVALID;
     }
-
-    return true;
 }
 
 static uint8_t
@@ -815,7 +844,7 @@  reverse_icmp6_type(uint8_t type)
  * instead and set *related to true.  If 'related' is NULL we're
  * already processing a nested header and no such recursion is
  * possible */
-static inline bool
+static inline enum key_status
 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                  bool *related)
 {
@@ -824,20 +853,22 @@  extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
     /* All the messages that we support need at least 4 bytes after
      * the header */
     if (size < sizeof *icmp6 + 4) {
-        return false;
+        return KEY_INVALID;
     }
 
     switch (icmp6->icmp6_type) {
     case ICMP6_ECHO_REQUEST:
     case ICMP6_ECHO_REPLY:
         if (icmp6->icmp6_code != 0) {
-            return false;
+            return KEY_INVALID;
         }
         /* Separate ICMP connection: identified using id */
         key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
         key->src.icmp_type = icmp6->icmp6_type;
         key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
-        break;
+
+        return KEY_OK;
+
     case ICMP6_DST_UNREACH:
     case ICMP6_PACKET_TOO_BIG:
     case ICMP6_TIME_EXCEEDED:
@@ -848,17 +879,18 @@  extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
         const char *l3 = (const char *) icmp6 + 8;
         const char *tail = (const char *) data + size;
         const char *l4 = NULL;
+        enum key_status res;
         bool ok;
 
         if (!related) {
-            return false;
+            return KEY_INVALID;
         }
 
         memset(&inner_key, 0, sizeof inner_key);
         inner_key.dl_type = htons(ETH_TYPE_IPV6);
         ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
         if (!ok) {
-            return false;
+            return KEY_INVALID;
         }
 
         /* pf doesn't do this, but it seems a good idea */
@@ -866,25 +898,32 @@  extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                               &key->dst.addr.ipv6_aligned)
             || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                  &key->src.addr.ipv6_aligned)) {
-            return false;
+            return KEY_INVALID;
         }
 
         key->src = inner_key.src;
         key->dst = inner_key.dst;
         key->nw_proto = inner_key.nw_proto;
 
-        ok = extract_l4(key, l4, tail - l4, NULL, l3);
-        if (ok) {
+        res = extract_l4(key, l4, tail - l4, NULL, l3);
+        if (res != KEY_INVALID) {
             conn_key_reverse(key);
             *related = true;
         }
-        return ok;
+        return res;
     }
+    case ND_NEIGHBOR_SOLICIT:
+    case ND_NEIGHBOR_ADVERT:
+    case ND_ROUTER_SOLICIT:
+    case ND_ROUTER_ADVERT:
+        if (icmp6->icmp6_code != 0) {
+            return KEY_INVALID;
+        }
+        /* This packet is not going to be tracked. */
+        return KEY_NO_TRACK;
     default:
-        return false;
+        return KEY_INVALID;
     }
-
-    return true;
 }
 
 /* Extract l4 fields into 'key', which must already contain valid l3
@@ -896,30 +935,34 @@  extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
  *
  * If 'related' is NULL, it means that we're already parsing a header nested
  * in an ICMP error.  In this case, we skip checksum and length validation. */
-static inline bool
+static inline enum key_status
 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
            const void *l3)
 {
     if (key->nw_proto == IPPROTO_TCP) {
         return (!related || check_l4_tcp(key, data, size, l3))
-               && extract_l4_tcp(key, data, size);
+               ? extract_l4_tcp(key, data, size)
+               : KEY_INVALID;
     } else if (key->nw_proto == IPPROTO_UDP) {
         return (!related || check_l4_udp(key, data, size, l3))
-               && extract_l4_udp(key, data, size);
+               ? extract_l4_udp(key, data, size)
+               : KEY_INVALID;
     } else if (key->dl_type == htons(ETH_TYPE_IP)
                && key->nw_proto == IPPROTO_ICMP) {
         return (!related || check_l4_icmp(data, size))
-               && extract_l4_icmp(key, data, size, related);
+               ? extract_l4_icmp(key, data, size, related)
+               : KEY_INVALID;
     } else if (key->dl_type == htons(ETH_TYPE_IPV6)
                && key->nw_proto == IPPROTO_ICMPV6) {
         return (!related || check_l4_icmp6(key, data, size, l3))
-               && extract_l4_icmp6(key, data, size, related);
+               ? extract_l4_icmp6(key, data, size, related)
+               : KEY_INVALID;
     } else {
-        return false;
+        return KEY_INVALID;
     }
 }
 
-static bool
+static enum key_status
 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
                  struct conn_lookup_ctx *ctx, uint16_t zone)
 {
@@ -932,7 +975,7 @@  conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
     memset(ctx, 0, sizeof *ctx);
 
     if (!l2 || !l3 || !l4) {
-        return false;
+        return KEY_INVALID;
     }
 
     ctx->key.zone = zone;
@@ -979,13 +1022,16 @@  conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
     }
 
     if (ok) {
-        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
+        enum key_status res;
+
+        res = extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3);
+        if (res == KEY_OK) {
             ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
-            return true;
         }
+        return res;
     }
 
-    return false;
+    return KEY_INVALID;
 }
 
 /* Symmetric */
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 5f594be..5b70032 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -8626,38 +8626,6 @@  OFPT_ECHO_REQUEST (xid=0x0): 0 bytes of payload
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
-AT_SETUP([ofproto-dpif - conntrack - untrackable traffic])
-OVS_VSWITCHD_START
-
-add_of_ports br0 1 2
-
-AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info])
-
-AT_DATA([flows.txt], [dnl
-ipv6,ct_state=-trk,action=ct(table=0,zone=0)
-ct_state=+trk,action=controller
-])
-
-AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
-
-AT_CAPTURE_FILE([ofctl_monitor.log])
-AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach --no-chdir --pidfile 2> ofctl_monitor.log])
-
-AT_CHECK([ovs-appctl time/stop])
-
-AT_CHECK([ovs-appctl netdev-dummy/receive p2 '0060970769ea0000860580da86dd6000000000203afffe80000000000000020086fffe0580dafe80000000000000026097fffe0769ea870068bd00000000fe80000000000000026097fffe0769ea01010000860580da'])
-
-OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 1])
-OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
-
-AT_CHECK([cat ofctl_monitor.log], [0], [dnl
-NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=86 ct_state=inv|trk,in_port=2 (via action) data_len=86 (unbuffered)
-icmp6,vlan_tci=0x0000,dl_src=00:00:86:05:80:da,dl_dst=00:60:97:07:69:ea,ipv6_src=fe80::200:86ff:fe05:80da,ipv6_dst=fe80::260:97ff:fe07:69ea,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fe80::260:97ff:fe07:69ea,nd_sll=00:00:86:05:80:da,nd_tll=00:00:00:00:00:00 icmp6_csum:68bd
-])
-
-OVS_VSWITCHD_STOP
-AT_CLEANUP
-
 AT_SETUP([ofproto-dpif - conntrack - zones])
 OVS_VSWITCHD_START
 
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index a5023d3..7d86c81 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -740,6 +740,41 @@  icmpv6,orig=(src=fc00::1,dst=fc00::2,id=<cleared>,type=128,code=0),reply=(src=fc
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - neighbor discovery])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START()
+
+dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0.
+AT_DATA([flows.txt], [dnl
+table=0,priority=1,action=drop
+table=0,priority=100,ip6,action=ct(table=1)
+dnl
+table=1,priority=100,ip6,action=controller
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+AT_CAPTURE_FILE([ofctl_monitor.log])
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+dnl e6:6f:38:26:34:c4 > 33:33:ff:00:00:02, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::1 > ff02::1:ff00:2: [icmp6 sum ok] ICMP6, neighbor solicitation, length 32, who has fc00::2
+dnl     source link-address option (1), length 8 (1): e6:6f:38:26:34:c4
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' '3333ff000002e66f382634c486dd6000000000203afffc000000000000000000000000000001ff0200000000000000000001ff00000287002e3e00000000fc0000000000000000000000000000020101e66f382634c4'])
+
+dnl 56:5b:b3:68:8f:09 > e6:6f:38:26:34:c4, ethertype IPv6 (0x86dd), length 86: (hlim 255, next-header ICMPv6 (58) payload length: 32) fc00::2 > fc00::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 32, tgt is fc00::2, Flags [solicited, override]
+dnl     destination link-address option (2), length 8 (1): 56:5b:b3:68:8f:09
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 LOCAL 'resubmit(,0)' 'e66f382634c4565bb3688f0986dd6000000000203afffc000000000000000000000000000002fc000000000000000000000000000001880088ce60000000fc0000000000000000000000000000020201565bb3688f09'])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action) data_len=86 (unbuffered)
+icmp6,vlan_tci=0x0000,dl_src=e6:6f:38:26:34:c4,dl_dst=33:33:ff:00:00:02,ipv6_src=fc00::1,ipv6_dst=ff02::1:ff00:2,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=fc00::2,nd_sll=e6:6f:38:26:34:c4,nd_tll=00:00:00:00:00:00 icmp6_csum:2e3e
+NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=86 ct_state=new|trk,in_port=LOCAL (via action) data_len=86 (unbuffered)
+icmp6,vlan_tci=0x0000,dl_src=56:5b:b3:68:8f:09,dl_dst=e6:6f:38:26:34:c4,ipv6_src=fc00::2,ipv6_dst=fc00::1,ipv6_label=0x00000,nw_tos=0,nw_ecn=0,nw_ttl=255,icmp_type=136,icmp_code=0,nd_target=fc00::2,nd_sll=00:00:00:00:00:00,nd_tll=56:5b:b3:68:8f:09 icmp6_csum:88ce
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([conntrack - preserve registers])
 CHECK_CONNTRACK()
 OVS_TRAFFIC_VSWITCHD_START()