diff mbox series

[ovs-dev,4/4] ovn-controller: Handle DNAT/no-NAT conntrack tuple collisions.

Message ID 20210603150603.32741.64731.stgit@dceara.remote.csb
State Changes Requested
Headers show
Series Handle DNAT/no-NAT conntrack tuple collisions if possible. | expand

Commit Message

Dumitru Ceara June 3, 2021, 3:06 p.m. UTC
Assuming a load balancer, LB1, with:
- VIP: 42.42.42.42:4242
- backend: 42.42.42.1:2121

A client might connect to the backend either directly or through the
VIP.  If the first connection is via the VIP and the second connection
is direct but happens to use the same source L4 port, OVN should make
sure that the second connection is SNATed (source port translation) in
order to avoid a tuple collision at commit time.

For example:
1. Session via the VIP:
   - original packet: src=42.42.42.2:2000, dst=42.42.42.42:4242
   - after DNAT:      src=42.42.42.2:2000, dst=42.42.42.1:2121
2. Session directly connected to the backend:
   - original packet: src=42.42.42.2:2000, dst=42.42.42.1:2121
   - in acl stage committing this connection would fail.

In order to avoid this we now use the all-zero-ip NAT OVS feature when
committing connections in the ACL stage.  This translates to a no-op
SNAT when there's no tuple collision, and performs source port
translation when a tuple collision would happen.

We program flows to perform all-zero-ip NAT conditionally, only if the
datapath being used supports it.

Reported-at: https://bugzilla.redhat.com/1939676
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 controller/lflow.c            |    1 
 include/ovn/actions.h         |    4 +
 lib/actions.c                 |   34 ++++++++++
 tests/ovn.at                  |    2 -
 tests/system-common-macros.at |    4 +
 tests/system-ovn.at           |  138 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 180 insertions(+), 3 deletions(-)

Comments

Mark Gray June 4, 2021, 2:51 p.m. UTC | #1
On 03/06/2021 16:06, Dumitru Ceara wrote:
> Assuming a load balancer, LB1, with:
> - VIP: 42.42.42.42:4242
> - backend: 42.42.42.1:2121
> 
> A client might connect to the backend either directly or through the
> VIP.  If the first connection is via the VIP and the second connection
> is direct but happens to use the same source L4 port, OVN should make
> sure that the second connection is SNATed (source port translation) in
> order to avoid a tuple collision at commit time.
> 
> For example:
> 1. Session via the VIP:
>    - original packet: src=42.42.42.2:2000, dst=42.42.42.42:4242
>    - after DNAT:      src=42.42.42.2:2000, dst=42.42.42.1:2121
> 2. Session directly connected to the backend:
>    - original packet: src=42.42.42.2:2000, dst=42.42.42.1:2121
>    - in acl stage committing this connection would fail.
> 
> In order to avoid this we now use the all-zero-ip NAT OVS feature when
> committing connections in the ACL stage.  This translates to a no-op
> SNAT when there's no tuple collision, and performs source port
> translation when a tuple collision would happen.
> 
> We program flows to perform all-zero-ip NAT conditionally, only if the
> datapath being used supports it.
> 
> Reported-at: https://bugzilla.redhat.com/1939676
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> ---
>  controller/lflow.c            |    1 
>  include/ovn/actions.h         |    4 +
>  lib/actions.c                 |   34 ++++++++++
>  tests/ovn.at                  |    2 -
>  tests/system-common-macros.at |    4 +
>  tests/system-ovn.at           |  138 +++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 180 insertions(+), 3 deletions(-)
> 
> diff --git a/controller/lflow.c b/controller/lflow.c
> index 680b8cca1..af6a237d1 100644
> --- a/controller/lflow.c
> +++ b/controller/lflow.c
> @@ -582,6 +582,7 @@ add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
>          .is_switch = datapath_is_switch(dp),
>          .group_table = l_ctx_out->group_table,
>          .meter_table = l_ctx_out->meter_table,
> +        .dp_features = ovs_feature_support_get(),
>          .lflow_uuid = lflow->header_.uuid,
>  
>          .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
> index 040213177..35983498b 100644
> --- a/include/ovn/actions.h
> +++ b/include/ovn/actions.h
> @@ -25,6 +25,7 @@
>  #include "openvswitch/hmap.h"
>  #include "openvswitch/uuid.h"
>  #include "util.h"
> +#include "ovn/features.h"
>  
>  struct expr;
>  struct lexer;
> @@ -756,6 +757,9 @@ struct ovnact_encode_params {
>      /* A struct to figure out the meter_id for meter actions. */
>      struct ovn_extend_table *meter_table;
>  
> +    /* OVS datapath supported features. */
> +    enum ovs_feature_support dp_features;
> +
>      /* The logical flow uuid that drove this action. */
>      struct uuid lflow_uuid;
>  
> diff --git a/lib/actions.c b/lib/actions.c
> index b3433f49e..c8c2443ce 100644
> --- a/lib/actions.c
> +++ b/lib/actions.c
> @@ -732,7 +732,7 @@ format_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc, struct ds *s)
>  
>  static void
>  encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
> +                    const struct ovnact_encode_params *ep,
>                      struct ofpbuf *ofpacts)
>  {
>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
> @@ -742,6 +742,21 @@ encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>      ct->zone_src.ofs = 0;
>      ct->zone_src.n_bits = 16;
>  
> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
> +     * collisions at commit time between NATed and firewalled-only sessions.
> +     */
> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
> +        size_t nat_offset = ofpacts->size;
> +        ofpbuf_pull(ofpacts, nat_offset);
> +
> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
> +        nat->flags = 0;
> +        nat->range_af = AF_UNSPEC;
> +        nat->flags |= NX_NAT_F_SRC;
> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
> +        ct = ofpacts->header;
> +    }
> +
>      size_t set_field_offset = ofpacts->size;
>      ofpbuf_pull(ofpacts, set_field_offset);
>  
> @@ -780,7 +795,7 @@ format_CT_COMMIT_V2(const struct ovnact_nest *on, struct ds *s)
>  
>  static void
>  encode_CT_COMMIT_V2(const struct ovnact_nest *on,
> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
> +                    const struct ovnact_encode_params *ep,
>                      struct ofpbuf *ofpacts)
>  {
>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
> @@ -792,6 +807,21 @@ encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>      ct->zone_src.ofs = 0;
>      ct->zone_src.n_bits = 16;
>  
> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
> +     * collisions at commit time between NATed and firewalled-only sessions.
> +     */
> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
> +        size_t nat_offset = ofpacts->size;
> +        ofpbuf_pull(ofpacts, nat_offset);
> +
> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
> +        nat->flags = 0;
> +        nat->range_af = AF_UNSPEC;
> +        nat->flags |= NX_NAT_F_SRC;
> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
> +        ct = ofpacts->header;
> +    }
> +
>      size_t set_field_offset = ofpacts->size;
>      ofpbuf_pull(ofpacts, set_field_offset);
>  
> diff --git a/tests/ovn.at b/tests/ovn.at
> index f26894ce4..638aa5010 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -23109,7 +23109,7 @@ AT_CHECK([
>      for hv in 1 2; do
>          grep table=15 hv${hv}flows | \
>          grep "priority=100" | \
> -        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
> +        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],.*exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>  
>          grep table=22 hv${hv}flows | \
>          grep "priority=200" | \
> diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
> index c8fa6f03f..b742a2cb9 100644
> --- a/tests/system-common-macros.at
> +++ b/tests/system-common-macros.at
> @@ -330,3 +330,7 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP],
>  # OVS_CHECK_CT_CLEAR()
>  m4_define([OVS_CHECK_CT_CLEAR],
>      [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])])
> +
> +# OVS_CHECK_CT_ZERO_SNAT()
> +m4_define([OVS_CHECK_CT_ZERO_SNAT],
> +    [AT_SKIP_IF([! grep -q "Datapath supports ct_zero_snat" ovs-vswitchd.log])]))
> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> index 310bd3d5a..61d6bfc86 100644
> --- a/tests/system-ovn.at
> +++ b/tests/system-ovn.at
> @@ -5296,6 +5296,144 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>  AT_CLEANUP
>  ])
>  
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv4])
> +AT_KEYWORDS([ovnlb])
> +
> +CHECK_CONNTRACK()
> +CHECK_CONNTRACK_NAT()
> +ovn_start
> +OVS_TRAFFIC_VSWITCHD_START()
> +OVS_CHECK_CT_ZERO_SNAT()
> +ADD_BR([br-int])
> +
> +# Set external-ids in br-int needed for ovn-controller
> +ovs-vsctl \
> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
> +
> +# Start ovn-controller
> +start_daemon ovn-controller
> +
> +# Logical network:
> +# 1 logical switch connected to one logical router.
> +# 2 VMs, one used as backend for a load balancer.
> +
> +check ovn-nbctl                                                  \
> +    -- lr-add rtr                                                \
> +    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
> +    -- ls-add ls                                                 \
> +    -- lsp-add ls ls-rtr                                         \
> +    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
> +    -- lsp-set-type ls-rtr router                                \
> +    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
> +    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
> +    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
> +    -- lb-add lb-test 66.66.66.66:666 42.42.42.2:4242 tcp        \
> +    -- ls-lb-add ls lb-test

Will this work for a DNAT entry instead of a load-balancer? As you know,
the flows are slightly different. Will this work when combined with
forcing SNAT? Do we need to check this?

> +
> +ADD_NAMESPACES(vm1)
> +ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
> +
> +ADD_NAMESPACES(vm2)
> +ADD_VETH(vm2, vm2, br-int, "42.42.42.3/24", "00:00:00:00:00:02", "42.42.42.1")
> +
> +# Wait for ovn-controller to catch up.
> +wait_for_ports_up
> +ovn-nbctl --wait=hv sync
> +
> +# Start IPv4 TCP server on vm1.
> +NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc-vm1.pid])
> +
> +# Make sure connecting to the VIP works.
> +NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
> +
> +# Start IPv4 TCP connection to VIP from vm2.
> +NETNS_DAEMONIZE([vm2], [nc 66.66.66.66 666 -p 2001], [nc1-vm2.pid])

I feel like there should also be some kind of ping test to check that
the return path works as expected. Is there a reason why you did not do
that?

> +
> +# Check conntrack.
> +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(42.42.42.2)])
> +
> +# Start IPv4 TCP connection to backend IP from vm2 which would require
> +# additional source port translation to avoid a tuple conflict.
> +NS_CHECK_EXEC([vm2], [nc 42.42.42.2 4242 -p 2001 -z])

Should there be another check to ensure that the conntrack entries are
setup as expected?

> +
> +AT_CLEANUP
> +])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv6])
> +AT_KEYWORDS([ovnlb])
> +
> +CHECK_CONNTRACK()
> +CHECK_CONNTRACK_NAT()
> +ovn_start
> +OVS_TRAFFIC_VSWITCHD_START()
> +OVS_CHECK_CT_ZERO_SNAT()
> +ADD_BR([br-int])
> +
> +# Set external-ids in br-int needed for ovn-controller
> +ovs-vsctl \
> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
> +
> +# Start ovn-controller
> +start_daemon ovn-controller
> +
> +# Logical network:
> +# 1 logical switch connected to one logical router.
> +# 2 VMs, one used as backend for a load balancer.
> +
> +check ovn-nbctl                                                  \
> +    -- lr-add rtr                                                \
> +    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 4242::1/64           \
> +    -- ls-add ls                                                 \
> +    -- lsp-add ls ls-rtr                                         \
> +    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
> +    -- lsp-set-type ls-rtr router                                \
> +    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
> +    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
> +    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
> +    -- lb-add lb-test [[6666::1]]:666 [[4242::2]]:4242 tcp       \
> +    -- ls-lb-add ls lb-test
> +
> +ADD_NAMESPACES(vm1)
> +ADD_VETH(vm1, vm1, br-int, "4242::2/64", "00:00:00:00:00:01", "4242::1")
> +OVS_WAIT_UNTIL([test "$(ip netns exec vm1 ip a | grep 4242::2 | grep tentative)" = ""])
> +
> +ADD_NAMESPACES(vm2)
> +ADD_VETH(vm2, vm2, br-int, "4242::3/64", "00:00:00:00:00:02", "4242::1")
> +OVS_WAIT_UNTIL([test "$(ip netns exec vm2 ip a | grep 4242::3 | grep tentative)" = ""])
> +
> +# Wait for ovn-controller to catch up.
> +wait_for_ports_up
> +ovn-nbctl --wait=hv sync
> +
> +# Start IPv6 TCP server on vm1.
> +NETNS_DAEMONIZE([vm1], [nc -k -l 4242::2 4242], [nc-vm1.pid])
> +
> +# Make sure connecting to the VIP works.
> +NS_CHECK_EXEC([vm2], [nc 6666::1 666 -p 2000 -z])
> +
> +# Start IPv6 TCP connection to VIP from vm2.
> +NETNS_DAEMONIZE([vm2], [nc 6666::1 666 -p 2001], [nc1-vm2.pid])
> +
> +# Check conntrack.
> +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(4242::2)])
> +
> +# Start IPv6 TCP connection to backend IP from vm2 which would require
> +# additional source port translation to avoid a tuple conflict.
> +NS_CHECK_EXEC([vm2], [nc 4242::2 4242 -p 2001 -z])
> +
> +AT_CLEANUP
> +])
> +
>  # When a lport is released on a chassis, ovn-controller was
>  # not clearing some of the flowss in the table 33 leading
>  # to packet drops if ct() is hit.
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Dumitru Ceara June 4, 2021, 3:11 p.m. UTC | #2
On 6/4/21 4:51 PM, Mark Gray wrote:
> On 03/06/2021 16:06, Dumitru Ceara wrote:
>> Assuming a load balancer, LB1, with:
>> - VIP: 42.42.42.42:4242
>> - backend: 42.42.42.1:2121
>>
>> A client might connect to the backend either directly or through the
>> VIP.  If the first connection is via the VIP and the second connection
>> is direct but happens to use the same source L4 port, OVN should make
>> sure that the second connection is SNATed (source port translation) in
>> order to avoid a tuple collision at commit time.
>>
>> For example:
>> 1. Session via the VIP:
>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.42:4242
>>    - after DNAT:      src=42.42.42.2:2000, dst=42.42.42.1:2121
>> 2. Session directly connected to the backend:
>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.1:2121
>>    - in acl stage committing this connection would fail.
>>
>> In order to avoid this we now use the all-zero-ip NAT OVS feature when
>> committing connections in the ACL stage.  This translates to a no-op
>> SNAT when there's no tuple collision, and performs source port
>> translation when a tuple collision would happen.
>>
>> We program flows to perform all-zero-ip NAT conditionally, only if the
>> datapath being used supports it.
>>
>> Reported-at: https://bugzilla.redhat.com/1939676
>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>> ---
>>  controller/lflow.c            |    1 
>>  include/ovn/actions.h         |    4 +
>>  lib/actions.c                 |   34 ++++++++++
>>  tests/ovn.at                  |    2 -
>>  tests/system-common-macros.at |    4 +
>>  tests/system-ovn.at           |  138 +++++++++++++++++++++++++++++++++++++++++
>>  6 files changed, 180 insertions(+), 3 deletions(-)
>>
>> diff --git a/controller/lflow.c b/controller/lflow.c
>> index 680b8cca1..af6a237d1 100644
>> --- a/controller/lflow.c
>> +++ b/controller/lflow.c
>> @@ -582,6 +582,7 @@ add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
>>          .is_switch = datapath_is_switch(dp),
>>          .group_table = l_ctx_out->group_table,
>>          .meter_table = l_ctx_out->meter_table,
>> +        .dp_features = ovs_feature_support_get(),
>>          .lflow_uuid = lflow->header_.uuid,
>>  
>>          .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
>> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
>> index 040213177..35983498b 100644
>> --- a/include/ovn/actions.h
>> +++ b/include/ovn/actions.h
>> @@ -25,6 +25,7 @@
>>  #include "openvswitch/hmap.h"
>>  #include "openvswitch/uuid.h"
>>  #include "util.h"
>> +#include "ovn/features.h"
>>  
>>  struct expr;
>>  struct lexer;
>> @@ -756,6 +757,9 @@ struct ovnact_encode_params {
>>      /* A struct to figure out the meter_id for meter actions. */
>>      struct ovn_extend_table *meter_table;
>>  
>> +    /* OVS datapath supported features. */
>> +    enum ovs_feature_support dp_features;
>> +
>>      /* The logical flow uuid that drove this action. */
>>      struct uuid lflow_uuid;
>>  
>> diff --git a/lib/actions.c b/lib/actions.c
>> index b3433f49e..c8c2443ce 100644
>> --- a/lib/actions.c
>> +++ b/lib/actions.c
>> @@ -732,7 +732,7 @@ format_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc, struct ds *s)
>>  
>>  static void
>>  encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>> +                    const struct ovnact_encode_params *ep,
>>                      struct ofpbuf *ofpacts)
>>  {
>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>> @@ -742,6 +742,21 @@ encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>      ct->zone_src.ofs = 0;
>>      ct->zone_src.n_bits = 16;
>>  
>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>> +     * collisions at commit time between NATed and firewalled-only sessions.
>> +     */
>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>> +        size_t nat_offset = ofpacts->size;
>> +        ofpbuf_pull(ofpacts, nat_offset);
>> +
>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>> +        nat->flags = 0;
>> +        nat->range_af = AF_UNSPEC;
>> +        nat->flags |= NX_NAT_F_SRC;
>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>> +        ct = ofpacts->header;
>> +    }
>> +
>>      size_t set_field_offset = ofpacts->size;
>>      ofpbuf_pull(ofpacts, set_field_offset);
>>  
>> @@ -780,7 +795,7 @@ format_CT_COMMIT_V2(const struct ovnact_nest *on, struct ds *s)
>>  
>>  static void
>>  encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>> +                    const struct ovnact_encode_params *ep,
>>                      struct ofpbuf *ofpacts)
>>  {
>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>> @@ -792,6 +807,21 @@ encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>      ct->zone_src.ofs = 0;
>>      ct->zone_src.n_bits = 16;
>>  
>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>> +     * collisions at commit time between NATed and firewalled-only sessions.
>> +     */
>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>> +        size_t nat_offset = ofpacts->size;
>> +        ofpbuf_pull(ofpacts, nat_offset);
>> +
>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>> +        nat->flags = 0;
>> +        nat->range_af = AF_UNSPEC;
>> +        nat->flags |= NX_NAT_F_SRC;
>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>> +        ct = ofpacts->header;
>> +    }
>> +
>>      size_t set_field_offset = ofpacts->size;
>>      ofpbuf_pull(ofpacts, set_field_offset);
>>  
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index f26894ce4..638aa5010 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -23109,7 +23109,7 @@ AT_CHECK([
>>      for hv in 1 2; do
>>          grep table=15 hv${hv}flows | \
>>          grep "priority=100" | \
>> -        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>> +        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],.*exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>  
>>          grep table=22 hv${hv}flows | \
>>          grep "priority=200" | \
>> diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
>> index c8fa6f03f..b742a2cb9 100644
>> --- a/tests/system-common-macros.at
>> +++ b/tests/system-common-macros.at
>> @@ -330,3 +330,7 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP],
>>  # OVS_CHECK_CT_CLEAR()
>>  m4_define([OVS_CHECK_CT_CLEAR],
>>      [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])])
>> +
>> +# OVS_CHECK_CT_ZERO_SNAT()
>> +m4_define([OVS_CHECK_CT_ZERO_SNAT],
>> +    [AT_SKIP_IF([! grep -q "Datapath supports ct_zero_snat" ovs-vswitchd.log])]))
>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
>> index 310bd3d5a..61d6bfc86 100644
>> --- a/tests/system-ovn.at
>> +++ b/tests/system-ovn.at
>> @@ -5296,6 +5296,144 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>>  AT_CLEANUP
>>  ])
>>  
>> +OVN_FOR_EACH_NORTHD([
>> +AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv4])
>> +AT_KEYWORDS([ovnlb])
>> +
>> +CHECK_CONNTRACK()
>> +CHECK_CONNTRACK_NAT()
>> +ovn_start
>> +OVS_TRAFFIC_VSWITCHD_START()
>> +OVS_CHECK_CT_ZERO_SNAT()
>> +ADD_BR([br-int])
>> +
>> +# Set external-ids in br-int needed for ovn-controller
>> +ovs-vsctl \
>> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
>> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
>> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
>> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
>> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
>> +
>> +# Start ovn-controller
>> +start_daemon ovn-controller
>> +
>> +# Logical network:
>> +# 1 logical switch connected to one logical router.
>> +# 2 VMs, one used as backend for a load balancer.
>> +
>> +check ovn-nbctl                                                  \
>> +    -- lr-add rtr                                                \
>> +    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
>> +    -- ls-add ls                                                 \
>> +    -- lsp-add ls ls-rtr                                         \
>> +    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
>> +    -- lsp-set-type ls-rtr router                                \
>> +    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
>> +    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
>> +    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
>> +    -- lb-add lb-test 66.66.66.66:666 42.42.42.2:4242 tcp        \
>> +    -- ls-lb-add ls lb-test
> 
> Will this work for a DNAT entry instead of a load-balancer? As you know,
> the flows are slightly different. Will this work when combined with
> forcing SNAT? Do we need to check this?

We already have system tests that test DNAT and load balancer force
SNAT.  I thought we're already covered by those.

> 
>> +
>> +ADD_NAMESPACES(vm1)
>> +ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
>> +
>> +ADD_NAMESPACES(vm2)
>> +ADD_VETH(vm2, vm2, br-int, "42.42.42.3/24", "00:00:00:00:00:02", "42.42.42.1")
>> +
>> +# Wait for ovn-controller to catch up.
>> +wait_for_ports_up
>> +ovn-nbctl --wait=hv sync
>> +
>> +# Start IPv4 TCP server on vm1.
>> +NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc-vm1.pid])
>> +
>> +# Make sure connecting to the VIP works.
>> +NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
>> +
>> +# Start IPv4 TCP connection to VIP from vm2.
>> +NETNS_DAEMONIZE([vm2], [nc 66.66.66.66 666 -p 2001], [nc1-vm2.pid])
> 
> I feel like there should also be some kind of ping test to check that
> the return path works as expected. Is there a reason why you did not do
> that?

The goal of the test is to make sure that the direct connection (towards
42.42.42.2:4242) below is established successfully.

For the connection via the VIP there's the part above that performs the
full TCP 3way handshake:

NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])

> 
>> +
>> +# Check conntrack.
>> +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(42.42.42.2)])
>> +
>> +# Start IPv4 TCP connection to backend IP from vm2 which would require
>> +# additional source port translation to avoid a tuple conflict.
>> +NS_CHECK_EXEC([vm2], [nc 42.42.42.2 4242 -p 2001 -z])
> 
> Should there be another check to ensure that the conntrack entries are
> setup as expected?
> 

I initially wanted to add one but there are differences in the way the
TCP state machine is implemented in userspace conntrack vs kernel
conntrack and I couldn't come up with something that would pass on both.

I'll try some more though.

Thanks for the reviews!

Regards,
Dumitru
Mark Gray June 4, 2021, 4:11 p.m. UTC | #3
On 04/06/2021 16:11, Dumitru Ceara wrote:
> On 6/4/21 4:51 PM, Mark Gray wrote:
>> On 03/06/2021 16:06, Dumitru Ceara wrote:
>>> Assuming a load balancer, LB1, with:
>>> - VIP: 42.42.42.42:4242
>>> - backend: 42.42.42.1:2121
>>>
>>> A client might connect to the backend either directly or through the
>>> VIP.  If the first connection is via the VIP and the second connection
>>> is direct but happens to use the same source L4 port, OVN should make
>>> sure that the second connection is SNATed (source port translation) in
>>> order to avoid a tuple collision at commit time.
>>>
>>> For example:
>>> 1. Session via the VIP:
>>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.42:4242
>>>    - after DNAT:      src=42.42.42.2:2000, dst=42.42.42.1:2121
>>> 2. Session directly connected to the backend:
>>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.1:2121
>>>    - in acl stage committing this connection would fail.
>>>
>>> In order to avoid this we now use the all-zero-ip NAT OVS feature when
>>> committing connections in the ACL stage.  This translates to a no-op
>>> SNAT when there's no tuple collision, and performs source port
>>> translation when a tuple collision would happen.
>>>
>>> We program flows to perform all-zero-ip NAT conditionally, only if the
>>> datapath being used supports it.
>>>
>>> Reported-at: https://bugzilla.redhat.com/1939676
>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>> ---
>>>  controller/lflow.c            |    1 
>>>  include/ovn/actions.h         |    4 +
>>>  lib/actions.c                 |   34 ++++++++++
>>>  tests/ovn.at                  |    2 -
>>>  tests/system-common-macros.at |    4 +
>>>  tests/system-ovn.at           |  138 +++++++++++++++++++++++++++++++++++++++++
>>>  6 files changed, 180 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/controller/lflow.c b/controller/lflow.c
>>> index 680b8cca1..af6a237d1 100644
>>> --- a/controller/lflow.c
>>> +++ b/controller/lflow.c
>>> @@ -582,6 +582,7 @@ add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
>>>          .is_switch = datapath_is_switch(dp),
>>>          .group_table = l_ctx_out->group_table,
>>>          .meter_table = l_ctx_out->meter_table,
>>> +        .dp_features = ovs_feature_support_get(),
>>>          .lflow_uuid = lflow->header_.uuid,
>>>  
>>>          .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
>>> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
>>> index 040213177..35983498b 100644
>>> --- a/include/ovn/actions.h
>>> +++ b/include/ovn/actions.h
>>> @@ -25,6 +25,7 @@
>>>  #include "openvswitch/hmap.h"
>>>  #include "openvswitch/uuid.h"
>>>  #include "util.h"
>>> +#include "ovn/features.h"
>>>  
>>>  struct expr;
>>>  struct lexer;
>>> @@ -756,6 +757,9 @@ struct ovnact_encode_params {
>>>      /* A struct to figure out the meter_id for meter actions. */
>>>      struct ovn_extend_table *meter_table;
>>>  
>>> +    /* OVS datapath supported features. */
>>> +    enum ovs_feature_support dp_features;
>>> +
>>>      /* The logical flow uuid that drove this action. */
>>>      struct uuid lflow_uuid;
>>>  
>>> diff --git a/lib/actions.c b/lib/actions.c
>>> index b3433f49e..c8c2443ce 100644
>>> --- a/lib/actions.c
>>> +++ b/lib/actions.c
>>> @@ -732,7 +732,7 @@ format_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc, struct ds *s)
>>>  
>>>  static void
>>>  encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>>> +                    const struct ovnact_encode_params *ep,
>>>                      struct ofpbuf *ofpacts)
>>>  {
>>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>>> @@ -742,6 +742,21 @@ encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>>      ct->zone_src.ofs = 0;
>>>      ct->zone_src.n_bits = 16;
>>>  
>>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>>> +     * collisions at commit time between NATed and firewalled-only sessions.
>>> +     */
>>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>>> +        size_t nat_offset = ofpacts->size;
>>> +        ofpbuf_pull(ofpacts, nat_offset);
>>> +
>>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>>> +        nat->flags = 0;
>>> +        nat->range_af = AF_UNSPEC;
>>> +        nat->flags |= NX_NAT_F_SRC;
>>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>>> +        ct = ofpacts->header;
>>> +    }
>>> +
>>>      size_t set_field_offset = ofpacts->size;
>>>      ofpbuf_pull(ofpacts, set_field_offset);
>>>  
>>> @@ -780,7 +795,7 @@ format_CT_COMMIT_V2(const struct ovnact_nest *on, struct ds *s)
>>>  
>>>  static void
>>>  encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>>> +                    const struct ovnact_encode_params *ep,
>>>                      struct ofpbuf *ofpacts)
>>>  {
>>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>>> @@ -792,6 +807,21 @@ encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>>      ct->zone_src.ofs = 0;
>>>      ct->zone_src.n_bits = 16;
>>>  
>>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>>> +     * collisions at commit time between NATed and firewalled-only sessions.
>>> +     */
>>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>>> +        size_t nat_offset = ofpacts->size;
>>> +        ofpbuf_pull(ofpacts, nat_offset);
>>> +
>>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>>> +        nat->flags = 0;
>>> +        nat->range_af = AF_UNSPEC;
>>> +        nat->flags |= NX_NAT_F_SRC;
>>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>>> +        ct = ofpacts->header;
>>> +    }
>>> +
>>>      size_t set_field_offset = ofpacts->size;
>>>      ofpbuf_pull(ofpacts, set_field_offset);
>>>  
>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>> index f26894ce4..638aa5010 100644
>>> --- a/tests/ovn.at
>>> +++ b/tests/ovn.at
>>> @@ -23109,7 +23109,7 @@ AT_CHECK([
>>>      for hv in 1 2; do
>>>          grep table=15 hv${hv}flows | \
>>>          grep "priority=100" | \
>>> -        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>> +        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],.*exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>>  
>>>          grep table=22 hv${hv}flows | \
>>>          grep "priority=200" | \
>>> diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
>>> index c8fa6f03f..b742a2cb9 100644
>>> --- a/tests/system-common-macros.at
>>> +++ b/tests/system-common-macros.at
>>> @@ -330,3 +330,7 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP],
>>>  # OVS_CHECK_CT_CLEAR()
>>>  m4_define([OVS_CHECK_CT_CLEAR],
>>>      [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])])
>>> +
>>> +# OVS_CHECK_CT_ZERO_SNAT()
>>> +m4_define([OVS_CHECK_CT_ZERO_SNAT],
>>> +    [AT_SKIP_IF([! grep -q "Datapath supports ct_zero_snat" ovs-vswitchd.log])])
>>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
>>> index 310bd3d5a..61d6bfc86 100644
>>> --- a/tests/system-ovn.at
>>> +++ b/tests/system-ovn.at
>>> @@ -5296,6 +5296,144 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>>>  AT_CLEANUP
>>>  ])
>>>  
>>> +OVN_FOR_EACH_NORTHD([
>>> +AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv4])
>>> +AT_KEYWORDS([ovnlb])
>>> +
>>> +CHECK_CONNTRACK()
>>> +CHECK_CONNTRACK_NAT()
>>> +ovn_start
>>> +OVS_TRAFFIC_VSWITCHD_START()
>>> +OVS_CHECK_CT_ZERO_SNAT()
>>> +ADD_BR([br-int])
>>> +
>>> +# Set external-ids in br-int needed for ovn-controller
>>> +ovs-vsctl \
>>> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
>>> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
>>> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
>>> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
>>> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
>>> +
>>> +# Start ovn-controller
>>> +start_daemon ovn-controller
>>> +
>>> +# Logical network:
>>> +# 1 logical switch connected to one logical router.
>>> +# 2 VMs, one used as backend for a load balancer.
>>> +
>>> +check ovn-nbctl                                                  \
>>> +    -- lr-add rtr                                                \
>>> +    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
>>> +    -- ls-add ls                                                 \
>>> +    -- lsp-add ls ls-rtr                                         \
>>> +    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
>>> +    -- lsp-set-type ls-rtr router                                \
>>> +    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
>>> +    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
>>> +    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
>>> +    -- lb-add lb-test 66.66.66.66:666 42.42.42.2:4242 tcp        \
>>> +    -- ls-lb-add ls lb-test
>>
>> Will this work for a DNAT entry instead of a load-balancer? As you know,
>> the flows are slightly different. Will this work when combined with
>> forcing SNAT? Do we need to check this?
> 
> We already have system tests that test DNAT and load balancer force
> SNAT.  I thought we're already covered by those.

In those cases we don't have tests in which we are reusing the source
port as we are in this test. What I am wondering is that when we have
DNAT entries or force SNAT, this will generate different Openflow rules
which may cause the OVS datapath flows to look slightly different which
may cause differences in behaviour not covered by this test.

Please correct me if I am wrong or if you think it is unnecessary. I am
a little paranoid about the OVN DNAT/SNAT/LB router code as I was
recently working on a bug in this area and I think more tests in this
area may be useful.

> 
>>
>>> +
>>> +ADD_NAMESPACES(vm1)
>>> +ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
>>> +
>>> +ADD_NAMESPACES(vm2)
>>> +ADD_VETH(vm2, vm2, br-int, "42.42.42.3/24", "00:00:00:00:00:02", "42.42.42.1")
>>> +
>>> +# Wait for ovn-controller to catch up.
>>> +wait_for_ports_up
>>> +ovn-nbctl --wait=hv sync
>>> +
>>> +# Start IPv4 TCP server on vm1.
>>> +NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc-vm1.pid])
>>> +
>>> +# Make sure connecting to the VIP works.
>>> +NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
>>> +
>>> +# Start IPv4 TCP connection to VIP from vm2.
>>> +NETNS_DAEMONIZE([vm2], [nc 66.66.66.66 666 -p 2001], [nc1-vm2.pid])
>>
>> I feel like there should also be some kind of ping test to check that
>> the return path works as expected. Is there a reason why you did not do
>> that?
> 
> The goal of the test is to make sure that the direct connection (towards
> 42.42.42.2:4242) below is established successfully.
> 
> For the connection via the VIP there's the part above that performs the
> full TCP 3way handshake:
> 
> NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
> 
>>
>>> +
>>> +# Check conntrack.
>>> +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(42.42.42.2)])
>>> +
>>> +# Start IPv4 TCP connection to backend IP from vm2 which would require
>>> +# additional source port translation to avoid a tuple conflict.
>>> +NS_CHECK_EXEC([vm2], [nc 42.42.42.2 4242 -p 2001 -z])
>>
>> Should there be another check to ensure that the conntrack entries are
>> setup as expected?
>>
> 
> I initially wanted to add one but there are differences in the way the
> TCP state machine is implemented in userspace conntrack vs kernel
> conntrack and I couldn't come up with something that would pass on both.
> 
> I'll try some more though.

I think it would be useful. As I mentioned above I am a bit paranoid
about OVN DNAT/LB/SNAT :)
> 
> Thanks for the reviews!
> 
> Regards,
> Dumitru
>
Dumitru Ceara June 4, 2021, 4:24 p.m. UTC | #4
On 6/4/21 6:11 PM, Mark Gray wrote:
> On 04/06/2021 16:11, Dumitru Ceara wrote:
>> On 6/4/21 4:51 PM, Mark Gray wrote:
>>> On 03/06/2021 16:06, Dumitru Ceara wrote:
>>>> Assuming a load balancer, LB1, with:
>>>> - VIP: 42.42.42.42:4242
>>>> - backend: 42.42.42.1:2121
>>>>
>>>> A client might connect to the backend either directly or through the
>>>> VIP.  If the first connection is via the VIP and the second connection
>>>> is direct but happens to use the same source L4 port, OVN should make
>>>> sure that the second connection is SNATed (source port translation) in
>>>> order to avoid a tuple collision at commit time.
>>>>
>>>> For example:
>>>> 1. Session via the VIP:
>>>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.42:4242
>>>>    - after DNAT:      src=42.42.42.2:2000, dst=42.42.42.1:2121
>>>> 2. Session directly connected to the backend:
>>>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.1:2121
>>>>    - in acl stage committing this connection would fail.
>>>>
>>>> In order to avoid this we now use the all-zero-ip NAT OVS feature when
>>>> committing connections in the ACL stage.  This translates to a no-op
>>>> SNAT when there's no tuple collision, and performs source port
>>>> translation when a tuple collision would happen.
>>>>
>>>> We program flows to perform all-zero-ip NAT conditionally, only if the
>>>> datapath being used supports it.
>>>>
>>>> Reported-at: https://bugzilla.redhat.com/1939676
>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>> ---
>>>>  controller/lflow.c            |    1 
>>>>  include/ovn/actions.h         |    4 +
>>>>  lib/actions.c                 |   34 ++++++++++
>>>>  tests/ovn.at                  |    2 -
>>>>  tests/system-common-macros.at |    4 +
>>>>  tests/system-ovn.at           |  138 +++++++++++++++++++++++++++++++++++++++++
>>>>  6 files changed, 180 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/controller/lflow.c b/controller/lflow.c
>>>> index 680b8cca1..af6a237d1 100644
>>>> --- a/controller/lflow.c
>>>> +++ b/controller/lflow.c
>>>> @@ -582,6 +582,7 @@ add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
>>>>          .is_switch = datapath_is_switch(dp),
>>>>          .group_table = l_ctx_out->group_table,
>>>>          .meter_table = l_ctx_out->meter_table,
>>>> +        .dp_features = ovs_feature_support_get(),
>>>>          .lflow_uuid = lflow->header_.uuid,
>>>>  
>>>>          .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
>>>> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
>>>> index 040213177..35983498b 100644
>>>> --- a/include/ovn/actions.h
>>>> +++ b/include/ovn/actions.h
>>>> @@ -25,6 +25,7 @@
>>>>  #include "openvswitch/hmap.h"
>>>>  #include "openvswitch/uuid.h"
>>>>  #include "util.h"
>>>> +#include "ovn/features.h"
>>>>  
>>>>  struct expr;
>>>>  struct lexer;
>>>> @@ -756,6 +757,9 @@ struct ovnact_encode_params {
>>>>      /* A struct to figure out the meter_id for meter actions. */
>>>>      struct ovn_extend_table *meter_table;
>>>>  
>>>> +    /* OVS datapath supported features. */
>>>> +    enum ovs_feature_support dp_features;
>>>> +
>>>>      /* The logical flow uuid that drove this action. */
>>>>      struct uuid lflow_uuid;
>>>>  
>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>> index b3433f49e..c8c2443ce 100644
>>>> --- a/lib/actions.c
>>>> +++ b/lib/actions.c
>>>> @@ -732,7 +732,7 @@ format_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc, struct ds *s)
>>>>  
>>>>  static void
>>>>  encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>>>> +                    const struct ovnact_encode_params *ep,
>>>>                      struct ofpbuf *ofpacts)
>>>>  {
>>>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>>>> @@ -742,6 +742,21 @@ encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>>>      ct->zone_src.ofs = 0;
>>>>      ct->zone_src.n_bits = 16;
>>>>  
>>>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>>>> +     * collisions at commit time between NATed and firewalled-only sessions.
>>>> +     */
>>>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>>>> +        size_t nat_offset = ofpacts->size;
>>>> +        ofpbuf_pull(ofpacts, nat_offset);
>>>> +
>>>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>>>> +        nat->flags = 0;
>>>> +        nat->range_af = AF_UNSPEC;
>>>> +        nat->flags |= NX_NAT_F_SRC;
>>>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>>>> +        ct = ofpacts->header;
>>>> +    }
>>>> +
>>>>      size_t set_field_offset = ofpacts->size;
>>>>      ofpbuf_pull(ofpacts, set_field_offset);
>>>>  
>>>> @@ -780,7 +795,7 @@ format_CT_COMMIT_V2(const struct ovnact_nest *on, struct ds *s)
>>>>  
>>>>  static void
>>>>  encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>>>> +                    const struct ovnact_encode_params *ep,
>>>>                      struct ofpbuf *ofpacts)
>>>>  {
>>>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>>>> @@ -792,6 +807,21 @@ encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>>>      ct->zone_src.ofs = 0;
>>>>      ct->zone_src.n_bits = 16;
>>>>  
>>>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>>>> +     * collisions at commit time between NATed and firewalled-only sessions.
>>>> +     */
>>>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>>>> +        size_t nat_offset = ofpacts->size;
>>>> +        ofpbuf_pull(ofpacts, nat_offset);
>>>> +
>>>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>>>> +        nat->flags = 0;
>>>> +        nat->range_af = AF_UNSPEC;
>>>> +        nat->flags |= NX_NAT_F_SRC;
>>>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>>>> +        ct = ofpacts->header;
>>>> +    }
>>>> +
>>>>      size_t set_field_offset = ofpacts->size;
>>>>      ofpbuf_pull(ofpacts, set_field_offset);
>>>>  
>>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>>> index f26894ce4..638aa5010 100644
>>>> --- a/tests/ovn.at
>>>> +++ b/tests/ovn.at
>>>> @@ -23109,7 +23109,7 @@ AT_CHECK([
>>>>      for hv in 1 2; do
>>>>          grep table=15 hv${hv}flows | \
>>>>          grep "priority=100" | \
>>>> -        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>>> +        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],.*exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>>>  
>>>>          grep table=22 hv${hv}flows | \
>>>>          grep "priority=200" | \
>>>> diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
>>>> index c8fa6f03f..b742a2cb9 100644
>>>> --- a/tests/system-common-macros.at
>>>> +++ b/tests/system-common-macros.at
>>>> @@ -330,3 +330,7 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP],
>>>>  # OVS_CHECK_CT_CLEAR()
>>>>  m4_define([OVS_CHECK_CT_CLEAR],
>>>>      [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])])
>>>> +
>>>> +# OVS_CHECK_CT_ZERO_SNAT()
>>>> +m4_define([OVS_CHECK_CT_ZERO_SNAT],
>>>> +    [AT_SKIP_IF([! grep -q "Datapath supports ct_zero_snat" ovs-vswitchd.log])])
>>>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
>>>> index 310bd3d5a..61d6bfc86 100644
>>>> --- a/tests/system-ovn.at
>>>> +++ b/tests/system-ovn.at
>>>> @@ -5296,6 +5296,144 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>>>>  AT_CLEANUP
>>>>  ])
>>>>  
>>>> +OVN_FOR_EACH_NORTHD([
>>>> +AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv4])
>>>> +AT_KEYWORDS([ovnlb])
>>>> +
>>>> +CHECK_CONNTRACK()
>>>> +CHECK_CONNTRACK_NAT()
>>>> +ovn_start
>>>> +OVS_TRAFFIC_VSWITCHD_START()
>>>> +OVS_CHECK_CT_ZERO_SNAT()
>>>> +ADD_BR([br-int])
>>>> +
>>>> +# Set external-ids in br-int needed for ovn-controller
>>>> +ovs-vsctl \
>>>> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
>>>> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
>>>> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
>>>> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
>>>> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
>>>> +
>>>> +# Start ovn-controller
>>>> +start_daemon ovn-controller
>>>> +
>>>> +# Logical network:
>>>>> +# 1 logical switch connected to one logical router.
>>>> +# 2 VMs, one used as backend for a load balancer.
>>>> +
>>>> +check ovn-nbctl                                                  \
>>>> +    -- lr-add rtr                                                \
>>>> +    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
>>>> +    -- ls-add ls                                                 \
>>>> +    -- lsp-add ls ls-rtr                                         \
>>>> +    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
>>>> +    -- lsp-set-type ls-rtr router                                \
>>>> +    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
>>>> +    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
>>>> +    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
>>>> +    -- lb-add lb-test 66.66.66.66:666 42.42.42.2:4242 tcp        \
>>>> +    -- ls-lb-add ls lb-test
>>>
>>> Will this work for a DNAT entry instead of a load-balancer? As you know,
>>> the flows are slightly different. Will this work when combined with
>>> forcing SNAT? Do we need to check this?
>>
>> We already have system tests that test DNAT and load balancer force
>> SNAT.  I thought we're already covered by those.
> 
> In those cases we don't have tests in which we are reusing the source
> port as we are in this test. What I am wondering is that when we have
> DNAT entries or force SNAT, this will generate different Openflow rules
> which may cause the OVS datapath flows to look slightly different which
> may cause differences in behaviour not covered by this test.
> 
> Please correct me if I am wrong or if you think it is unnecessary. I am
> a little paranoid about the OVN DNAT/SNAT/LB router code as I was
> recently working on a bug in this area and I think more tests in this
> area may be useful.

Sure, more tests definitely don't hurt.  I'll add some more in v2.

> 
>>
>>>
>>>> +
>>>> +ADD_NAMESPACES(vm1)
>>>> +ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
>>>> +
>>>> +ADD_NAMESPACES(vm2)
>>>> +ADD_VETH(vm2, vm2, br-int, "42.42.42.3/24", "00:00:00:00:00:02", "42.42.42.1")
>>>> +
>>>> +# Wait for ovn-controller to catch up.
>>>> +wait_for_ports_up
>>>> +ovn-nbctl --wait=hv sync
>>>> +
>>>> +# Start IPv4 TCP server on vm1.
>>>> +NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc-vm1.pid])
>>>> +
>>>> +# Make sure connecting to the VIP works.
>>>> +NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
>>>> +
>>>> +# Start IPv4 TCP connection to VIP from vm2.
>>>> +NETNS_DAEMONIZE([vm2], [nc 66.66.66.66 666 -p 2001], [nc1-vm2.pid])
>>>
>>> I feel like there should also be some kind of ping test to check that
>>> the return path works as expected. Is there a reason why you did not do
>>> that?
>>
>> The goal of the test is to make sure that the direct connection (towards
>> 42.42.42.2:4242) below is established successfully.
>>
>> For the connection via the VIP there's the part above that performs the
>> full TCP 3way handshake:
>>
>> NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
>>
>>>
>>>> +
>>>> +# Check conntrack.
>>>> +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(42.42.42.2)])
>>>> +
>>>> +# Start IPv4 TCP connection to backend IP from vm2 which would require
>>>> +# additional source port translation to avoid a tuple conflict.
>>>> +NS_CHECK_EXEC([vm2], [nc 42.42.42.2 4242 -p 2001 -z])
>>>
>>> Should there be another check to ensure that the conntrack entries are
>>> setup as expected?
>>>
>>
>> I initially wanted to add one but there are differences in the way the
>> TCP state machine is implemented in userspace conntrack vs kernel
>> conntrack and I couldn't come up with something that would pass on both.
>>
>> I'll try some more though.
> 
> I think it would be useful. As I mentioned above I am a bit paranoid
> about OVN DNAT/LB/SNAT :)

:)

Thanks!
Dumitru Ceara June 9, 2021, 11:42 a.m. UTC | #5
On 6/4/21 6:24 PM, Dumitru Ceara wrote:
> On 6/4/21 6:11 PM, Mark Gray wrote:
>> On 04/06/2021 16:11, Dumitru Ceara wrote:
>>> On 6/4/21 4:51 PM, Mark Gray wrote:
>>>> On 03/06/2021 16:06, Dumitru Ceara wrote:
>>>>> Assuming a load balancer, LB1, with:
>>>>> - VIP: 42.42.42.42:4242
>>>>> - backend: 42.42.42.1:2121
>>>>>
>>>>> A client might connect to the backend either directly or through the
>>>>> VIP.  If the first connection is via the VIP and the second connection
>>>>> is direct but happens to use the same source L4 port, OVN should make
>>>>> sure that the second connection is SNATed (source port translation) in
>>>>> order to avoid a tuple collision at commit time.
>>>>>
>>>>> For example:
>>>>> 1. Session via the VIP:
>>>>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.42:4242
>>>>>    - after DNAT:      src=42.42.42.2:2000, dst=42.42.42.1:2121
>>>>> 2. Session directly connected to the backend:
>>>>>    - original packet: src=42.42.42.2:2000, dst=42.42.42.1:2121
>>>>>    - in acl stage committing this connection would fail.
>>>>>
>>>>> In order to avoid this we now use the all-zero-ip NAT OVS feature when
>>>>> committing connections in the ACL stage.  This translates to a no-op
>>>>> SNAT when there's no tuple collision, and performs source port
>>>>> translation when a tuple collision would happen.
>>>>>
>>>>> We program flows to perform all-zero-ip NAT conditionally, only if the
>>>>> datapath being used supports it.
>>>>>
>>>>> Reported-at: https://bugzilla.redhat.com/1939676
>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>>> ---
>>>>>  controller/lflow.c            |    1 
>>>>>  include/ovn/actions.h         |    4 +
>>>>>  lib/actions.c                 |   34 ++++++++++
>>>>>  tests/ovn.at                  |    2 -
>>>>>  tests/system-common-macros.at |    4 +
>>>>>  tests/system-ovn.at           |  138 +++++++++++++++++++++++++++++++++++++++++
>>>>>  6 files changed, 180 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/controller/lflow.c b/controller/lflow.c
>>>>> index 680b8cca1..af6a237d1 100644
>>>>> --- a/controller/lflow.c
>>>>> +++ b/controller/lflow.c
>>>>> @@ -582,6 +582,7 @@ add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
>>>>>          .is_switch = datapath_is_switch(dp),
>>>>>          .group_table = l_ctx_out->group_table,
>>>>>          .meter_table = l_ctx_out->meter_table,
>>>>> +        .dp_features = ovs_feature_support_get(),
>>>>>          .lflow_uuid = lflow->header_.uuid,
>>>>>  
>>>>>          .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
>>>>> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
>>>>> index 040213177..35983498b 100644
>>>>> --- a/include/ovn/actions.h
>>>>> +++ b/include/ovn/actions.h
>>>>> @@ -25,6 +25,7 @@
>>>>>  #include "openvswitch/hmap.h"
>>>>>  #include "openvswitch/uuid.h"
>>>>>  #include "util.h"
>>>>> +#include "ovn/features.h"
>>>>>  
>>>>>  struct expr;
>>>>>  struct lexer;
>>>>> @@ -756,6 +757,9 @@ struct ovnact_encode_params {
>>>>>      /* A struct to figure out the meter_id for meter actions. */
>>>>>      struct ovn_extend_table *meter_table;
>>>>>  
>>>>> +    /* OVS datapath supported features. */
>>>>> +    enum ovs_feature_support dp_features;
>>>>> +
>>>>>      /* The logical flow uuid that drove this action. */
>>>>>      struct uuid lflow_uuid;
>>>>>  
>>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>>> index b3433f49e..c8c2443ce 100644
>>>>> --- a/lib/actions.c
>>>>> +++ b/lib/actions.c
>>>>> @@ -732,7 +732,7 @@ format_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc, struct ds *s)
>>>>>  
>>>>>  static void
>>>>>  encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>>>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>>>>> +                    const struct ovnact_encode_params *ep,
>>>>>                      struct ofpbuf *ofpacts)
>>>>>  {
>>>>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>>>>> @@ -742,6 +742,21 @@ encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
>>>>>      ct->zone_src.ofs = 0;
>>>>>      ct->zone_src.n_bits = 16;
>>>>>  
>>>>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>>>>> +     * collisions at commit time between NATed and firewalled-only sessions.
>>>>> +     */
>>>>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>>>>> +        size_t nat_offset = ofpacts->size;
>>>>> +        ofpbuf_pull(ofpacts, nat_offset);
>>>>> +
>>>>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>>>>> +        nat->flags = 0;
>>>>> +        nat->range_af = AF_UNSPEC;
>>>>> +        nat->flags |= NX_NAT_F_SRC;
>>>>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>>>>> +        ct = ofpacts->header;
>>>>> +    }
>>>>> +
>>>>>      size_t set_field_offset = ofpacts->size;
>>>>>      ofpbuf_pull(ofpacts, set_field_offset);
>>>>>  
>>>>> @@ -780,7 +795,7 @@ format_CT_COMMIT_V2(const struct ovnact_nest *on, struct ds *s)
>>>>>  
>>>>>  static void
>>>>>  encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>>>> -                    const struct ovnact_encode_params *ep OVS_UNUSED,
>>>>> +                    const struct ovnact_encode_params *ep,
>>>>>                      struct ofpbuf *ofpacts)
>>>>>  {
>>>>>      struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
>>>>> @@ -792,6 +807,21 @@ encode_CT_COMMIT_V2(const struct ovnact_nest *on,
>>>>>      ct->zone_src.ofs = 0;
>>>>>      ct->zone_src.n_bits = 16;
>>>>>  
>>>>> +    /* If the datapath supports all-zero SNAT then use it to avoid tuple
>>>>> +     * collisions at commit time between NATed and firewalled-only sessions.
>>>>> +     */
>>>>> +    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
>>>>> +        size_t nat_offset = ofpacts->size;
>>>>> +        ofpbuf_pull(ofpacts, nat_offset);
>>>>> +
>>>>> +        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
>>>>> +        nat->flags = 0;
>>>>> +        nat->range_af = AF_UNSPEC;
>>>>> +        nat->flags |= NX_NAT_F_SRC;
>>>>> +        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
>>>>> +        ct = ofpacts->header;
>>>>> +    }
>>>>> +
>>>>>      size_t set_field_offset = ofpacts->size;
>>>>>      ofpbuf_pull(ofpacts, set_field_offset);
>>>>>  
>>>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>>>> index f26894ce4..638aa5010 100644
>>>>> --- a/tests/ovn.at
>>>>> +++ b/tests/ovn.at
>>>>> @@ -23109,7 +23109,7 @@ AT_CHECK([
>>>>>      for hv in 1 2; do
>>>>>          grep table=15 hv${hv}flows | \
>>>>>          grep "priority=100" | \
>>>>> -        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>>>> +        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],.*exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
>>>>>  
>>>>>          grep table=22 hv${hv}flows | \
>>>>>          grep "priority=200" | \
>>>>> diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
>>>>> index c8fa6f03f..b742a2cb9 100644
>>>>> --- a/tests/system-common-macros.at
>>>>> +++ b/tests/system-common-macros.at
>>>>> @@ -330,3 +330,7 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP],
>>>>>  # OVS_CHECK_CT_CLEAR()
>>>>>  m4_define([OVS_CHECK_CT_CLEAR],
>>>>>      [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])])
>>>>> +
>>>>> +# OVS_CHECK_CT_ZERO_SNAT()
>>>>> +m4_define([OVS_CHECK_CT_ZERO_SNAT],
>>>>> +    [AT_SKIP_IF([! grep -q "Datapath supports ct_zero_snat" ovs-vswitchd.log])])
>>>>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
>>>>> index 310bd3d5a..61d6bfc86 100644
>>>>> --- a/tests/system-ovn.at
>>>>> +++ b/tests/system-ovn.at
>>>>> @@ -5296,6 +5296,144 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>>>>>  AT_CLEANUP
>>>>>  ])
>>>>>  
>>>>> +OVN_FOR_EACH_NORTHD([
>>>>> +AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv4])
>>>>> +AT_KEYWORDS([ovnlb])
>>>>> +
>>>>> +CHECK_CONNTRACK()
>>>>> +CHECK_CONNTRACK_NAT()
>>>>> +ovn_start
>>>>> +OVS_TRAFFIC_VSWITCHD_START()
>>>>> +OVS_CHECK_CT_ZERO_SNAT()
>>>>> +ADD_BR([br-int])
>>>>> +
>>>>> +# Set external-ids in br-int needed for ovn-controller
>>>>> +ovs-vsctl \
>>>>> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
>>>>> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
>>>>> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
>>>>> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
>>>>> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
>>>>> +
>>>>> +# Start ovn-controller
>>>>> +start_daemon ovn-controller
>>>>> +
>>>>> +# Logical network:
>>>>> +# 1 logical switch connected to one logical router.
>>>>> +# 2 VMs, one used as backend for a load balancer.
>>>>> +
>>>>> +check ovn-nbctl                                                  \
>>>>> +    -- lr-add rtr                                                \
>>>>> +    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
>>>>> +    -- ls-add ls                                                 \
>>>>> +    -- lsp-add ls ls-rtr                                         \
>>>>> +    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
>>>>> +    -- lsp-set-type ls-rtr router                                \
>>>>> +    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
>>>>> +    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
>>>>> +    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
>>>>> +    -- lb-add lb-test 66.66.66.66:666 42.42.42.2:4242 tcp        \
>>>>> +    -- ls-lb-add ls lb-test
>>>>
>>>> Will this work for a DNAT entry instead of a load-balancer? As you know,
>>>> the flows are slightly different. Will this work when combined with
>>>> forcing SNAT? Do we need to check this?
>>>
>>> We already have system tests that test DNAT and load balancer force
>>> SNAT.  I thought we're already covered by those.
>>
>> In those cases we don't have tests in which we are reusing the source
>> port as we are in this test. What I am wondering is that when we have
>> DNAT entries or force SNAT, this will generate different Openflow rules
>> which may cause the OVS datapath flows to look slightly different which
>> may cause differences in behaviour not covered by this test.
>>
>> Please correct me if I am wrong or if you think it is unnecessary. I am
>> a little paranoid about the OVN DNAT/SNAT/LB router code as I was
>> recently working on a bug in this area and I think more tests in this
>> area may be useful.
> 
> Sure, more tests definitely don't hurt.  I'll add some more in v2.
> 

Thanks for the request to add more tests for router load balancers and
DNAT!  While doing that I realized that this exact scenario is broken
because non-NAT-ed connections are never committed to conntrack.  So
all-zero-snat alone would not help in that scenario.

However, based on our internal discussions, the fix for
https://bugzilla.redhat.com/show_bug.cgi?id=1956740 will be exactly
that: commit non-natted sessions.

It would be great if you could fold in the patch below when you post
your fix; it adds tests for these scenarios.

[...]

>>>>
>>>>> +
>>>>> +# Check conntrack.
>>>>> +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(42.42.42.2)])
>>>>> +
>>>>> +# Start IPv4 TCP connection to backend IP from vm2 which would require
>>>>> +# additional source port translation to avoid a tuple conflict.
>>>>> +NS_CHECK_EXEC([vm2], [nc 42.42.42.2 4242 -p 2001 -z])
>>>>
>>>> Should there be another check to ensure that the conntrack entries are
>>>> setup as expected?
>>>>
>>>
>>> I initially wanted to add one but there are differences in the way the
>>> TCP state machine is implemented in userspace conntrack vs kernel
>>> conntrack and I couldn't come up with something that would pass on both.
>>>
>>> I'll try some more though.
>>
>> I think it would be useful. As I mentioned above I am a bit paranoid
>> about OVN DNAT/LB/SNAT :)
> 
> :)
> 

I updated the tests, v2 coming soon.

Thanks,
Dumitru

From: Dumitru Ceara <dceara@redhat.com>
Date: Wed, 9 Jun 2021 11:37:36 +0200
Subject: [PATCH ovn] system-ovn.at: Add DNAT/no-NAT router load balancer and
 FIP tests.

Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 tests/system-ovn.at | 397 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 397 insertions(+)

diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index 1c11f9bab..dcb0ac6ab 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -5452,6 +5452,403 @@ tcp,orig=(src=4242::3,dst=4242::2,sport=<cleared>,dport=<cleared>),reply=(src=42
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- router load-balancer tuple conflict IPv4])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_KEYWORDS([ovnlb])
+
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# vm1 --- LS --- RTR --- LS-EXT --- vm-ext
+# 1 load balancer configured on RTR with vm1 as backend.
+
+check ovn-nbctl                                                  \
+    -- lr-add rtr                                                \
+    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
+    -- ls-add ls                                                 \
+    -- lsp-add ls ls-rtr                                         \
+    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
+    -- lsp-set-type ls-rtr router                                \
+    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
+    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
+    -- lrp-add rtr rtr-ls-ext 00:00:00:00:02:00 43.43.43.1/24    \
+    -- ls-add ls-ext                                             \
+    -- lsp-add ls-ext ls-ext-rtr                                 \
+    -- lsp-set-addresses ls-ext-rtr 00:00:00:00:02:00            \
+    -- lsp-set-type ls-ext-rtr router                            \
+    -- lsp-set-options ls-ext-rtr router-port=rtr-ls-ext         \
+    -- lsp-add ls-ext vm-ext                                     \
+    -- lsp-set-addresses vm-ext 00:00:00:00:02:01                \
+    -- set logical_router rtr options:chassis=hv1                \
+    -- set logical_router rtr options:lb_force_snat_ip=router_ip \
+    -- lb-add lb-test-rtr 66.66.66.66:666 42.42.42.2:4242 tcp    \
+    -- lr-lb-add rtr lb-test-rtr
+
+ADD_NAMESPACES(vm1)
+ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
+
+ADD_NAMESPACES(vm-ext)
+ADD_VETH(vm-ext, vm-ext, br-int, "43.43.43.2/24", "00:00:00:00:02:01",
+         "43.43.43.1")
+
+# Wait for ovn-controller to catch up.
+wait_for_ports_up
+check ovn-nbctl --wait=hv sync
+
+# Start IPv4 TCP server on vm1.
+NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc1.pid])
+
+AS_BOX([with lb_force_snat])
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Make sure connecting to the VIP works.
+NS_CHECK_EXEC([vm-ext], [nc 66.66.66.66 666 -p 2000 -z])
+
+# Start IPv4 TCP connection to VIP from vm-ext.
+NETNS_DAEMONIZE([vm-ext], [nc 66.66.66.66 666 -p 2001], [nc2.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-conntrack | \
+FORMAT_CT(66.66.66.66) | wc -l` = 1])
+
+# Start IPv4 TCP connection to backend IP using the same source port.
+NS_CHECK_EXEC([vm-ext], [nc 42.42.42.2 4242 -p 2001 -z])
+
+# Check conntrack.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(66.66.66.66) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/'], [0], [dnl
+tcp,orig=(src=43.43.43.2,dst=66.66.66.66,sport=<cleared>,dport=<cleared>),reply=(src=42.42.42.2,dst=43.43.43.2,sport=<cleared>,dport=<cleared>),zone=<cleared>,labels=0x2,protoinfo=(state=<cleared>)
+])
+
+AS_BOX([without lb_force_snat])
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+check ovn-nbctl remove logical_router rtr options lb_force_snat_ip
+
+# Wait for ovn-controller to catch up.
+check ovn-nbctl --wait=hv sync
+
+# Make sure connecting to the VIP works.
+NS_CHECK_EXEC([vm-ext], [nc 66.66.66.66 666 -p 4000 -z])
+
+# Start IPv4 TCP connection to VIP from vm-ext.
+NETNS_DAEMONIZE([vm-ext], [nc 66.66.66.66 666 -p 4001], [nc3.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-conntrack | \
+FORMAT_CT(66.66.66.66) | wc -l` = 1])
+
+# Start IPv4 TCP connection to backend IP using the same source port.
+NS_CHECK_EXEC([vm-ext], [nc 42.42.42.2 4242 -p 4001 -z])
+
+# Check conntrack.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(66.66.66.66) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/'], [0], [dnl
+tcp,orig=(src=43.43.43.2,dst=66.66.66.66,sport=<cleared>,dport=<cleared>),reply=(src=42.42.42.2,dst=43.43.43.2,sport=<cleared>,dport=<cleared>),zone=<cleared>,labels=0x2,protoinfo=(state=<cleared>)
+])
+
+AT_CLEANUP
+])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- router load-balancer tuple conflict IPv6])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_KEYWORDS([ovnlb])
+
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# vm1 --- LS --- RTR --- LS-EXT --- vm-ext
+# 1 load balancer configured on RTR with vm1 as backend.
+
+check ovn-nbctl                                                  \
+    -- lr-add rtr                                                \
+    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 4242::1/64           \
+    -- ls-add ls                                                 \
+    -- lsp-add ls ls-rtr                                         \
+    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
+    -- lsp-set-type ls-rtr router                                \
+    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
+    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
+    -- lrp-add rtr rtr-ls-ext 00:00:00:00:02:00 4343::1/64       \
+    -- ls-add ls-ext                                             \
+    -- lsp-add ls-ext ls-ext-rtr                                 \
+    -- lsp-set-addresses ls-ext-rtr 00:00:00:00:02:00            \
+    -- lsp-set-type ls-ext-rtr router                            \
+    -- lsp-set-options ls-ext-rtr router-port=rtr-ls-ext         \
+    -- lsp-add ls-ext vm-ext                                     \
+    -- lsp-set-addresses vm-ext 00:00:00:00:02:01                \
+    -- set logical_router rtr options:chassis=hv1                \
+    -- set logical_router rtr options:lb_force_snat_ip=router_ip \
+    -- lb-add lb-test-rtr [[6666::1]]:666 [[4242::2]]:4242 tcp   \
+    -- lr-lb-add rtr lb-test-rtr
+
+ADD_NAMESPACES(vm1)
+ADD_VETH(vm1, vm1, br-int, "4242::2/64", "00:00:00:00:00:01", "4242::1")
+OVS_WAIT_UNTIL([test "$(ip netns exec vm1 ip a | grep 4242::2 | grep tentative)" = ""])
+
+ADD_NAMESPACES(vm-ext)
+ADD_VETH(vm-ext, vm-ext, br-int, "4343::2/64", "00:00:00:00:02:01",
+         "4343::1")
+OVS_WAIT_UNTIL([test "$(ip netns exec vm-ext ip a | grep 4343::2 | grep tentative)" = ""])
+
+# Wait for ovn-controller to catch up.
+wait_for_ports_up
+check ovn-nbctl --wait=hv sync
+
+# Start IPv6 TCP server on vm1.
+NETNS_DAEMONIZE([vm1], [nc -k -l 4242::2 4242], [nc1.pid])
+
+AS_BOX([with lb_force_snat])
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Make sure connecting to the VIP works.
+NS_CHECK_EXEC([vm-ext], [nc 6666::1 666 -p 2000 -z])
+
+# Start IPv6 TCP connection to VIP from vm-ext.
+NETNS_DAEMONIZE([vm-ext], [nc 6666::1 666 -p 2001], [nc2.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-conntrack | \
+FORMAT_CT(6666::1) | wc -l` = 1])
+
+# Start IPv6 TCP connection to backend IP using the same source port.
+NS_CHECK_EXEC([vm-ext], [nc 4242::2 4242 -p 2001 -z])
+
+# Check conntrack.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(6666::1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/'], [0], [dnl
+tcp,orig=(src=4343::2,dst=6666::1,sport=<cleared>,dport=<cleared>),reply=(src=4242::2,dst=4343::2,sport=<cleared>,dport=<cleared>),zone=<cleared>,labels=0x2,protoinfo=(state=<cleared>)
+])
+
+AS_BOX([without lb_force_snat])
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+check ovn-nbctl remove logical_router rtr options lb_force_snat_ip
+
+# Wait for ovn-controller to catch up.
+check ovn-nbctl --wait=hv sync
+
+# Make sure connecting to the VIP works.
+NS_CHECK_EXEC([vm-ext], [nc 6666::1 666 -p 4000 -z])
+
+# Start IPv6 TCP connection to VIP from vm-ext.
+NETNS_DAEMONIZE([vm-ext], [nc 6666::1 666 -p 4001], [nc3.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-conntrack | \
+FORMAT_CT(6666::1) | wc -l` = 1])
+
+# Start IPv6 TCP connection to backend IP using the same source port.
+NS_CHECK_EXEC([vm-ext], [nc 4242::2 4242 -p 4001 -z])
+
+# Check conntrack.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(6666::1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/'], [0], [dnl
+tcp,orig=(src=4343::2,dst=6666::1,sport=<cleared>,dport=<cleared>),reply=(src=4242::2,dst=4343::2,sport=<cleared>,dport=<cleared>),zone=<cleared>,labels=0x2,protoinfo=(state=<cleared>)
+])
+
+AT_CLEANUP
+])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- router DNAT and no-NAT tuple conflict IPv4])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_KEYWORDS([ovnlb])
+
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# vm1 --- LS --- RTR --- LS-EXT --- vm-ext
+# 1 DNAT-and-SNAT entry configured on RTR for vm1.
+
+check ovn-nbctl                                                  \
+    -- lr-add rtr                                                \
+    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
+    -- ls-add ls                                                 \
+    -- lsp-add ls ls-rtr                                         \
+    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
+    -- lsp-set-type ls-rtr router                                \
+    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
+    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
+    -- lrp-add rtr rtr-ls-ext 00:00:00:00:02:00 43.43.43.1/24    \
+    -- ls-add ls-ext                                             \
+    -- lsp-add ls-ext ls-ext-rtr                                 \
+    -- lsp-set-addresses ls-ext-rtr 00:00:00:00:02:00            \
+    -- lsp-set-type ls-ext-rtr router                            \
+    -- lsp-set-options ls-ext-rtr router-port=rtr-ls-ext         \
+    -- lsp-add ls-ext vm-ext                                     \
+    -- lsp-set-addresses vm-ext 00:00:00:00:02:01                \
+    -- set logical_router rtr options:chassis=hv1                \
+    -- lr-nat-add rtr dnat_and_snat 66.66.66.66 42.42.42.2 vm1 00:00:01:00:00:01
+
+ADD_NAMESPACES(vm1)
+ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
+
+ADD_NAMESPACES(vm-ext)
+ADD_VETH(vm-ext, vm-ext, br-int, "43.43.43.2/24", "00:00:00:00:02:01",
+         "43.43.43.1")
+
+# Wait for ovn-controller to catch up.
+wait_for_ports_up
+check ovn-nbctl --wait=hv sync
+
+# Start IPv4 TCP server on vm1.
+NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc1.pid])
+
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Make sure connecting to the FIP works.
+NS_CHECK_EXEC([vm-ext], [nc 66.66.66.66 4242 -p 2000 -z])
+
+# Start IPv4 TCP connection to FIP from vm-ext.
+NETNS_DAEMONIZE([vm-ext], [nc 66.66.66.66 4242 -p 2001], [nc2.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-conntrack | \
+FORMAT_CT(66.66.66.66) | wc -l` = 1])
+
+# Start IPv4 TCP connection to vm IP using the same source port.
+NS_CHECK_EXEC([vm-ext], [nc 42.42.42.2 4242 -p 2001 -z])
+
+# Check conntrack.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(66.66.66.66) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/'], [0], [dnl
+tcp,orig=(src=43.43.43.2,dst=66.66.66.66,sport=<cleared>,dport=<cleared>),reply=(src=42.42.42.2,dst=43.43.43.2,sport=<cleared>,dport=<cleared>),zone=<cleared>,protoinfo=(state=<cleared>)
+])
+
+AT_CLEANUP
+])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- router DNAT and no-NAT tuple conflict IPv6])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_KEYWORDS([ovnlb])
+
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# vm1 --- LS --- RTR --- LS-EXT --- vm-ext
+# 1 DNAT-and-SNAT entry configured on RTR for vm1.
+
+check ovn-nbctl                                                  \
+    -- lr-add rtr                                                \
+    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 4242::1/64           \
+    -- ls-add ls                                                 \
+    -- lsp-add ls ls-rtr                                         \
+    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
+    -- lsp-set-type ls-rtr router                                \
+    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
+    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
+    -- lrp-add rtr rtr-ls-ext 00:00:00:00:02:00 4343::1/64       \
+    -- ls-add ls-ext                                             \
+    -- lsp-add ls-ext ls-ext-rtr                                 \
+    -- lsp-set-addresses ls-ext-rtr 00:00:00:00:02:00            \
+    -- lsp-set-type ls-ext-rtr router                            \
+    -- lsp-set-options ls-ext-rtr router-port=rtr-ls-ext         \
+    -- lsp-add ls-ext vm-ext                                     \
+    -- lsp-set-addresses vm-ext 00:00:00:00:02:01                \
+    -- set logical_router rtr options:chassis=hv1                \
+    -- lr-nat-add rtr dnat_and_snat 6666::1 4242::2 vm1 00:00:01:00:00:01
+
+ADD_NAMESPACES(vm1)
+ADD_VETH(vm1, vm1, br-int, "4242::2/64", "00:00:00:00:00:01", "4242::1")
+OVS_WAIT_UNTIL([test "$(ip netns exec vm1 ip a | grep 4242::2 | grep tentative)" = ""])
+
+ADD_NAMESPACES(vm-ext)
+ADD_VETH(vm-ext, vm-ext, br-int, "4343::2/64", "00:00:00:00:02:01", "4343::1")
+OVS_WAIT_UNTIL([test "$(ip netns exec vm-ext ip a | grep 4343::2 | grep tentative)" = ""])
+
+# Wait for ovn-controller to catch up.
+wait_for_ports_up
+check ovn-nbctl --wait=hv sync
+
+# Start IPv6 TCP server on vm1.
+NETNS_DAEMONIZE([vm1], [nc -k -l 4242::2 4242], [nc1.pid])
+
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Make sure connecting to the FIP works.
+NS_CHECK_EXEC([vm-ext], [nc 6666::1 4242 -p 2000 -z])
+
+# Start IPv6 TCP connection to FIP from vm-ext.
+NETNS_DAEMONIZE([vm-ext], [nc 6666::1 4242 -p 2001], [nc2.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-conntrack | \
+FORMAT_CT(6666::1) | wc -l` = 1])
+
+# Start IPv6 TCP connection to vm IP using the same source port.
+NS_CHECK_EXEC([vm-ext], [nc 4242::2 4242 -p 2001 -z])
+
+# Check conntrack.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(6666::1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/'], [0], [dnl
+tcp,orig=(src=4343::2,dst=6666::1,sport=<cleared>,dport=<cleared>),reply=(src=4242::2,dst=4343::2,sport=<cleared>,dport=<cleared>),zone=<cleared>,protoinfo=(state=<cleared>)
+])
+
+AT_CLEANUP
+])
+
 # When a lport is released on a chassis, ovn-controller was
 # not clearing some of the flowss in the table 33 leading
 # to packet drops if ct() is hit.
diff mbox series

Patch

diff --git a/controller/lflow.c b/controller/lflow.c
index 680b8cca1..af6a237d1 100644
--- a/controller/lflow.c
+++ b/controller/lflow.c
@@ -582,6 +582,7 @@  add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
         .is_switch = datapath_is_switch(dp),
         .group_table = l_ctx_out->group_table,
         .meter_table = l_ctx_out->meter_table,
+        .dp_features = ovs_feature_support_get(),
         .lflow_uuid = lflow->header_.uuid,
 
         .pipeline = ingress ? OVNACT_P_INGRESS : OVNACT_P_EGRESS,
diff --git a/include/ovn/actions.h b/include/ovn/actions.h
index 040213177..35983498b 100644
--- a/include/ovn/actions.h
+++ b/include/ovn/actions.h
@@ -25,6 +25,7 @@ 
 #include "openvswitch/hmap.h"
 #include "openvswitch/uuid.h"
 #include "util.h"
+#include "ovn/features.h"
 
 struct expr;
 struct lexer;
@@ -756,6 +757,9 @@  struct ovnact_encode_params {
     /* A struct to figure out the meter_id for meter actions. */
     struct ovn_extend_table *meter_table;
 
+    /* OVS datapath supported features. */
+    enum ovs_feature_support dp_features;
+
     /* The logical flow uuid that drove this action. */
     struct uuid lflow_uuid;
 
diff --git a/lib/actions.c b/lib/actions.c
index b3433f49e..c8c2443ce 100644
--- a/lib/actions.c
+++ b/lib/actions.c
@@ -732,7 +732,7 @@  format_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc, struct ds *s)
 
 static void
 encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
-                    const struct ovnact_encode_params *ep OVS_UNUSED,
+                    const struct ovnact_encode_params *ep,
                     struct ofpbuf *ofpacts)
 {
     struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
@@ -742,6 +742,21 @@  encode_CT_COMMIT_V1(const struct ovnact_ct_commit_v1 *cc,
     ct->zone_src.ofs = 0;
     ct->zone_src.n_bits = 16;
 
+    /* If the datapath supports all-zero SNAT then use it to avoid tuple
+     * collisions at commit time between NATed and firewalled-only sessions.
+     */
+    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
+        size_t nat_offset = ofpacts->size;
+        ofpbuf_pull(ofpacts, nat_offset);
+
+        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
+        nat->flags = 0;
+        nat->range_af = AF_UNSPEC;
+        nat->flags |= NX_NAT_F_SRC;
+        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
+        ct = ofpacts->header;
+    }
+
     size_t set_field_offset = ofpacts->size;
     ofpbuf_pull(ofpacts, set_field_offset);
 
@@ -780,7 +795,7 @@  format_CT_COMMIT_V2(const struct ovnact_nest *on, struct ds *s)
 
 static void
 encode_CT_COMMIT_V2(const struct ovnact_nest *on,
-                    const struct ovnact_encode_params *ep OVS_UNUSED,
+                    const struct ovnact_encode_params *ep,
                     struct ofpbuf *ofpacts)
 {
     struct ofpact_conntrack *ct = ofpact_put_CT(ofpacts);
@@ -792,6 +807,21 @@  encode_CT_COMMIT_V2(const struct ovnact_nest *on,
     ct->zone_src.ofs = 0;
     ct->zone_src.n_bits = 16;
 
+    /* If the datapath supports all-zero SNAT then use it to avoid tuple
+     * collisions at commit time between NATed and firewalled-only sessions.
+     */
+    if (ep->dp_features & OVS_CT_ZERO_SNAT_SUPPORT) {
+        size_t nat_offset = ofpacts->size;
+        ofpbuf_pull(ofpacts, nat_offset);
+
+        struct ofpact_nat *nat = ofpact_put_NAT(ofpacts);
+        nat->flags = 0;
+        nat->range_af = AF_UNSPEC;
+        nat->flags |= NX_NAT_F_SRC;
+        ofpacts->header = ofpbuf_push_uninit(ofpacts, nat_offset);
+        ct = ofpacts->header;
+    }
+
     size_t set_field_offset = ofpacts->size;
     ofpbuf_pull(ofpacts, set_field_offset);
 
diff --git a/tests/ovn.at b/tests/ovn.at
index f26894ce4..638aa5010 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -23109,7 +23109,7 @@  AT_CHECK([
     for hv in 1 2; do
         grep table=15 hv${hv}flows | \
         grep "priority=100" | \
-        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
+        grep -c "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],.*exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
 
         grep table=22 hv${hv}flows | \
         grep "priority=200" | \
diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
index c8fa6f03f..b742a2cb9 100644
--- a/tests/system-common-macros.at
+++ b/tests/system-common-macros.at
@@ -330,3 +330,7 @@  m4_define([OVS_CHECK_IPROUTE_ENCAP],
 # OVS_CHECK_CT_CLEAR()
 m4_define([OVS_CHECK_CT_CLEAR],
     [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])])
+
+# OVS_CHECK_CT_ZERO_SNAT()
+m4_define([OVS_CHECK_CT_ZERO_SNAT],
+    [AT_SKIP_IF([! grep -q "Datapath supports ct_zero_snat" ovs-vswitchd.log])])
diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index 310bd3d5a..61d6bfc86 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -5296,6 +5296,144 @@  OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv4])
+AT_KEYWORDS([ovnlb])
+
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+OVS_CHECK_CT_ZERO_SNAT()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# 1 logical switch connected to one logical router.
+# 2 VMs, one used as backend for a load balancer.
+
+check ovn-nbctl                                                  \
+    -- lr-add rtr                                                \
+    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 42.42.42.1/24        \
+    -- ls-add ls                                                 \
+    -- lsp-add ls ls-rtr                                         \
+    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
+    -- lsp-set-type ls-rtr router                                \
+    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
+    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
+    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
+    -- lb-add lb-test 66.66.66.66:666 42.42.42.2:4242 tcp        \
+    -- ls-lb-add ls lb-test
+
+ADD_NAMESPACES(vm1)
+ADD_VETH(vm1, vm1, br-int, "42.42.42.2/24", "00:00:00:00:00:01", "42.42.42.1")
+
+ADD_NAMESPACES(vm2)
+ADD_VETH(vm2, vm2, br-int, "42.42.42.3/24", "00:00:00:00:00:02", "42.42.42.1")
+
+# Wait for ovn-controller to catch up.
+wait_for_ports_up
+check ovn-nbctl --wait=hv sync
+
+# Start IPv4 TCP server on vm1.
+NETNS_DAEMONIZE([vm1], [nc -k -l 42.42.42.2 4242], [nc-vm1.pid])
+
+# Make sure connecting to the VIP works.
+NS_CHECK_EXEC([vm2], [nc 66.66.66.66 666 -p 2000 -z])
+
+# Start IPv4 TCP connection to VIP from vm2.
+NETNS_DAEMONIZE([vm2], [nc 66.66.66.66 666 -p 2001], [nc1-vm2.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(42.42.42.2)])
+
+# Start IPv4 TCP connection to backend IP from vm2 which would require
+# additional source port translation to avoid a tuple conflict.
+NS_CHECK_EXEC([vm2], [nc 42.42.42.2 4242 -p 2001 -z])
+
+AT_CLEANUP
+])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- load-balancer and firewall tuple conflict IPv6])
+AT_KEYWORDS([ovnlb])
+
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+OVS_CHECK_CT_ZERO_SNAT()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# 1 logical switch connected to one logical router.
+# 2 VMs, one used as backend for a load balancer.
+
+check ovn-nbctl                                                  \
+    -- lr-add rtr                                                \
+    -- lrp-add rtr rtr-ls 00:00:00:00:01:00 4242::1/64           \
+    -- ls-add ls                                                 \
+    -- lsp-add ls ls-rtr                                         \
+    -- lsp-set-addresses ls-rtr 00:00:00:00:01:00                \
+    -- lsp-set-type ls-rtr router                                \
+    -- lsp-set-options ls-rtr router-port=rtr-ls                 \
+    -- lsp-add ls vm1 -- lsp-set-addresses vm1 00:00:00:00:00:01 \
+    -- lsp-add ls vm2 -- lsp-set-addresses vm2 00:00:00:00:00:02 \
+    -- lb-add lb-test [[6666::1]]:666 [[4242::2]]:4242 tcp       \
+    -- ls-lb-add ls lb-test
+
+ADD_NAMESPACES(vm1)
+ADD_VETH(vm1, vm1, br-int, "4242::2/64", "00:00:00:00:00:01", "4242::1")
+OVS_WAIT_UNTIL([test "$(ip netns exec vm1 ip a | grep 4242::2 | grep tentative)" = ""])
+
+ADD_NAMESPACES(vm2)
+ADD_VETH(vm2, vm2, br-int, "4242::3/64", "00:00:00:00:00:02", "4242::1")
+OVS_WAIT_UNTIL([test "$(ip netns exec vm2 ip a | grep 4242::3 | grep tentative)" = ""])
+
+# Wait for ovn-controller to catch up.
+wait_for_ports_up
+check ovn-nbctl --wait=hv sync
+
+# Start IPv6 TCP server on vm1.
+NETNS_DAEMONIZE([vm1], [nc -k -l 4242::2 4242], [nc-vm1.pid])
+
+# Make sure connecting to the VIP works.
+NS_CHECK_EXEC([vm2], [nc 6666::1 666 -p 2000 -z])
+
+# Start IPv6 TCP connection to VIP from vm2.
+NETNS_DAEMONIZE([vm2], [nc 6666::1 666 -p 2001], [nc1-vm2.pid])
+
+# Check conntrack.
+OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-conntrack | grep ESTABLISHED | FORMAT_CT(4242::2)])
+
+# Start IPv6 TCP connection to backend IP from vm2 which would require
+# additional source port translation to avoid a tuple conflict.
+NS_CHECK_EXEC([vm2], [nc 4242::2 4242 -p 2001 -z])
+
+AT_CLEANUP
+])
+
 # When a lport is released on a chassis, ovn-controller was
 # not clearing some of the flowss in the table 33 leading
 # to packet drops if ct() is hit.