diff mbox series

[ovs-dev,v2,9/9] tests: ipsec: Check that nodes can ping each other in the NxN test.

Message ID 20241030135043.3139987-10-i.maximets@ovn.org
State Superseded
Headers show
Series ipsec: Resiliency to Libreswan failures. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

Ilya Maximets Oct. 30, 2024, 1:50 p.m. UTC
Expand the NxN test with the network connectivity check between all the
nodes.  Unfortunately, we can't really run this test with Libreswan 4.x,
since, due to internal issues in these versions, we are getting into
states where everything is loaded and active, but no traffic can pass.
This is an internal issue in Libreswan that we can't workaround from
the outside.  So, the fix is required in Libreswan itself.  4.5 and
earlier versions seem to not be affected by this problem, at least not
severely affected, but it's easier to just cut off all the 4.x versions
from the test.

3.32 version from Ubuntu 22.04 and Libreswna 5.1 work just fine with
this test.

Test is relatively long, but it is very valuable, IMO.  Besides
stressing ovs-monitor-ipsec with various failure and asynchronous
connection establishment conditions, which are important for OVS, it
also was used to reproduce and fix several bugs in Libreswan 4.x.
Unfortunately, not all the issues are understood and fixed yet.

Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 tests/system-ipsec.at | 82 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 74 insertions(+), 8 deletions(-)

Comments

Eelco Chaudron Oct. 31, 2024, 11:46 a.m. UTC | #1
On 30 Oct 2024, at 14:50, Ilya Maximets wrote:

> Expand the NxN test with the network connectivity check between all the
> nodes.  Unfortunately, we can't really run this test with Libreswan 4.x,
> since, due to internal issues in these versions, we are getting into
> states where everything is loaded and active, but no traffic can pass.
> This is an internal issue in Libreswan that we can't workaround from
> the outside.  So, the fix is required in Libreswan itself.  4.5 and
> earlier versions seem to not be affected by this problem, at least not
> severely affected, but it's easier to just cut off all the 4.x versions
> from the test.
>
> 3.32 version from Ubuntu 22.04 and Libreswna 5.1 work just fine with
> this test.
>
> Test is relatively long, but it is very valuable, IMO.  Besides
> stressing ovs-monitor-ipsec with various failure and asynchronous
> connection establishment conditions, which are important for OVS, it
> also was used to reproduce and fix several bugs in Libreswan 4.x.
> Unfortunately, not all the issues are understood and fixed yet.

I do not like the fact that now the inclusion or not for the traffic test is not visible. I rather split up the test and get a “skipped” indication for the traffic part.

WDYT?

//Eelco

> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
> ---
>  tests/system-ipsec.at | 82 ++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 74 insertions(+), 8 deletions(-)
>
> diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at
> index 5aa67bf1d..8e6733371 100644
> --- a/tests/system-ipsec.at
> +++ b/tests/system-ipsec.at
> @@ -71,7 +71,9 @@ m4_define([IPSEC_ADD_NODE],
>    on_exit "kill `cat $ovs_base/$1/ovs-monitor-ipsec.pid`"
>
>    dnl Set up OVS bridge
> -  NS_EXEC([$1], [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec])]
> +  NS_CHECK_EXEC([$1],
> +    [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec \
> +               -- set-controller br-ipsec punix:$ovs_base/br-ipsec.$1.mgmt])]
>  )
>  m4_define([IPSEC_ADD_NODE_LEFT], [IPSEC_ADD_NODE(left, p0, $1, $2)])
>  m4_define([IPSEC_ADD_NODE_RIGHT], [IPSEC_ADD_NODE(right, p1, $1, $2)])
> @@ -429,7 +431,8 @@ m4_for([id], [1], NODES, [1], [
>                  self-sign node-id], [0], [stdout])
>    AT_CHECK(OVS_VSCTL([node-id], set Open_vSwitch . \
>        other_config:certificate=${ovs_base}/node-id-cert.pem \
> -      other_config:private_key=${ovs_base}/node-id-privkey.pem),
> +      other_config:private_key=${ovs_base}/node-id-privkey.pem \
> +      -- set bridge br-ipsec other-config:hwaddr=f2:ff:00:00:00:id),
>        [0], [ignore], [ignore])
>    on_exit "ipsec --rundir $ovs_base/node-id status > $ovs_base/node-id/status"
>  ])
> @@ -445,11 +448,18 @@ m4_for([LEFT], [1], NODES, [1], [
>      fi
>  ])])
>
> +dnl These are not necessary, but nice to have in the test log in
> +dnl order to spot pluto failures during the test.
> +on_exit "grep -E 'timed out|outdated|half-loaded|defunct' \
> +            $ovs_base/node-*/ovs-monitor-ipsec.log"
> +on_exit "grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log"
> +
>  m4_define([WAIT_FOR_LOADED_CONNS], [
>    m4_for([id], [1], NODES, [1], [
>      echo "================== node-id ========================="
>      iterations=0
>      loaded=0
> +    active=0
>      dnl Using a custom loop instead of OVS_WAIT_UNTIL, because it may take
>      dnl much longer than a default timeout.  The default retransmit timeout
>      dnl for pluto is 60 seconds.  Also, we need to make sure pluto didn't
> @@ -463,8 +473,11 @@ m4_define([WAIT_FOR_LOADED_CONNS], [
>          START_PLUTO([node-id])
>        else
>          loaded=$(IPSEC_STATUS_LOADED(node-id))
> +        m4_if([$1], [active],
> +              [active=$(IPSEC_STATUS_ACTIVE(node-id))], [active=$loaded])
>        fi
> -      if test "$loaded" -ne $(( (NODES - 1) * 2 )); then
> +      if test "$loaded" -ne "$(( (NODES - 1) * 2 ))" -o \
> +              "$loaded" -ne "$active"; then
>          sleep 3
>        else
>          break
> @@ -505,11 +518,64 @@ OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
>  dnl Wait for all the connections to be loaded back.
>  WAIT_FOR_LOADED_CONNS()
>
> -dnl These are not necessary, but nice to have in the test log in
> -dnl order to spot pluto failures during the test.
> -grep -E 'timed out|outdated|half-loaded|defunct' \
> -            $ovs_base/node-*/ovs-monitor-ipsec.log
> -grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log
> +dnl Next section will check connectivity between all the nodes.
> +dnl Different versions of Libreswan 4.x have issues where connections
> +dnl are not being correctly established or never become active in a
> +dnl way that can not be mitigated from ovs-monitor-ipsec or the test.
> +dnl So, only checking connectivity for Libreswan 3- or 5+.
> +if ! (ipsec --version 2>&1 | grep -q 'Libreswan 4\.'); then
> +  dnl Turn off IPv6 and add static ARP entries for all namespaces to avoid
> +  dnl any broadcast / multicast traffic that would otherwise be multiplied
> +  dnl by each node creating a traffic storm.  Add specific OpenFlow rules
> +  dnl to forward traffic to exact destinations without any MAC learning.
> +  m4_for([LEFT], [1], NODES, [1], [
> +    NS_CHECK_EXEC([node-LEFT], [sysctl -w net.ipv6.conf.all.disable_ipv6=1],
> +                  [0], [ignore])
> +    AT_CHECK([ovs-ofctl del-flows unix:$ovs_base/br-ipsec.node-LEFT.mgmt])
> +    AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
> +                    "dl_dst=f2:ff:00:00:00:LEFT actions=LOCAL"])
> +    m4_for([RIGHT], [1], NODES, [1], [
> +      if test LEFT -ne RIGHT; then
> +        NS_CHECK_EXEC([node-LEFT],
> +          [ip neigh add 192.0.0.RIGHT lladdr f2:ff:00:00:00:RIGHT dev br-ipsec])
> +        AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
> +                    "dl_dst=f2:ff:00:00:00:RIGHT actions=tun-RIGHT"])
> +      fi
> +    ])
> +  ])
> +
> +  dnl Bring up and add IP addresses for br-ipsec interface.
> +  m4_for([id], [1], NODES, [1], [
> +    echo "================== node-id ========================="
> +    NS_CHECK_EXEC([node-id], [ip addr add 192.0.0.id/24 dev br-ipsec])
> +    NS_CHECK_EXEC([node-id], [ip link set dev br-ipsec up])
> +  ])
> +
> +  dnl Wait for all the connections to be loaded and active.  In case one of
> +  dnl the pluto processes crashed some of the connections may never become
> +  dnl active.  But we did run this loop with a pluto reviving logic twice
> +  dnl already, so the chances for pluto to be down here are much lower.
> +  WAIT_FOR_LOADED_CONNS([active])
> +
> +  dnl Check the full mesh ping.
> +  m4_for([LEFT], [1], NODES, [1], [
> +    m4_for([RIGHT], [1], NODES, [1], [
> +      if test LEFT -ne RIGHT; then
> +        echo "====== ping: node-LEFT --> node-RIGHT =========="
> +        dnl Ping without checking in case connection will recover after the
> +        dnl first packet.
> +        NS_CHECK_EXEC([node-LEFT],
> +                      [ping -q -c 1 -W 2 192.0.0.RIGHT | FORMAT_PING],
> +                      [ignore], [stdout])
> +        dnl Now check.  If this one fails, there is no actual connectivity.
> +        NS_CHECK_EXEC([node-LEFT],
> +                      [ping -q -c 3 -i 0.1 -W 2 192.0.0.RIGHT | FORMAT_PING],
> +                      [0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +      fi
> +  ])])
> +fi
>
>  OVS_TRAFFIC_VSWITCHD_STOP()
>  AT_CLEANUP
> -- 
> 2.46.0
Ilya Maximets Oct. 31, 2024, 4:42 p.m. UTC | #2
On 10/31/24 12:46, Eelco Chaudron wrote:
> 
> 
> On 30 Oct 2024, at 14:50, Ilya Maximets wrote:
> 
>> Expand the NxN test with the network connectivity check between all the
>> nodes.  Unfortunately, we can't really run this test with Libreswan 4.x,
>> since, due to internal issues in these versions, we are getting into
>> states where everything is loaded and active, but no traffic can pass.
>> This is an internal issue in Libreswan that we can't workaround from
>> the outside.  So, the fix is required in Libreswan itself.  4.5 and
>> earlier versions seem to not be affected by this problem, at least not
>> severely affected, but it's easier to just cut off all the 4.x versions
>> from the test.
>>
>> 3.32 version from Ubuntu 22.04 and Libreswna 5.1 work just fine with
>> this test.
>>
>> Test is relatively long, but it is very valuable, IMO.  Besides
>> stressing ovs-monitor-ipsec with various failure and asynchronous
>> connection establishment conditions, which are important for OVS, it
>> also was used to reproduce and fix several bugs in Libreswan 4.x.
>> Unfortunately, not all the issues are understood and fixed yet.
> 
> I do not like the fact that now the inclusion or not for the traffic test
> is not visible. I rather split up the test and get a “skipped” indication
> for the traffic part.
> 
> WDYT?

Yeah, I'm not a big fan of this either.  We have a few options:

1. Copy the test and only test the configuration in one and
   ping in the other.

2. Put the test into a macro and call it twice with and without
   ping, so the one with ping can be skipped.

3. Add SKIP_IF() check in the middle of the test.

The first option will cause some code duplication, which doesn't
sound great.  The second one is OK, but it will obscure line numbers
when something inside the macro fails, which is not very friendly
while debugging the test.

The third option will solve the code duplication and we'll also
not run the configuration check twice saving the time.  We will
still run the configuration part with Libreswan 4, even if the
test will be skipped afterwards, so the test will fail if the
configuration fails.  This is not very straightforward, but at
least the test will actually run on all systems and it will be
clear that it wasn't run in full on systems with libreswan 4,
so there will be no ambiguity.

WDYT?

Best regards, Ilya Maximets.
Eelco Chaudron Oct. 31, 2024, 5:05 p.m. UTC | #3
On 31 Oct 2024, at 17:42, Ilya Maximets wrote:

> On 10/31/24 12:46, Eelco Chaudron wrote:
>>
>>
>> On 30 Oct 2024, at 14:50, Ilya Maximets wrote:
>>
>>> Expand the NxN test with the network connectivity check between all the
>>> nodes.  Unfortunately, we can't really run this test with Libreswan 4.x,
>>> since, due to internal issues in these versions, we are getting into
>>> states where everything is loaded and active, but no traffic can pass.
>>> This is an internal issue in Libreswan that we can't workaround from
>>> the outside.  So, the fix is required in Libreswan itself.  4.5 and
>>> earlier versions seem to not be affected by this problem, at least not
>>> severely affected, but it's easier to just cut off all the 4.x versions
>>> from the test.
>>>
>>> 3.32 version from Ubuntu 22.04 and Libreswna 5.1 work just fine with
>>> this test.
>>>
>>> Test is relatively long, but it is very valuable, IMO.  Besides
>>> stressing ovs-monitor-ipsec with various failure and asynchronous
>>> connection establishment conditions, which are important for OVS, it
>>> also was used to reproduce and fix several bugs in Libreswan 4.x.
>>> Unfortunately, not all the issues are understood and fixed yet.
>>
>> I do not like the fact that now the inclusion or not for the traffic test
>> is not visible. I rather split up the test and get a “skipped” indication
>> for the traffic part.
>>
>> WDYT?
>
> Yeah, I'm not a big fan of this either.  We have a few options:
>
> 1. Copy the test and only test the configuration in one and
>    ping in the other.
>
> 2. Put the test into a macro and call it twice with and without
>    ping, so the one with ping can be skipped.
>
> 3. Add SKIP_IF() check in the middle of the test.
>
> The first option will cause some code duplication, which doesn't
> sound great.  The second one is OK, but it will obscure line numbers
> when something inside the macro fails, which is not very friendly
> while debugging the test.
>
> The third option will solve the code duplication and we'll also
> not run the configuration check twice saving the time.  We will
> still run the configuration part with Libreswan 4, even if the
> test will be skipped afterwards, so the test will fail if the
> configuration fails.  This is not very straightforward, but at
> least the test will actually run on all systems and it will be
> clear that it wasn't run in full on systems with libreswan 4,
> so there will be no ambiguity.
>
> WDYT?

Option 3 appears to be the most suitable choice for me.


> Best regards, Ilya Maximets.
diff mbox series

Patch

diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at
index 5aa67bf1d..8e6733371 100644
--- a/tests/system-ipsec.at
+++ b/tests/system-ipsec.at
@@ -71,7 +71,9 @@  m4_define([IPSEC_ADD_NODE],
   on_exit "kill `cat $ovs_base/$1/ovs-monitor-ipsec.pid`"
 
   dnl Set up OVS bridge
-  NS_EXEC([$1], [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec])]
+  NS_CHECK_EXEC([$1],
+    [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec \
+               -- set-controller br-ipsec punix:$ovs_base/br-ipsec.$1.mgmt])]
 )
 m4_define([IPSEC_ADD_NODE_LEFT], [IPSEC_ADD_NODE(left, p0, $1, $2)])
 m4_define([IPSEC_ADD_NODE_RIGHT], [IPSEC_ADD_NODE(right, p1, $1, $2)])
@@ -429,7 +431,8 @@  m4_for([id], [1], NODES, [1], [
                 self-sign node-id], [0], [stdout])
   AT_CHECK(OVS_VSCTL([node-id], set Open_vSwitch . \
       other_config:certificate=${ovs_base}/node-id-cert.pem \
-      other_config:private_key=${ovs_base}/node-id-privkey.pem),
+      other_config:private_key=${ovs_base}/node-id-privkey.pem \
+      -- set bridge br-ipsec other-config:hwaddr=f2:ff:00:00:00:id),
       [0], [ignore], [ignore])
   on_exit "ipsec --rundir $ovs_base/node-id status > $ovs_base/node-id/status"
 ])
@@ -445,11 +448,18 @@  m4_for([LEFT], [1], NODES, [1], [
     fi
 ])])
 
+dnl These are not necessary, but nice to have in the test log in
+dnl order to spot pluto failures during the test.
+on_exit "grep -E 'timed out|outdated|half-loaded|defunct' \
+            $ovs_base/node-*/ovs-monitor-ipsec.log"
+on_exit "grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log"
+
 m4_define([WAIT_FOR_LOADED_CONNS], [
   m4_for([id], [1], NODES, [1], [
     echo "================== node-id ========================="
     iterations=0
     loaded=0
+    active=0
     dnl Using a custom loop instead of OVS_WAIT_UNTIL, because it may take
     dnl much longer than a default timeout.  The default retransmit timeout
     dnl for pluto is 60 seconds.  Also, we need to make sure pluto didn't
@@ -463,8 +473,11 @@  m4_define([WAIT_FOR_LOADED_CONNS], [
         START_PLUTO([node-id])
       else
         loaded=$(IPSEC_STATUS_LOADED(node-id))
+        m4_if([$1], [active],
+              [active=$(IPSEC_STATUS_ACTIVE(node-id))], [active=$loaded])
       fi
-      if test "$loaded" -ne $(( (NODES - 1) * 2 )); then
+      if test "$loaded" -ne "$(( (NODES - 1) * 2 ))" -o \
+              "$loaded" -ne "$active"; then
         sleep 3
       else
         break
@@ -505,11 +518,64 @@  OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
 dnl Wait for all the connections to be loaded back.
 WAIT_FOR_LOADED_CONNS()
 
-dnl These are not necessary, but nice to have in the test log in
-dnl order to spot pluto failures during the test.
-grep -E 'timed out|outdated|half-loaded|defunct' \
-            $ovs_base/node-*/ovs-monitor-ipsec.log
-grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log
+dnl Next section will check connectivity between all the nodes.
+dnl Different versions of Libreswan 4.x have issues where connections
+dnl are not being correctly established or never become active in a
+dnl way that can not be mitigated from ovs-monitor-ipsec or the test.
+dnl So, only checking connectivity for Libreswan 3- or 5+.
+if ! (ipsec --version 2>&1 | grep -q 'Libreswan 4\.'); then
+  dnl Turn off IPv6 and add static ARP entries for all namespaces to avoid
+  dnl any broadcast / multicast traffic that would otherwise be multiplied
+  dnl by each node creating a traffic storm.  Add specific OpenFlow rules
+  dnl to forward traffic to exact destinations without any MAC learning.
+  m4_for([LEFT], [1], NODES, [1], [
+    NS_CHECK_EXEC([node-LEFT], [sysctl -w net.ipv6.conf.all.disable_ipv6=1],
+                  [0], [ignore])
+    AT_CHECK([ovs-ofctl del-flows unix:$ovs_base/br-ipsec.node-LEFT.mgmt])
+    AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
+                    "dl_dst=f2:ff:00:00:00:LEFT actions=LOCAL"])
+    m4_for([RIGHT], [1], NODES, [1], [
+      if test LEFT -ne RIGHT; then
+        NS_CHECK_EXEC([node-LEFT],
+          [ip neigh add 192.0.0.RIGHT lladdr f2:ff:00:00:00:RIGHT dev br-ipsec])
+        AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
+                    "dl_dst=f2:ff:00:00:00:RIGHT actions=tun-RIGHT"])
+      fi
+    ])
+  ])
+
+  dnl Bring up and add IP addresses for br-ipsec interface.
+  m4_for([id], [1], NODES, [1], [
+    echo "================== node-id ========================="
+    NS_CHECK_EXEC([node-id], [ip addr add 192.0.0.id/24 dev br-ipsec])
+    NS_CHECK_EXEC([node-id], [ip link set dev br-ipsec up])
+  ])
+
+  dnl Wait for all the connections to be loaded and active.  In case one of
+  dnl the pluto processes crashed some of the connections may never become
+  dnl active.  But we did run this loop with a pluto reviving logic twice
+  dnl already, so the chances for pluto to be down here are much lower.
+  WAIT_FOR_LOADED_CONNS([active])
+
+  dnl Check the full mesh ping.
+  m4_for([LEFT], [1], NODES, [1], [
+    m4_for([RIGHT], [1], NODES, [1], [
+      if test LEFT -ne RIGHT; then
+        echo "====== ping: node-LEFT --> node-RIGHT =========="
+        dnl Ping without checking in case connection will recover after the
+        dnl first packet.
+        NS_CHECK_EXEC([node-LEFT],
+                      [ping -q -c 1 -W 2 192.0.0.RIGHT | FORMAT_PING],
+                      [ignore], [stdout])
+        dnl Now check.  If this one fails, there is no actual connectivity.
+        NS_CHECK_EXEC([node-LEFT],
+                      [ping -q -c 3 -i 0.1 -W 2 192.0.0.RIGHT | FORMAT_PING],
+                      [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+      fi
+  ])])
+fi
 
 OVS_TRAFFIC_VSWITCHD_STOP()
 AT_CLEANUP