diff mbox series

[ovs-dev,v2,8/9] tests: ipsec: Add NxN + reconciliation test.

Message ID 20241030135043.3139987-9-i.maximets@ovn.org
State Superseded
Headers show
Series ipsec: Resiliency to Libreswan failures. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

Ilya Maximets Oct. 30, 2024, 1:50 p.m. UTC
Add a test to check establishment of IPsec connections among multiple
nodes and check the reconciliation logic along the way.

The test:
  - Creates 20 network namespaces.
  - Starts Libreswan, OVS and ovs-monitor-ipsec in each of them.
  - Adds a geneve tunnel from each namespace to every other namespace.
  - Checks that each namespace has all the IPsec connections loaded.
  - Removes a few connections manually.
  - Checks that these connections are added back.

Unfortunately, pluto crashes frequently in many widely used versions
of Libreswan.  For that reason, the test restarts pluto whenever it
finds that the daemon has died.

Also, since retransmit-timeout is 60 seconds and our command timeout
is 120 seconds, we can't actually use the OVS_WAIT_UNTIL macro most of
the time, so the checks are done in a custom loop that waits up to
300 seconds.

Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 tests/system-ipsec.at | 138 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 125 insertions(+), 13 deletions(-)

Comments

Eelco Chaudron Oct. 31, 2024, 11:04 a.m. UTC | #1
On 30 Oct 2024, at 14:50, Ilya Maximets wrote:

> Add a test to check establishment of IPsec connections among multiple
> nodes and check the reconciliation logic along the way.
>
> The test:
>   - Creates 20 network namespaces.
>   - Starts Libreswan, OVS and ovs-monitor-ipsec in each of them.
>   - Adds a geneve tunnel from each namespace to every other namespace.
>   - Checks that each namespace has all the IPsec connections loaded.
>   - Removes a few connections manually.
>   - Checks that these connections are added back.
>
> Unfortunately, many widely used versions of Libreswan have issues
> of pluto crashing frequently.  For that reason the test is trying
> to bring pluto back online once it finds a dead one.
>
> Also, since retransmit-timeout is 60 seconds and our command timeout
> is 120, we can't actually use the OVS_WAIT_UNTIL macro most of the
> time, so the checks are done in the custom loop that waits up to
> 300 seconds.
>
> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>

This looks good to me. What a “mess” to get this to work ;)

Acked-by: Eelco Chaudron <echaudro@redhat.com>
diff mbox series

Patch

diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at
index 1e155fece..5aa67bf1d 100644
--- a/tests/system-ipsec.at
+++ b/tests/system-ipsec.at
@@ -8,6 +8,18 @@  m4_define([IPSEC_SETUP_UNDERLAY],
       dnl Set up the underlay switch
       AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])])
 
+m4_define([START_PLUTO], [
+  rm -f $ovs_base/$1/pluto.pid
+  mkdir -p $ovs_base/$1/ipsec.d
+  touch $ovs_base/$1/ipsec.conf
+  touch $ovs_base/$1/secrets
+  ipsec initnss --nssdir $ovs_base/$1/ipsec.d
+  NS_CHECK_EXEC([$1], [ipsec pluto --config $ovs_base/$1/ipsec.conf \
+        --ipsecdir $ovs_base/$1 --nssdir $ovs_base/$1/ipsec.d \
+        --logfile $ovs_base/$1/pluto.log --secretsfile $ovs_base/$1/secrets \
+        --rundir $ovs_base/$1], [0], [], [stderr])
+])
+
 dnl IPSEC_ADD_NODE([namespace], [device], [address], [peer address]))
 dnl
 dnl Creates a dummy host that acts as an IPsec endpoint. Creates host in
@@ -45,15 +57,8 @@  m4_define([IPSEC_ADD_NODE],
   on_exit "kill_ovs_vswitchd `cat $ovs_base/$1/vswitchd.pid`"
 
   dnl Start pluto
-  mkdir -p $ovs_base/$1/ipsec.d
-  touch $ovs_base/$1/ipsec.conf
-  touch $ovs_base/$1/secrets
-  ipsec initnss --nssdir $ovs_base/$1/ipsec.d
-  NS_CHECK_EXEC([$1], [ipsec pluto --config $ovs_base/$1/ipsec.conf \
-        --ipsecdir $ovs_base/$1 --nssdir $ovs_base/$1/ipsec.d \
-        --logfile $ovs_base/$1/pluto.log --secretsfile $ovs_base/$1/secrets \
-        --rundir $ovs_base/$1], [0], [], [stderr])
-  on_exit "kill `cat $ovs_base/$1/pluto.pid`"
+  START_PLUTO([$1])
+  on_exit 'kill $(cat $ovs_base/$1/pluto.pid)'
 
   dnl Start ovs-monitor-ipsec
   NS_CHECK_EXEC([$1], [ovs-monitor-ipsec unix:${OVS_RUNDIR}/$1/db.sock\
@@ -110,16 +115,18 @@  m4_define([CHECK_LIBRESWAN],
 dnl IPSEC_STATUS_LOADED([])
 dnl
 dnl Get number of loaded connections from ipsec status
-m4_define([IPSEC_STATUS_LOADED], [ipsec --rundir $ovs_base/$1 status | \
+m4_define([IPSEC_STATUS_LOADED], [
+           ipsec --rundir $ovs_base/$1 status | \
            grep "Total IPsec connections" | \
-           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\1/m'])
+           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-9]]*\), active \([[0-9]]*\).*/\1/m'])
 
 dnl IPSEC_STATUS_ACTIVE([])
 dnl
 dnl Get number of active connections from ipsec status
-m4_define([IPSEC_STATUS_ACTIVE], [ipsec --rundir $ovs_base/$1 status | \
+m4_define([IPSEC_STATUS_ACTIVE], [
+           ipsec --rundir $ovs_base/$1 status | \
            grep "Total IPsec connections" | \
-           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\2/m'])
+           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-9]]*\), active \([[0-9]]*\).*/\2/m'])
 
 dnl CHECK_ESP_TRAFFIC()
 dnl
@@ -401,3 +408,108 @@  CHECK_ESP_TRAFFIC
 
 OVS_TRAFFIC_VSWITCHD_STOP()
 AT_CLEANUP
+
+AT_SETUP([IPsec -- Libreswan NxN geneve tunnels + reconciliation])
+AT_KEYWORDS([ipsec libreswan scale reconciliation])
+dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
+dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988
+
+CHECK_LIBRESWAN()
+OVS_TRAFFIC_VSWITCHD_START()
+IPSEC_SETUP_UNDERLAY()
+
+m4_define([NODES], [20])
+
+dnl Set up fake hosts.
+m4_for([id], [1], NODES, [1], [
+  IPSEC_ADD_NODE([node-id], [p-id], 10.1.1.id, 10.1.1.254)
+  AT_CHECK([ovs-pki -b -d ${ovs_base} -l ${ovs_base}/ovs-pki.log \
+                req -u node-id], [0], [stdout])
+  AT_CHECK([ovs-pki -b -d ${ovs_base} -l ${ovs_base}/ovs-pki.log \
+                self-sign node-id], [0], [stdout])
+  AT_CHECK(OVS_VSCTL([node-id], set Open_vSwitch . \
+      other_config:certificate=${ovs_base}/node-id-cert.pem \
+      other_config:private_key=${ovs_base}/node-id-privkey.pem),
+      [0], [ignore], [ignore])
+  on_exit "ipsec --rundir $ovs_base/node-id status > $ovs_base/node-id/status"
+])
+
+dnl Create a full mesh of tunnels.
+m4_for([LEFT], [1], NODES, [1], [
+  m4_for([RIGHT], [1], NODES, [1], [
+    if test LEFT -ne RIGHT; then
+      AT_CHECK(OVS_VSCTL(node-LEFT, add-port br-ipsec tun-RIGHT \
+        -- set Interface tun-RIGHT type=geneve options:remote_ip=10.1.1.RIGHT \
+           options:remote_cert=${ovs_base}/node-RIGHT-cert.pem),
+        [0], [ignore], [ignore])
+    fi
+])])
+
+m4_define([WAIT_FOR_LOADED_CONNS], [
+  m4_for([id], [1], NODES, [1], [
+    echo "================== node-id ========================="
+    iterations=0
+    loaded=0
+    dnl Using a custom loop instead of OVS_WAIT_UNTIL, because it may take
+    dnl much longer than a default timeout (pluto's default retransmit
+    dnl timeout is 60 seconds).  Gives up after 100 iterations of 3 seconds.
+    dnl Pluto may crash in the process; if it did, it is restarted here.
+    while true; do
+      date
+      AT_CHECK([ipsec --rundir $ovs_base/node-id status 2>&1 \
+                    | grep -E "whack|Total"], [ignore], [stdout])
+      if grep -E 'is Pluto running?|refused' stdout; then
+        echo "node-id: Pluto died, restarting..."
+        START_PLUTO([node-id])
+      else
+        loaded=$(IPSEC_STATUS_LOADED(node-id))
+      fi
+      if test "$loaded" -ne $(( (NODES - 1) * 2 )); then
+        sleep 3
+      else
+        break
+      fi
+      iterations=$(($iterations + 1))
+      AT_CHECK([test $iterations -lt 100])
+    done
+  ])
+])
+
+dnl Wait for all the connections to be loaded to pluto.  Not waiting for
+dnl them to become active, because if pluto is down on one of the nodes,
+dnl some connections may not become active until we revive it.  Some
+dnl connections may also never become active due to bugs in libreswan 4.x.
+WAIT_FOR_LOADED_CONNS()
+
+AT_CHECK([ipsec auto --help], [ignore], [ignore], [stderr])
+auto=auto
+if test -s stderr; then
+    auto=
+fi
+
+dnl Remove connections for two tunnels.  One fully and one partially.
+AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
+                      --config $ovs_base/node-1/ipsec.conf \
+                      --delete tun-5-out-1], [0], [stdout])
+AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
+                      --config $ovs_base/node-1/ipsec.conf \
+                      --delete tun-2-in-1], [0], [stdout])
+AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
+                      --config $ovs_base/node-1/ipsec.conf \
+                      --delete tun-2-out-1], [0], [stdout])
+
+dnl Wait for the monitor to notice the missing connections.
+OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
+                    $ovs_base/node-1/ovs-monitor-ipsec.log])
+
+dnl Wait for all the connections to be loaded back.
+WAIT_FOR_LOADED_CONNS()
+
+dnl These are not necessary, but nice to have in the test log in
+dnl order to spot pluto failures during the test.
+grep -E 'timed out|outdated|half-loaded|defunct' \
+            $ovs_base/node-*/ovs-monitor-ipsec.log
+grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log
+
+OVS_TRAFFIC_VSWITCHD_STOP()
+AT_CLEANUP