@@ -514,8 +514,7 @@ conn prevent_unencrypted_vxlan
self.IPSEC_D = "sql:" + libreswan_root_prefix + ipsec_d
self.IPSEC_CTL = libreswan_root_prefix + ipsec_ctl
self.conf_file = None
- self.conns_not_active = set()
- self.last_refresh = time.time()
+ self.conns_not_active = {"added": {}, "started": {}}
self.secrets_file = None
self.use_default_conn = self.IPSEC_CONF == self.ROOT_IPSEC_CONF
self.use_default_crypto = args.use_default_crypto
@@ -642,6 +641,9 @@ conn prevent_unencrypted_vxlan
name = self.CERT_PREFIX + tunnel.conf["remote_name"]
self._nss_delete_cert(name)
+ def conn_should_reconcile(self, timestamp):
+ return time.time() >= timestamp + RECONCILIATION_INTERVAL
+
def refresh(self, monitor):
vlog.info("Refreshing LibreSwan configuration")
run_command(self.IPSEC_AUTO + ["--ctlsocket", self.IPSEC_CTL,
@@ -662,12 +664,18 @@ conn prevent_unencrypted_vxlan
active = set(active_conns.get(name, dict()).keys())
# Untrack connections that became active.
- self.conns_not_active.difference_update(active)
+ for conn in active:
+ self.conns_not_active["added"].pop(conn, None)
+ self.conns_not_active["started"].pop(conn, None)
+
# Remove connections that didn't become active after --start
- # and another explicit --up.
- for conn in self.conns_not_active & loaded:
- self._delete_ipsec_connection(conn, "is defunct")
- loaded.remove(conn)
+ # or an explicit --up.
+ for conn in list(loaded):
+ started = self.conns_not_active["started"].get(conn, None)
+
+ if started and self.conn_should_reconcile(started):
+ self._delete_ipsec_connection(conn, "is defunct")
+ loaded.remove(conn)
# Remove all the loaded or active but not desired connections.
for conn in loaded | active:
@@ -706,10 +714,11 @@ conn prevent_unencrypted_vxlan
# desired == loaded and desired >= loaded + active,
# so loaded >= active
for conn in loaded - active:
- vlog.info("Bringing up ipsec connection %s" % conn)
- # On failure to --up it will be removed from the set.
- self.conns_not_active.add(conn)
- self._start_ipsec_connection(conn, "up")
+ added = self.conns_not_active["added"].get(conn, None)
+
+ if added and self.conn_should_reconcile(added):
+ vlog.info("Bringing up ipsec connection %s" % conn)
+ self._start_ipsec_connection(conn, "up")
# Update shunt policy if changed
if monitor.conf_in_use["skb_mark"] != monitor.conf["skb_mark"]:
@@ -746,7 +755,13 @@ conn prevent_unencrypted_vxlan
"--delete",
"--asynchronous", "prevent_unencrypted_vxlan"])
monitor.conf_in_use["skb_mark"] = monitor.conf["skb_mark"]
- self.last_refresh = time.time()
+
+ if monitor.tunnels and \
+ not self.conns_not_active["added"] and \
+ not self.conns_not_active["started"]:
+ vlog.info("Connections for all(%d) configured tunnels are Up."
+ % len(monitor.tunnels))
+
vlog.info("Refreshing is done.")
def get_conns_from_status(self, pattern):
@@ -809,31 +824,51 @@ conn prevent_unencrypted_vxlan
return conns
def need_to_reconcile(self, monitor):
- if time.time() - self.last_refresh < RECONCILIATION_INTERVAL:
- return False
+ timestamps = (
+ timestamp for group in ("added", "started")
+ for timestamp in self.conns_not_active[group].values()
+ )
+ earliest = min(timestamps, default=None)
+
+ if earliest and self.conn_should_reconcile(earliest):
+ vlog.dbg("Need to check the state of the following connections: %s"
+ % self.conns_not_active)
+ return True
conns_dict = self.get_active_conns()
for ifname, tunnel in monitor.tunnels.items():
- if ifname not in conns_dict:
- vlog.info("Connection for port %s is not active, "
- "need to reconcile" % ifname)
- return True
-
- existing_conns = conns_dict.get(ifname)
+ existing_conns = set(conns_dict.get(ifname, dict()).keys())
desired_conns = self.get_conn_names(monitor, ifname)
- if set(existing_conns.keys()) != set(desired_conns):
+ for conn in existing_conns.symmetric_difference(desired_conns):
+ timestamp = None
+ for group in ("added", "started"):
+ timestamp = self.conns_not_active[group].get(conn, None)
+ if timestamp:
+ break
+
+ # If connection is on one of the 'not_active' lists, it should
+ # not be reconciled, unless sufficient time have passed.
+ #
+ # If it's not on any of the lists, it means that one of the
+ # previously active connections somehow became non-active, and
+ # we need to reconcile.
+ if timestamp and not self.conn_should_reconcile(timestamp):
+ continue
+
vlog.info("Active connections for port %s %s do not match "
"desired %s, need to reconcile"
- % (ifname, list(existing_conns.keys()),
- desired_conns))
+ % (ifname, list(existing_conns), desired_conns))
return True
return False
def _delete_ipsec_connection(self, conn, reason):
vlog.info("%s %s, removing" % (conn, reason))
- self.conns_not_active.discard(conn)
+
+ self.conns_not_active["added"].pop(conn, None)
+ self.conns_not_active["started"].pop(conn, None)
+
run_command(self.IPSEC_AUTO +
["--ctlsocket", self.IPSEC_CTL,
"--config", self.ROOT_IPSEC_CONF,
@@ -858,6 +893,15 @@ conn prevent_unencrypted_vxlan
# We don't know in which state the connection was left on
# failure. Try to clean it up.
self._delete_ipsec_connection(conn, "--%s failed" % action)
+ else:
+ # Update timestamps, make sure the connection is only tracked
+ # in one of the 'not_active' lists.
+ if action == "add":
+ self.conns_not_active["added"][conn] = time.time()
+ self.conns_not_active["started"].pop(conn, None)
+ else:
+ self.conns_not_active["added"].pop(conn, None)
+ self.conns_not_active["started"][conn] = time.time()
def _nss_clear_database(self):
"""Remove all OVS IPsec related state from the NSS database"""
@@ -535,6 +535,96 @@ AT_CHECK([grep -q -E "(ike|ikev2|esp)=" $ovs_base/right/custom.conf], [1])
OVS_TRAFFIC_VSWITCHD_STOP()
AT_CLEANUP
+AT_SETUP([IPsec -- Libreswan - reconciliation interval is respected - ipv4])
+AT_KEYWORDS([ipsec libreswan ipv4 geneve psk reconciliation])
+dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
+dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988
+
+CHECK_LIBRESWAN()
+OVS_TRAFFIC_VSWITCHD_START()
+IPSEC_SETUP_UNDERLAY()
+
+m4_define([PSK_OPTIONS], [options:remote_ip=$1 options:psk=swordfish])
+dnl Set up only the left host.
+IPSEC_ADD_NODE_LEFT([10.1.1.1], [10.1.1.254])
+IPSEC_ADD_TUNNEL_LEFT([geneve], PSK_OPTIONS([10.1.1.2]))
+dnl Wait for the monitor to find the new connection.
+OVS_WAIT_UNTIL([grep -q 'tun appeared' left/ovs-monitor-ipsec.log])
+
+dnl Add a few more tunels waiting for the monitor to find them too.
+m4_for([id], [1], 5, [1], [
+ OVS_VSCTL([left], add-port br-ipsec tun-id \
+ -- set Interface tun-id type=geneve PSK_OPTIONS(10.1.2.id))
+ OVS_WAIT_UNTIL(grep -q 'tun-id appeared' left/ovs-monitor-ipsec.log)
+])
+dnl And now remove all the extra tunnels.
+m4_for([id], [1], 5, [1], [
+ OVS_VSCTL([left], del-port tun-id)
+ OVS_WAIT_UNTIL(grep -q 'tun-id disappeared' left/ovs-monitor-ipsec.log)
+])
+
+dnl Check that none of the connections were marked for reconciliation yet.
+dnl It should take at least 15 seconds for this to happen.
+AT_CHECK([grep -E 'half-loaded|defunct' left/ovs-monitor-ipsec.log], [1])
+AT_CHECK([grep 'Bringing up' left/ovs-monitor-ipsec.log], [1])
+dnl But we should eventually attempt to reconcile the original 'tun' tunnel.
+OVS_WAIT_UNTIL([grep -qE 'Bringing up ipsec connection tun-[[io]]' \
+ left/ovs-monitor-ipsec.log])
+dnl Now add the right host and check that connection works properly.
+IPSEC_ADD_NODE_RIGHT([10.1.1.2], [10.1.1.254])
+IPSEC_ADD_TUNNEL_RIGHT([geneve], PSK_OPTIONS([10.1.1.1]))
+
+CHECK_ESP_TRAFFIC
+
+OVS_TRAFFIC_VSWITCHD_STOP()
+AT_CLEANUP
+
+AT_SETUP([IPsec -- Libreswan - reconciliation interval is respected - ipv6])
+AT_KEYWORDS([ipsec libreswan ipv6 geneve psk reconciliation])
+dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
+dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988
+
+CHECK_LIBRESWAN()
+OVS_TRAFFIC_VSWITCHD_START()
+IPSEC_SETUP_UNDERLAY()
+
+m4_define([PSK_OPTIONS], [options:remote_ip=$1 options:psk=swordfish])
+
+dnl Set up only the left host.
+IPSEC_ADD_NODE_LEFT([fd01::101], [fd01::254])
+IPSEC_ADD_TUNNEL_LEFT([geneve], PSK_OPTIONS([fd01::102]))
+dnl Wait for the monitor to find the new connection.
+OVS_WAIT_UNTIL([grep -q 'tun appeared' left/ovs-monitor-ipsec.log])
+
+dnl Add a few more tunels waiting for the monitor to find them too.
+m4_for([id], [1], 5, [1], [
+ OVS_VSCTL([left], add-port br-ipsec tun-id \
+ -- set Interface tun-id type=geneve PSK_OPTIONS(fd02::id))
+ OVS_WAIT_UNTIL(grep -q 'tun-id appeared' left/ovs-monitor-ipsec.log)
+])
+dnl And now remove all the extra tunnels.
+m4_for([id], [1], 5, [1], [
+ OVS_VSCTL([left], del-port tun-id)
+ OVS_WAIT_UNTIL(grep -q 'tun-id disappeared' left/ovs-monitor-ipsec.log)
+])
+
+dnl Check that none of the connections were marked for reconciliation yet.
+dnl It should take at least 15 seconds for this to happen.
+AT_CHECK([grep -E 'half-loaded|defunct' left/ovs-monitor-ipsec.log], [1])
+AT_CHECK([grep 'Bringing up' left/ovs-monitor-ipsec.log], [1])
+dnl But we should eventually attempt to reconcile the original 'tun' tunnel.
+OVS_WAIT_UNTIL([grep -qE 'Bringing up ipsec connection tun-[[io]]' \
+ left/ovs-monitor-ipsec.log])
+
+dnl Now add the right host and check that connection works properly.
+IPSEC_ADD_NODE_RIGHT([fd01::102], [fd01::254])
+IPSEC_ADD_TUNNEL_RIGHT([geneve], PSK_OPTIONS([fd01::101]))
+
+CHECK_ESP_TRAFFIC
+
+OVS_TRAFFIC_VSWITCHD_STOP()
+AT_CLEANUP
+
AT_SETUP([IPsec -- Libreswan - established conns survive new additions - ipv4])
AT_KEYWORDS([ipsec libreswan ipv4 geneve psk persistence])
dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
@@ -752,6 +842,10 @@ if test -s stderr; then
auto=
fi
+dnl Wait for the monitor to be done with all the connections.
+OVS_WAIT_UNTIL([grep -q 'all(19) configured tunnels are Up' \
+ $ovs_base/node-1/ovs-monitor-ipsec.log])
+
dnl Remove connections for two tunnels. One fully and one partially.
AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
--config $ovs_base/node-1/ipsec.conf \
@@ -764,7 +858,7 @@ AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
--delete tun-2-out-1], [0], [stdout])
dnl Wait for the monitor to notice the missing connections.
-OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
+OVS_WAIT_UNTIL([grep -qE 'tun-[[25]] .*need to reconcile' \
$ovs_base/node-1/ovs-monitor-ipsec.log])
dnl Wait for all the connections to be loaded back.
Currently we're only tracking the last refresh time and perform reconciliation of non-active connections on every refresh. This is causing issues in large clusters when tunnels are added sequentially. Consider the following example: 1. Tun-1 added -> refresh() -> Tun-1: adding 'in' and starting 'out'. 2. Tun-2 added -> refresh() -> Tun-2: adding 'in' and starting 'out'. -> Tun-1: The other side didn't have time to initiate the 'in' connection yet, so it is not active. But we see that it's not active and trying to start it. 3. Tun-3 added -> refresh() -> Tun-3: adding 'in' and starting 'out'. -> Tun-2: The other side didn't have time to initiate the 'in' connection yet, so it is not active. But we see that it's not active and trying to start it. -> Tun-1: The connection still had no time to become active, but we declare it 'defunct' and re-creating. Behavior above is specific to Libreswan 4. Libreswan 5 will report UP connections as active in most cases, so they will not be marked as defunct, but they will still be started quickly after addition when it is not needed. This creates unnecessary churn in the cluster and puts Libreswan into an uncomfortable position where crossing stream issues (where both sides are trying to establish the same connection at the same time) are far more likely. Fix that by specifically tracking time when we add or start each connection instead of just the last time we refreshed for any reason. This should make ovs-monitor-ipsec to actually wait for the reconciliation interval before attempting to repair connections and give Libreswan a decent amount of time to process the changes and try to establish connections normally. Note: even though we could precisely track 15 seconds for each individual connection and wake up when exactly 15 seconds expire, we're not doing that in this patch. The reason is that we still need to wake up every 15 seconds to check that all the previously active connections are still active, and doing that allows for refreshing many connections in the same run instead of waking up every second just for one connection. Fixes: 25a301822e0d ("ipsec: libreswan: Reconcile missing connections periodically.") Reported-at: https://issues.redhat.com/browse/FDP-1364 Signed-off-by: Ilya Maximets <i.maximets@ovn.org> --- Version 2: * Changed the logic to track individual refresh time per non-active connection. [Eelco] ipsec/ovs-monitor-ipsec.in | 92 ++++++++++++++++++++++++++---------- tests/system-ipsec.at | 96 +++++++++++++++++++++++++++++++++++++- 2 files changed, 163 insertions(+), 25 deletions(-)