diff mbox series

[ovs-dev,v3,2/2] Don't send or receive LACP PDUs when carrier state of slave is down

Message ID 1560118772-28225-1-git-send-email-nitin.katiyar@ericsson.com
State Accepted
Headers show
Series None | expand

Commit Message

Nitin Katiyar June 9, 2019, 2:18 p.m. UTC
Fortville NICs (or their drivers) can get into an inconsistent state,
in which the NIC can actually transmit and receive packets even
though they report "PHY down". In such a state, OVS can exchange and
process LACP messages and enable a LACP slave. However, further packet
exchange over the slave fails because OVS sees that the PHY is down.

This commit fixes the problem by making OVS ignore received LACP PDUs
and suppress transmitting LACP PDUs when carrier is down. In addition,
when a LACP PDU is received with carrier down, this commit triggers
rechecking the carrier status (by incrementing the connectivity sequence
number) to ensure that it is updated as quickly as possible.

Signed-off-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com>
Co-authored-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com>
Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
---
 lib/lacp.c             | 35 ++++++++++++++++++++++++++++++-----
 lib/lacp.h             |  3 ++-
 ofproto/ofproto-dpif.c | 14 +++++++-------
 3 files changed, 39 insertions(+), 13 deletions(-)

Comments

Ben Pfaff June 10, 2019, 4:14 p.m. UTC | #1
On Sun, Jun 09, 2019 at 02:18:10PM +0000, Nitin Katiyar wrote:
> Fortville NICs (or their drivers) can get into an inconsistent state,
> in which the NIC can actually transmit and receive packets even
> though they report "PHY down". In such a state, OVS can exchange and
> process LACP messages and enable a LACP slave. However, further packet
> exchange over the slave fails because OVS sees that the PHY is down.
> 
> This commit fixes the problem by making OVS ignore received LACP PDUs
> and suppress transmitting LACP PDUs when carrier is down. In addition,
> when a LACP PDU is received with carrier down, this commit triggers
> rechecking the carrier status (by incrementing the connectivity sequence
> number) to ensure that it is updated as quickly as possible.

Thanks, applied to master.
diff mbox series

Patch

diff --git a/lib/lacp.c b/lib/lacp.c
index e768012..16c823b 100644
--- a/lib/lacp.c
+++ b/lib/lacp.c
@@ -122,6 +122,7 @@  struct slave {
 
     enum slave_status status;     /* Slave status. */
     bool attached;                /* Attached. Traffic may flow. */
+    bool carrier_up;              /* Carrier state of link. */
     struct lacp_info partner;     /* Partner information. */
     struct lacp_info ntt_actor;   /* Used to decide if we Need To Transmit. */
     struct timer tx;              /* Next message transmission timer. */
@@ -350,6 +351,16 @@  lacp_process_packet(struct lacp *lacp, const void *slave_,
         goto out;
     }
 
+    /* On some NICs L1 state reporting is slow. In case LACP packets are
+     * received while carrier (L1) state is still down, drop the LACP PDU and
+     * trigger re-checking of L1 state. */
+    if (!slave->carrier_up) {
+        VLOG_INFO_RL(&rl, "%s: carrier state is DOWN,"
+                     " dropping received LACP PDU.", slave->name);
+        seq_change(connectivity_seq_get());
+        goto out;
+    }
+
     slave->status = LACP_CURRENT;
     tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX;
     timer_set_duration(&slave->rx, LACP_RX_MULTIPLIER * tx_rate);
@@ -456,7 +467,8 @@  lacp_slave_unregister(struct lacp *lacp, const void *slave_)
 /* This function should be called whenever the carrier status of 'slave_' has
  * changed.  If 'lacp' is null, this function has no effect.*/
 void
-lacp_slave_carrier_changed(const struct lacp *lacp, const void *slave_)
+lacp_slave_carrier_changed(const struct lacp *lacp, const void *slave_,
+                           bool carrier_up)
     OVS_EXCLUDED(mutex)
 {
     struct slave *slave;
@@ -473,7 +485,11 @@  lacp_slave_carrier_changed(const struct lacp *lacp, const void *slave_)
     if (slave->status == LACP_CURRENT || slave->lacp->active) {
         slave_set_expired(slave);
     }
-    slave->count_carrier_changed++;
+
+    if (slave->carrier_up != carrier_up) {
+        slave->carrier_up = carrier_up;
+        slave->count_carrier_changed++;
+    }
 
 out:
     lacp_unlock();
@@ -498,11 +514,18 @@  lacp_slave_may_enable(const struct lacp *lacp, const void *slave_)
 {
     if (lacp) {
         struct slave *slave;
-        bool ret;
+        bool ret = false;
 
         lacp_lock();
         slave = slave_lookup(lacp, slave_);
-        ret = slave ? slave_may_enable__(slave) : false;
+        if (slave) {
+            /* It is only called when carrier is up. So, enable slave's
+             * carrier state if it is currently down. */
+            if (!slave->carrier_up) {
+                slave->carrier_up = true;
+            }
+            ret = slave_may_enable__(slave);
+        }
         lacp_unlock();
         return ret;
     } else {
@@ -820,7 +843,9 @@  slave_get_priority(struct slave *slave, struct lacp_info *priority)
 static bool
 slave_may_tx(const struct slave *slave) OVS_REQUIRES(mutex)
 {
-    return slave->lacp->active || slave->status != LACP_DEFAULTED;
+    /* Check for L1 state as well as LACP state. */
+    return (slave->carrier_up) && ((slave->lacp->active) ||
+            (slave->status != LACP_DEFAULTED));
 }
 
 static struct slave *
diff --git a/lib/lacp.h b/lib/lacp.h
index 0dfaef0..d731ae9 100644
--- a/lib/lacp.h
+++ b/lib/lacp.h
@@ -60,7 +60,8 @@  struct lacp_slave_settings {
 void lacp_slave_register(struct lacp *, void *slave_,
                          const struct lacp_slave_settings *);
 void lacp_slave_unregister(struct lacp *, const void *slave);
-void lacp_slave_carrier_changed(const struct lacp *, const void *slave);
+void lacp_slave_carrier_changed(const struct lacp *, const void *slave,
+                                bool carrier_up);
 bool lacp_slave_may_enable(const struct lacp *, const void *slave);
 bool lacp_slave_is_current(const struct lacp *, const void *slave_);
 
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index db461ac..cb6baf3 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -3606,11 +3606,6 @@  ofport_update_peer(struct ofport_dpif *ofport)
 static bool
 may_enable_port(struct ofport_dpif *ofport)
 {
-    /* Carrier must be up. */
-    if (!netdev_get_carrier(ofport->up.netdev)) {
-        return false;
-    }
-
     /* If CFM or BFD is enabled, then at least one of them must report that the
      * port is up. */
     if ((ofport->bfd || ofport->cfm)
@@ -3636,12 +3631,17 @@  port_run(struct ofport_dpif *ofport)
 {
     long long int carrier_seq = netdev_get_carrier_resets(ofport->up.netdev);
     bool carrier_changed = carrier_seq != ofport->carrier_seq;
+    bool enable = netdev_get_carrier(ofport->up.netdev);
+
     ofport->carrier_seq = carrier_seq;
     if (carrier_changed && ofport->bundle) {
-        lacp_slave_carrier_changed(ofport->bundle->lacp, ofport);
+        lacp_slave_carrier_changed(ofport->bundle->lacp, ofport, enable);
+    }
+
+    if (enable) {
+        enable = may_enable_port(ofport);
     }
 
-    bool enable = may_enable_port(ofport);
     if (ofport->up.may_enable != enable) {
         ofproto_port_set_enable(&ofport->up, enable);