========
On certain Fortville NICs it has been observed that PHY UP detection can
get delayed (sometimes up to 4-5 secs). When the driver fails to fetch
PHY status as UP even though its actually UP, LACP packets can get
exchanged and LACP slave brought UP. In such a case, the remote end
would start sending traffic on that slave, but OVS drops it, as the
bond-slave is not yet enabled due to PHY DOWN.
Fix:
====
The main intention here is delay LACP negotiation until carrier (PHY)
status is read as UP.
1. In port_run()/bundle_run(), cache the carrier status in
"struct ofport_dpif"/"struct bond_slave".
2. When carrier state is DOWN, do not send any LACPDUs and
drop any received LACPDUs.
3. When LACP state changes or LACPDU is received, trigger re-checking
of carrier-state (in port_run()) by incrementing the connectivity
sequence number to find out the true carrier state as fast as
possible.
Signed-off-by: Manohar K C
<manohar.krishnappa.chidambaraswamy@ericsson.com>
CC: Jan Scheurich <jan.scheurich@ericsson.com>
CC: Nitin Katiyar <nitin.katiyar@ericsson.com>
---
v1 1/2: Evaluate lacp may_enable inline in the datapath thread and
check for pending "enable" in the bond admissibility check to avoid
drops.
lib/lacp.c | 51 +++++++++++++++++++++++++++++++++++++-------
lib/lacp.h | 6 +++---
ofproto/bond.c | 30 ++++++++++++++++++++++++++
ofproto/bond.h | 3 +++
ofproto/ofproto-dpif-xlate.c | 4 +++-
ofproto/ofproto-dpif.c | 11 +++++++---
tests/ofproto-dpif.at | 3 +++
7 files changed, 93 insertions(+), 15 deletions(-)
@@ -33,6 +33,7 @@
#include "unixctl.h"
#include "openvswitch/vlog.h"
#include "util.h"
+#include "ofproto/bond.h"
VLOG_DEFINE_THIS_MODULE(lacp);
@@ -148,7 +149,7 @@ static void slave_get_actor(struct slave *, struct lacp_info *actor)
OVS_REQUIRES(mutex);
static void slave_get_priority(struct slave *, struct lacp_info *priority)
OVS_REQUIRES(mutex);
-static bool slave_may_tx(const struct slave *)
+static bool slave_may_tx(const void *bond, const struct slave *)
OVS_REQUIRES(mutex);
static struct slave *slave_lookup(const struct lacp *, const void *slave)
OVS_REQUIRES(mutex);
@@ -325,7 +326,7 @@ lacp_is_active(const struct lacp *lacp) OVS_EXCLUDED(mutex)
* called on all packets received on 'slave_' with Ethernet Type ETH_TYPE_LACP.
*/
void
-lacp_process_packet(struct lacp *lacp, const void *slave_,
+lacp_process_packet(struct lacp *lacp, const void *bond, const void *slave_,
const struct dp_packet *packet)
OVS_EXCLUDED(mutex)
{
@@ -333,6 +334,7 @@ lacp_process_packet(struct lacp *lacp, const void *slave_,
const struct lacp_pdu *pdu;
long long int tx_rate;
struct slave *slave;
+ bool carrier_up = false;
lacp_lock();
slave = slave_lookup(lacp, slave_);
@@ -348,6 +350,21 @@ lacp_process_packet(struct lacp *lacp, const void *slave_,
goto out;
}
+ /*
+ * On some NICs L1 state reporting is slow. In case LACP packets are
+ * received while carrier (L1) state is still down, drop the LACPDU and
+ * trigger re-checking of L1 state.
+ */
+ carrier_up = bond_slave_get_carrier(bond, slave_);
+ if (!carrier_up) {
+ seq_change(connectivity_seq_get());
+
+ VLOG_INFO("carrier still DOWN - conn seq changed for slave %s, "
+ "dropping packet\n", slave->name);
+
+ goto out;
+ }
+
slave->status = LACP_CURRENT;
tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX;
timer_set_duration(&slave->rx, LACP_RX_MULTIPLIER * tx_rate);
@@ -521,7 +538,8 @@ lacp_slave_is_current(const struct lacp *lacp, const void *slave_)
/* This function should be called periodically to update 'lacp'. */
void
-lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
+lacp_run(struct lacp *lacp, const void *bond,
+ lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
{
struct slave *slave;
@@ -551,7 +569,7 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
HMAP_FOR_EACH (slave, node, &lacp->slaves) {
struct lacp_info actor;
- if (!slave_may_tx(slave)) {
+ if (!slave_may_tx(bond, slave)) {
continue;
}
@@ -580,13 +598,13 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
/* Causes poll_block() to wake up when lacp_run() needs to be called again. */
void
-lacp_wait(struct lacp *lacp) OVS_EXCLUDED(mutex)
+lacp_wait(struct lacp *lacp, const void *bond) OVS_EXCLUDED(mutex)
{
struct slave *slave;
lacp_lock();
HMAP_FOR_EACH (slave, node, &lacp->slaves) {
- if (slave_may_tx(slave)) {
+ if (slave_may_tx(bond, slave)) {
timer_wait(&slave->tx);
}
@@ -810,9 +828,26 @@ slave_get_priority(struct slave *slave, struct lacp_info *priority)
}
static bool
-slave_may_tx(const struct slave *slave) OVS_REQUIRES(mutex)
+slave_may_tx(const void *bond, const struct slave *slave) OVS_REQUIRES(mutex)
{
- return slave->lacp->active || slave->status != LACP_DEFAULTED;
+ bool carrier_up = true;
+
+ if (bond) {
+ /*
+ * On some NICs L1 state reporting is slow. If carrier (L1) state is
+ * still down, do not send LACPDU and trigger re-checking of L1 state.
+ */
+ carrier_up = bond_slave_get_carrier(bond, slave->aux);
+ if (!carrier_up) {
+ seq_change(connectivity_seq_get());
+
+ VLOG_INFO("carrier still DOWN - conn seq changed for slave %s, "
+ "avoiding tx\n", slave->name);
+ }
+ }
+
+ return carrier_up &&
+ (slave->lacp->active || slave->status != LACP_DEFAULTED);
}
static struct slave *
@@ -46,7 +46,7 @@ struct lacp *lacp_ref(const struct lacp *);
void lacp_configure(struct lacp *, const struct lacp_settings *);
bool lacp_is_active(const struct lacp *);
-void lacp_process_packet(struct lacp *, const void *slave,
+void lacp_process_packet(struct lacp *, const void *bond, const void *slave,
const struct dp_packet *packet);
enum lacp_status lacp_status(const struct lacp *);
@@ -67,8 +67,8 @@ bool lacp_slave_is_current(const struct lacp *, const void *slave_);
/* Callback function for lacp_run() for sending a LACP PDU. */
typedef void lacp_send_pdu(void *slave, const void *pdu, size_t pdu_size);
-void lacp_run(struct lacp *, lacp_send_pdu *);
-void lacp_wait(struct lacp *);
+void lacp_run(struct lacp *, const void *bond, lacp_send_pdu *);
+void lacp_wait(struct lacp *, const void *bond);
struct lacp_slave_stats {
/* id */
@@ -93,6 +93,7 @@ struct bond_slave {
/* Link status. */
bool enabled; /* May be chosen for flows? */
bool may_enable; /* Client considers this slave bondable. */
+ bool carrier_up; /* Carrier state from NIC port */
long long delay_expires; /* Time after which 'enabled' may change. */
/* Rebalancing info. Used only by bond_rebalance(). */
@@ -633,6 +634,35 @@ bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
ovs_rwlock_unlock(&rwlock);
}
@@ -6322,6 +6322,9 @@ OVS_VSWITCHD_START([dnl
other_config:lacp-port-priority=222 \
other_config:lacp-aggregation-key=3333 ])
+ovs-appctl netdev-dummy/set-admin-state p1 up
+ovs-appctl netdev-dummy/set-admin-state p2 up
+
on_exit 'kill `cat test-sflow.pid`'
AT_CHECK([ovstest test-sflow --log-file --detach --no-chdir --pidfile 0:127.0.0.1 > sflow.log], [0], [], [ignore])
AT_CAPTURE_FILE([sflow.log])
_______________________________________________
dev mailing list
dev@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev