
[net-next] bond: add support to read speed and duplex via ethtool

Message ID 1362595173-11442-1-git-send-email-andy@greyhouse.net
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Andy Gospodarek March 6, 2013, 6:39 p.m. UTC
This patch adds support for the get_settings ethtool op to the bonding
driver.  This was motivated by users who wanted to get the speed of the
bond and compare that against throughput to understand utilization.
The behavior before this patch was added was problematic when computing
line utilization after trying to get link-speed and throughput via SNMP.

The general plan for computing link-speed was as follows:

Mode                 Formula
----                 -------
active-backup        speed of current active slave
broadcast            speed of first slave with known speed
all other modes      aggregate speed of all slaves with known speed

Output from ethtool looks like this for a round-robin bond:

Settings for bond0:
	Supported ports: [ ]
	Supported link modes:   Not reported
	Supported pause frame use: No
	Supports auto-negotiation: No
	Advertised link modes:  Not reported
	Advertised pause frame use: No
	Advertised auto-negotiation: No
	Speed: 11000Mb/s
	Duplex: Full
	Port: Twisted Pair
	PHYAD: 0
	Transceiver: internal
	Auto-negotiation: off
	MDI-X: Unknown
	Link detected: yes

I tested this and verified it works as expected.  A test was also done
on a version backported to an older kernel and it worked well there.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
---
 drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

Comments

Veaceslav Falico March 6, 2013, 7:13 p.m. UTC | #1
On Wed, Mar 6, 2013 at 7:39 PM, Andy Gospodarek <andy@greyhouse.net> wrote:
> This patch adds support for the get_settings ethtool op to the bonding
> driver.  This was motivated by users who wanted to get the speed of the
> bond and compare that against throughput to understand utilization.
> The behavior before this patch was added was problematic when computing
> line utilization after trying to get link-speed and throughput via SNMP.
>
> The general plan for computing link-speed was as follows:
>
> Mode                 Formula
> ----                 -------
> active-backup        speed of current active slave
> broadcast            speed of first slave with known speed
> all other modes      aggregate speed of all slaves with known speed
>
> Output from ethtool looks like this for a round-robin bond:
>
> Settings for bond0:
>         Supported ports: [ ]
>         Supported link modes:   Not reported
>         Supported pause frame use: No
>         Supports auto-negotiation: No
>         Advertised link modes:  Not reported
>         Advertised pause frame use: No
>         Advertised auto-negotiation: No
>         Speed: 11000Mb/s
>         Duplex: Full
>         Port: Twisted Pair
>         PHYAD: 0
>         Transceiver: internal
>         Auto-negotiation: off
>         MDI-X: Unknown
>         Link detected: yes
>
> I tested this and verified it works as expected.  A test was also done
> on a version backported to an older kernel and it worked well there.
>
> Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
> ---
>  drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 7bd068a..6e70ff0 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -4224,6 +4224,52 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
>         }
>  }
>
> +static int bond_ethtool_get_settings(struct net_device *bond_dev,
> +                                    struct ethtool_cmd *ecmd)
> +{
> +       struct bonding *bond = netdev_priv(bond_dev);
> +       struct slave *slave;
> +       int i;
> +       unsigned long speed = 0;
> +
> +       ecmd->speed = SPEED_UNKNOWN;
> +       ecmd->duplex = DUPLEX_UNKNOWN;
> +
> +       read_lock(&bond->lock);
> +       switch (bond->params.mode) {
> +       case BOND_MODE_ACTIVEBACKUP:
> +               read_lock(&bond->curr_slave_lock);
> +               if (bond->curr_active_slave &&
> +                   bond->curr_active_slave->speed != SPEED_UNKNOWN) {
> +                       ecmd->speed = bond->curr_active_slave->speed;
> +                       ecmd->duplex = bond->curr_active_slave->duplex;
> +               }
> +               read_unlock(&bond->curr_slave_lock);
> +               break;
> +       case BOND_MODE_BROADCAST:
> +               bond_for_each_slave(bond, slave, i) {
> +                       if (slave->speed != SPEED_UNKNOWN) {
> +                               ecmd->speed = slave->speed;
> +                               ecmd->duplex = slave->duplex;

Maybe it should be 'the smallest speed (and then duplex)' here? In
broadcast mode we send all packets through all the slaves, so the
effective speed/duplex would be the speed/duplex of the slowest slave.
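
A sketch of that variant, reusing the patch's 'speed' local (hypothetical,
not the posted code; a max-speed variant would just flip the comparison):

	case BOND_MODE_BROADCAST:
		/* Report the slowest slave with a known speed, since
		 * every frame must traverse every slave in this mode. */
		bond_for_each_slave(bond, slave, i) {
			if (slave->speed == SPEED_UNKNOWN)
				continue;
			if (speed == 0 || slave->speed < speed) {
				speed = slave->speed;
				ecmd->duplex = slave->duplex;
			}
		}
		if (speed)
			ecmd->speed = speed;
		break;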

> +                               break;
> +                       }
> +               }
> +               break;
> +       default:
> +               bond_for_each_slave(bond, slave, i) {
> +                       if (slave->speed != SPEED_UNKNOWN) {
> +                               speed += slave->speed;
> +                       }
> +                       if (ecmd->duplex == DUPLEX_UNKNOWN &&
> +                           slave->duplex != DUPLEX_UNKNOWN)
> +                               ecmd->duplex = slave->duplex;
> +               }
> +               ecmd->speed = speed;
> +       }
> +       read_unlock(&bond->lock);
> +       return 0;
> +}
> +
>  static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
>                                      struct ethtool_drvinfo *drvinfo)
>  {
> @@ -4235,6 +4281,7 @@ static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
>
>  static const struct ethtool_ops bond_ethtool_ops = {
>         .get_drvinfo            = bond_ethtool_get_drvinfo,
> +       .get_settings           = bond_ethtool_get_settings,
>         .get_link               = ethtool_op_get_link,
>  };
>
> --
> 1.7.11.7
>
Jay Vosburgh March 6, 2013, 7:25 p.m. UTC | #2
Andy Gospodarek <andy@greyhouse.net> wrote:

>This patch adds support for the get_settings ethtool op to the bonding
>driver.  This was motivated by users who wanted to get the speed of the
>bond and compare that against throughput to understand utilization.
>The behavior before this patch was added was problematic when computing
>line utilization after trying to get link-speed and throughput via SNMP.
>
>The general plan for computing link-speed was as follows:
>
>Mode                 Formula
>----                 -------
>active-backup        speed of current active slave
>broadcast            speed of first slave with known speed
>all other modes      aggregate speed of all slaves with known speed

	I'll just point out that the balance-tlb mode is asymmetric; it
uses all slaves for transmission, but only one slave for reception.
Ethtool only has a single speed for both directions, so this is probably
the best choice, but it should still be noted.

>Output from ethtool looks like this for a round-robin bond:
>
>Settings for bond0:
>	Supported ports: [ ]
>	Supported link modes:   Not reported
>	Supported pause frame use: No
>	Supports auto-negotiation: No
>	Advertised link modes:  Not reported
>	Advertised pause frame use: No
>	Advertised auto-negotiation: No
>	Speed: 11000Mb/s
>	Duplex: Full
>	Port: Twisted Pair
>	PHYAD: 0
>	Transceiver: internal
>	Auto-negotiation: off
>	MDI-X: Unknown
>	Link detected: yes
>
>I tested this and verified it works as expected.  A test was also done
>on a version backported to an older kernel and it worked well there.
>
>Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
>---
> drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 47 insertions(+)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 7bd068a..6e70ff0 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -4224,6 +4224,52 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
> 	}
> }
>
>+static int bond_ethtool_get_settings(struct net_device *bond_dev,
>+				     struct ethtool_cmd *ecmd)
>+{
>+	struct bonding *bond = netdev_priv(bond_dev);
>+	struct slave *slave;
>+	int i;
>+	unsigned long speed = 0;
>+
>+	ecmd->speed = SPEED_UNKNOWN;
>+	ecmd->duplex = DUPLEX_UNKNOWN;
>+
>+	read_lock(&bond->lock);
>+	switch (bond->params.mode) {
>+	case BOND_MODE_ACTIVEBACKUP:
>+		read_lock(&bond->curr_slave_lock);
>+		if (bond->curr_active_slave &&
>+		    bond->curr_active_slave->speed != SPEED_UNKNOWN) {
>+			ecmd->speed = bond->curr_active_slave->speed;
>+			ecmd->duplex = bond->curr_active_slave->duplex;
>+		}
>+		read_unlock(&bond->curr_slave_lock);
>+		break;
>+	case BOND_MODE_BROADCAST:
>+		bond_for_each_slave(bond, slave, i) {
>+			if (slave->speed != SPEED_UNKNOWN) {
>+				ecmd->speed = slave->speed;
>+				ecmd->duplex = slave->duplex;
>+				break;
>+			}
>+		}
>+		break;

	Does anybody really use broadcast mode?  Not that I'm saying
this is incorrect, I'm just wondering in general.

>+	default:
>+		bond_for_each_slave(bond, slave, i) {
>+			if (slave->speed != SPEED_UNKNOWN) {
>+				speed += slave->speed;
>+			}
>+			if (ecmd->duplex == DUPLEX_UNKNOWN &&
>+			    slave->duplex != DUPLEX_UNKNOWN)
>+				ecmd->duplex = slave->duplex;

	Should the calculations skip slaves that are not BOND_LINK_UP?
If the ARP monitor is running, some slaves may be carrier up (and have
slave->speed set), but are not actually in use by the bond, at least for
transmission.
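
	Something like this in the aggregate loop, as a sketch of the
suggested check:

	bond_for_each_slave(bond, slave, i) {
		/* Skip slaves the bond is not actually using, e.g. ones
		 * the ARP monitor has marked down despite carrier up. */
		if (slave->link != BOND_LINK_UP)
			continue;
		if (slave->speed != SPEED_UNKNOWN)
			speed += slave->speed;
		if (ecmd->duplex == DUPLEX_UNKNOWN &&
		    slave->duplex != DUPLEX_UNKNOWN)
			ecmd->duplex = slave->duplex;
	}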

	-J

>+		}
>+		ecmd->speed = speed;
>+	}
>+	read_unlock(&bond->lock);
>+	return 0;
>+}
>+
> static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
> 				     struct ethtool_drvinfo *drvinfo)
> {
>@@ -4235,6 +4281,7 @@ static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
>
> static const struct ethtool_ops bond_ethtool_ops = {
> 	.get_drvinfo		= bond_ethtool_get_drvinfo,
>+	.get_settings		= bond_ethtool_get_settings,
> 	.get_link		= ethtool_op_get_link,
> };
>
>-- 
>1.7.11.7

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

Andy Gospodarek March 6, 2013, 7:48 p.m. UTC | #3
On Wed, Mar 06, 2013 at 08:13:06PM +0100, Veaceslav Falico wrote:
> On Wed, Mar 6, 2013 at 7:39 PM, Andy Gospodarek <andy@greyhouse.net> wrote:
> > This patch adds support for the get_settings ethtool op to the bonding
> > driver.  This was motivated by users who wanted to get the speed of the
> > bond and compare that against throughput to understand utilization.
> > The behavior before this patch was added was problematic when computing
> > line utilization after trying to get link-speed and throughput via SNMP.
> >
> > The general plan for computing link-speed was as follows:
> >
> > Mode                 Formula
> > ----                 -------
> > active-backup        speed of current active slave
> > broadcast            speed of first slave with known speed
> > all other modes      aggregate speed of all slaves with known speed
> >
> > Output from ethtool looks like this for a round-robin bond:
> >
> > Settings for bond0:
> >         Supported ports: [ ]
> >         Supported link modes:   Not reported
> >         Supported pause frame use: No
> >         Supports auto-negotiation: No
> >         Advertised link modes:  Not reported
> >         Advertised pause frame use: No
> >         Advertised auto-negotiation: No
> >         Speed: 11000Mb/s
> >         Duplex: Full
> >         Port: Twisted Pair
> >         PHYAD: 0
> >         Transceiver: internal
> >         Auto-negotiation: off
> >         MDI-X: Unknown
> >         Link detected: yes
> >
> > I tested this and verified it works as expected.  A test was also done
> > on a version backported to an older kernel and it worked well there.
> >
> > Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
> > ---
> >  drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 47 insertions(+)
> >
> > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> > index 7bd068a..6e70ff0 100644
> > --- a/drivers/net/bonding/bond_main.c
> > +++ b/drivers/net/bonding/bond_main.c
> > @@ -4224,6 +4224,52 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
> >         }
> >  }
> >
> > +static int bond_ethtool_get_settings(struct net_device *bond_dev,
> > +                                    struct ethtool_cmd *ecmd)
> > +{
> > +       struct bonding *bond = netdev_priv(bond_dev);
> > +       struct slave *slave;
> > +       int i;
> > +       unsigned long speed = 0;
> > +
> > +       ecmd->speed = SPEED_UNKNOWN;
> > +       ecmd->duplex = DUPLEX_UNKNOWN;
> > +
> > +       read_lock(&bond->lock);
> > +       switch (bond->params.mode) {
> > +       case BOND_MODE_ACTIVEBACKUP:
> > +               read_lock(&bond->curr_slave_lock);
> > +               if (bond->curr_active_slave &&
> > +                   bond->curr_active_slave->speed != SPEED_UNKNOWN) {
> > +                       ecmd->speed = bond->curr_active_slave->speed;
> > +                       ecmd->duplex = bond->curr_active_slave->duplex;
> > +               }
> > +               read_unlock(&bond->curr_slave_lock);
> > +               break;
> > +       case BOND_MODE_BROADCAST:
> > +               bond_for_each_slave(bond, slave, i) {
> > +                       if (slave->speed != SPEED_UNKNOWN) {
> > +                               ecmd->speed = slave->speed;
> > +                               ecmd->duplex = slave->duplex;
> 
> Maybe it should be 'the smallest speed (and then duplex)' here? In
> broadcast we send all packets
> through all the slaves, so the effective speed/duplex would be the
> speed/duplex of the slowest slave.


My read of the transmit side of bonding is that broadcast mode will
send frames to all drivers via dev_queue_xmit without checking return
codes, so frames on faster interfaces will be sent while those on slower
ones will be dropped in the driver, dropped in hardware, or requeued.

If sending UDP traffic, frames would still go out at the max rate of
each interface.  This means there is a chance that in a mixed bond (a
10Gbps interface with a 1Gbps interface) both could send at 100%
capacity even though one is an order of magnitude faster than the other.

I considered setting to either the max speed or minimum speed, but
decided against it as I really could not decide which was better.  It
would work just fine to use broadcast mode with different speeds, but I
decided that reporting only the speed of the first interface that was up
would provide an additional way for admins to notice there might be a
problem in their configuration.

I'm happy to do either if others would like to see that and we can come
up with a good reason to use max over minimum.

> 
> > +                               break;
> > +                       }
> > +               }
> > +               break;
> > +       default:
> > +               bond_for_each_slave(bond, slave, i) {
> > +                       if (slave->speed != SPEED_UNKNOWN) {
> > +                               speed += slave->speed;
> > +                       }
> > +                       if (ecmd->duplex == DUPLEX_UNKNOWN &&
> > +                           slave->duplex != DUPLEX_UNKNOWN)
> > +                               ecmd->duplex = slave->duplex;
> > +               }
> > +               ecmd->speed = speed;
> > +       }
> > +       read_unlock(&bond->lock);
> > +       return 0;
> > +}
> > +
> >  static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
> >                                      struct ethtool_drvinfo *drvinfo)
> >  {
> > @@ -4235,6 +4281,7 @@ static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
> >
> >  static const struct ethtool_ops bond_ethtool_ops = {
> >         .get_drvinfo            = bond_ethtool_get_drvinfo,
> > +       .get_settings           = bond_ethtool_get_settings,
> >         .get_link               = ethtool_op_get_link,
> >  };
> >
> > --
> > 1.7.11.7
> >
> 
> 
> 
> -- 
> Best regards,
> Veaceslav Falico
Andy Gospodarek March 6, 2013, 8:01 p.m. UTC | #4
On Wed, Mar 06, 2013 at 11:25:12AM -0800, Jay Vosburgh wrote:
> Andy Gospodarek <andy@greyhouse.net> wrote:
> 
> >This patch adds support for the get_settings ethtool op to the bonding
> >driver.  This was motivated by users who wanted to get the speed of the
> >bond and compare that against throughput to understand utilization.
> >The behavior before this patch was added was problematic when computing
> >line utilization after trying to get link-speed and throughput via SNMP.
> >
> >The general plan for computing link-speed was as follows:
> >
> >Mode                 Formula
> >----                 -------
> >active-backup        speed of current active slave
> >broadcast            speed of first slave with known speed
> >all other modes      aggregate speed of all slaves with known speed
> 
> 	I'll just point out that the balance-tlb mode is asymmetric; it
> uses all slaves for transmission, but only one slave for reception.
> Ethtool only has a single speed for both directions, so this is probably
> the best choice, but it should still be noted.

Thanks for pointing that out.  I have a feeling there will be a v2, so
I'll try and update the changelog to reflect that.  For the record, this
same limitation exists when connecting to most switches and using
round-robin, so I didn't feel the need to differentiate possibly
asymmetric speeds.

> >Output from ethtool looks like this for a round-robin bond:
> >
> >Settings for bond0:
> >	Supported ports: [ ]
> >	Supported link modes:   Not reported
> >	Supported pause frame use: No
> >	Supports auto-negotiation: No
> >	Advertised link modes:  Not reported
> >	Advertised pause frame use: No
> >	Advertised auto-negotiation: No
> >	Speed: 11000Mb/s
> >	Duplex: Full
> >	Port: Twisted Pair
> >	PHYAD: 0
> >	Transceiver: internal
> >	Auto-negotiation: off
> >	MDI-X: Unknown
> >	Link detected: yes
> >
> >I tested this and verified it works as expected.  A test was also done
> >on a version backported to an older kernel and it worked well there.
> >
> >Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
> >---
> > drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 47 insertions(+)
> >
> >diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> >index 7bd068a..6e70ff0 100644
> >--- a/drivers/net/bonding/bond_main.c
> >+++ b/drivers/net/bonding/bond_main.c
> >@@ -4224,6 +4224,52 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
> > 	}
> > }
> >
> >+static int bond_ethtool_get_settings(struct net_device *bond_dev,
> >+				     struct ethtool_cmd *ecmd)
> >+{
> >+	struct bonding *bond = netdev_priv(bond_dev);
> >+	struct slave *slave;
> >+	int i;
> >+	unsigned long speed = 0;
> >+
> >+	ecmd->speed = SPEED_UNKNOWN;
> >+	ecmd->duplex = DUPLEX_UNKNOWN;
> >+
> >+	read_lock(&bond->lock);
> >+	switch (bond->params.mode) {
> >+	case BOND_MODE_ACTIVEBACKUP:
> >+		read_lock(&bond->curr_slave_lock);
> >+		if (bond->curr_active_slave &&
> >+		    bond->curr_active_slave->speed != SPEED_UNKNOWN) {
> >+			ecmd->speed = bond->curr_active_slave->speed;
> >+			ecmd->duplex = bond->curr_active_slave->duplex;
> >+		}
> >+		read_unlock(&bond->curr_slave_lock);
> >+		break;
> >+	case BOND_MODE_BROADCAST:
> >+		bond_for_each_slave(bond, slave, i) {
> >+			if (slave->speed != SPEED_UNKNOWN) {
> >+				ecmd->speed = slave->speed;
> >+				ecmd->duplex = slave->duplex;
> >+				break;
> >+			}
> >+		}
> >+		break;
> 
> 	Does anybody really use broadcast mode?  Not that I'm saying
> this is incorrect, I'm just wondering in general.
> 

I don't imagine they do, but I wanted to add something for it since it
would not really fall into the default case well.

> >+	default:
> >+		bond_for_each_slave(bond, slave, i) {
> >+			if (slave->speed != SPEED_UNKNOWN) {
> >+				speed += slave->speed;
> >+			}
> >+			if (ecmd->duplex == DUPLEX_UNKNOWN &&
> >+			    slave->duplex != DUPLEX_UNKNOWN)
> >+				ecmd->duplex = slave->duplex;
> 
> 	Should the calculations skip slaves that are not BOND_LINK_UP?
> If the ARP monitor is running, some slaves may be carrier up (and have
> slave->speed set), but are not actually in use by the bond, at least for
> transmission.
> 

That would be fine with me.  If you would like I can add that for a v2.
It would produce a more honest estimate of what the maximum throughput
would be at that point in time.

> 	-J
> 
> >+		}
> >+		ecmd->speed = speed;
> >+	}
> >+	read_unlock(&bond->lock);
> >+	return 0;
> >+}
> >+
> > static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
> > 				     struct ethtool_drvinfo *drvinfo)
> > {
> >@@ -4235,6 +4281,7 @@ static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
> >
> > static const struct ethtool_ops bond_ethtool_ops = {
> > 	.get_drvinfo		= bond_ethtool_get_drvinfo,
> >+	.get_settings		= bond_ethtool_get_settings,
> > 	.get_link		= ethtool_op_get_link,
> > };
> >
> >-- 
> >1.7.11.7
> 
> ---
> 	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
> 
Ben Hutchings March 6, 2013, 8:17 p.m. UTC | #5
On Wed, 2013-03-06 at 13:39 -0500, Andy Gospodarek wrote:
> This patch adds support for the get_settings ethtool op to the bonding
> driver.  This was motivated by users who wanted to get the speed of the
> bond and compare that against throughput to understand utilization.
> The behavior before this patch was added was problematic when computing
> line utilization after trying to get link-speed and throughput via SNMP.
> 
> The general plan for computing link-speed was as follows:
> 
> Mode                 Formula
> ----                 -------
> active-backup        speed of current active slave
> broadcast            speed of first slave with known speed
> all other modes      aggregate speed of all slaves with known speed
> 
> Output from ethtool looks like this for a round-robin bond:
> 
> Settings for bond0:
> 	Supported ports: [ ]
> 	Supported link modes:   Not reported
> 	Supported pause frame use: No
> 	Supports auto-negotiation: No
> 	Advertised link modes:  Not reported
> 	Advertised pause frame use: No
> 	Advertised auto-negotiation: No
> 	Speed: 11000Mb/s
> 	Duplex: Full
> 	Port: Twisted Pair

I think port should be PORT_OTHER.  Or perhaps you could ask the current
slave, if using active-backup.  (Tricky since you can't do that while
holding the bond rwlock.)

> 	PHYAD: 0

I don't think we have a properly defined 'not applicable' value for
phy_address, though I've used 0xff in sfc.

> 	Transceiver: internal
> 	Auto-negotiation: off
> 	MDI-X: Unknown
> 	Link detected: yes
> 
> I tested this and verified it works as expected.  A test was also done
> on a version backported to an older kernel and it worked well there.
> 
> Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
> ---
>  drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 7bd068a..6e70ff0 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -4224,6 +4224,52 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
>  	}
>  }
>  
> +static int bond_ethtool_get_settings(struct net_device *bond_dev,
> +				     struct ethtool_cmd *ecmd)
> +{
> +	struct bonding *bond = netdev_priv(bond_dev);
> +	struct slave *slave;
> +	int i;
> +	unsigned long speed = 0;
> +
> +	ecmd->speed = SPEED_UNKNOWN;
[...]

You must use ethtool_cmd_speed_set() instead of assigning just
ecmd->speed.

(bonding appears to be the first driver that could report a value >
65535.)

Ben.
Jay Vosburgh March 6, 2013, 9:46 p.m. UTC | #6
Andy Gospodarek <andy@greyhouse.net> wrote:

>On Wed, Mar 06, 2013 at 11:25:12AM -0800, Jay Vosburgh wrote:
>> Andy Gospodarek <andy@greyhouse.net> wrote:
>> 
>> >This patch adds support for the get_settings ethtool op to the bonding
>> >driver.  This was motivated by users who wanted to get the speed of the
>> >bond and compare that against throughput to understand utilization.
>> >The behavior before this patch was added was problematic when computing
>> >line utilization after trying to get link-speed and throughput via SNMP.
>> >
>> >The general plan for computing link-speed was as follows:
>> >
>> >Mode                 Formula
>> >----                 -------
>> >active-backup        speed of current active slave
>> >broadcast            speed of first slave with known speed
>> >all other modes      aggregate speed of all slaves with known speed
>> 
>> 	I'll just point out that the balance-tlb mode is asymmetric; it
>> uses all slaves for transmission, but only one slave for reception.
>> Ethtool only has a single speed for both directions, so this is probably
>> the best choice, but it should still be noted.
>
>Thanks for pointing that out.  I have a feeling there will be a v2, so
>I'll try and update the changelog to reflect that.  For the record, this
>same limitation exists when connecting to most switches and using
>round-robin, so I didn't feel the need to differentiate possibly
>asymmetric speeds.
>
>> >Output from ethtool looks like this for a round-robin bond:
>> >
>> >Settings for bond0:
>> >	Supported ports: [ ]
>> >	Supported link modes:   Not reported
>> >	Supported pause frame use: No
>> >	Supports auto-negotiation: No
>> >	Advertised link modes:  Not reported
>> >	Advertised pause frame use: No
>> >	Advertised auto-negotiation: No
>> >	Speed: 11000Mb/s
>> >	Duplex: Full
>> >	Port: Twisted Pair
>> >	PHYAD: 0
>> >	Transceiver: internal
>> >	Auto-negotiation: off
>> >	MDI-X: Unknown
>> >	Link detected: yes
>> >
>> >I tested this and verified it works as expected.  A test was also done
>> >on a version backported to an older kernel and it worked well there.
>> >
>> >Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
>> >---
>> > drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++++++++++++++++++++
>> > 1 file changed, 47 insertions(+)
>> >
>> >diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>> >index 7bd068a..6e70ff0 100644
>> >--- a/drivers/net/bonding/bond_main.c
>> >+++ b/drivers/net/bonding/bond_main.c
>> >@@ -4224,6 +4224,52 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
>> > 	}
>> > }
>> >
>> >+static int bond_ethtool_get_settings(struct net_device *bond_dev,
>> >+				     struct ethtool_cmd *ecmd)
>> >+{
>> >+	struct bonding *bond = netdev_priv(bond_dev);
>> >+	struct slave *slave;
>> >+	int i;
>> >+	unsigned long speed = 0;
>> >+
>> >+	ecmd->speed = SPEED_UNKNOWN;
>> >+	ecmd->duplex = DUPLEX_UNKNOWN;
>> >+
>> >+	read_lock(&bond->lock);
>> >+	switch (bond->params.mode) {
>> >+	case BOND_MODE_ACTIVEBACKUP:
>> >+		read_lock(&bond->curr_slave_lock);
>> >+		if (bond->curr_active_slave &&
>> >+		    bond->curr_active_slave->speed != SPEED_UNKNOWN) {
>> >+			ecmd->speed = bond->curr_active_slave->speed;
>> >+			ecmd->duplex = bond->curr_active_slave->duplex;
>> >+		}
>> >+		read_unlock(&bond->curr_slave_lock);
>> >+		break;
>> >+	case BOND_MODE_BROADCAST:
>> >+		bond_for_each_slave(bond, slave, i) {
>> >+			if (slave->speed != SPEED_UNKNOWN) {
>> >+				ecmd->speed = slave->speed;
>> >+				ecmd->duplex = slave->duplex;
>> >+				break;
>> >+			}
>> >+		}
>> >+		break;
>> 
>> 	Does anybody really use broadcast mode?  Not that I'm saying
>> this is incorrect, I'm just wondering in general.
>> 
>
>I don't imagine they do, but wanted to add something for it since it
>would not reallyu fall into the default case well.
>
>> >+	default:
>> >+		bond_for_each_slave(bond, slave, i) {
>> >+			if (slave->speed != SPEED_UNKNOWN) {
>> >+				speed += slave->speed;
>> >+			}
>> >+			if (ecmd->duplex == DUPLEX_UNKNOWN &&
>> >+			    slave->duplex != DUPLEX_UNKNOWN)
>> >+				ecmd->duplex = slave->duplex;
>> 
>> 	Should the calculations skip slaves that are not BOND_LINK_UP?
>> If the ARP monitor is running, some slaves may be carrier up (and have
>> slave->speed set), but are not actually in use by the bond, at least for
>> transmission.
>> 
>
>That would be fine with me.  If you would like I can add that for a v2.
>It would produce a more honest estimate of what the maximum throughput
>would be at that point in time.

	Yes, I think so; it's going to be an estimate for any of the
load balance modes, but it ought to be as close as is reasonable to what
kind of throughput would be expected.

	I also think it might be odd if it were possible for a bond to
simultaneously show as carrier down, but speed as something very high.
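
	One way to avoid that, as a sketch: bail out before taking the
lock when the bond itself has no carrier, leaving both values unknown.

	/* Hypothetical guard, not in the posted patch. */
	if (!netif_carrier_ok(bond_dev))
		return 0;	/* ecmd already holds SPEED/DUPLEX_UNKNOWN */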

	-J


>> >+		}
>> >+		ecmd->speed = speed;
>> >+	}
>> >+	read_unlock(&bond->lock);
>> >+	return 0;
>> >+}
>> >+
>> > static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
>> > 				     struct ethtool_drvinfo *drvinfo)
>> > {
>> >@@ -4235,6 +4281,7 @@ static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
>> >
>> > static const struct ethtool_ops bond_ethtool_ops = {
>> > 	.get_drvinfo		= bond_ethtool_get_drvinfo,
>> >+	.get_settings		= bond_ethtool_get_settings,
>> > 	.get_link		= ethtool_op_get_link,
>> > };
>> >
>> >-- 
>> >1.7.11.7
>> 
>> ---
>> 	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
>> 

Glen Turner March 7, 2013, 12:11 p.m. UTC | #7
Hi Andy,

How does this relate to the bandwidth metric for an interface as seen by routing daemons?

An interface's bandwidth as seen by routing daemons cannot change once carrier is up. Otherwise flaps of a link within a bond will cause routing protocol flaps, which they do not do at the moment and which would be deeply, deeply undesirable.

I'd just add the maximum possible bandwidth of the interfaces, rather than the negotiated value. That's enough to get SNMP graphs working reasonably. It's not clear that an SNMP grapher will have reasonable behaviour with a changing bandwidth -- you'll end up with graphs where absolute throughput appears to fall, when what has actually happened is that a link has come online and utilisation has fallen. That's not a useful graph, as it's very much the "shape" of the traffic which is useful.

Checked IOS and JUNOS. In the absence of nerd knobs, both report the sum of the maximum possible interface bandwidths. So that's probably the behaviour SNMP graphing tools expect in any case.

Cheers, glen
Andy Gospodarek March 8, 2013, 2:57 p.m. UTC | #8
On Wed, Mar 06, 2013 at 08:17:17PM +0000, Ben Hutchings wrote:
> On Wed, 2013-03-06 at 13:39 -0500, Andy Gospodarek wrote:
> > This patch adds support for the get_settings ethtool op to the bonding
> > driver.  This was motivated by users who wanted to get the speed of the
> > bond and compare that against throughput to understand utilization.
> > The behavior before this patch was added was problematic when computing
> > line utilization after trying to get link-speed and throughput via SNMP.
> > 
> > The general plan for computing link-speed was as follows:
> > 
> > Mode                 Formula
> > ----                 -------
> > active-backup        speed of current active slave
> > broadcast            speed of first slave with known speed
> > all other modes      aggregate speed of all slaves with known speed
> > 
> > Output from ethtool looks like this for a round-robin bond:
> > 
> > Settings for bond0:
> > 	Supported ports: [ ]
> > 	Supported link modes:   Not reported
> > 	Supported pause frame use: No
> > 	Supports auto-negotiation: No
> > 	Advertised link modes:  Not reported
> > 	Advertised pause frame use: No
> > 	Advertised auto-negotiation: No
> > 	Speed: 11000Mb/s
> > 	Duplex: Full
> > 	Port: Twisted Pair
> 
> I think port should be PORT_OTHER.  Or perhaps you could ask the current
> slave, if using active-backup.  (Tricky since you can't do that while
> holding the bond rwlock.)
> 
> > 	PHYAD: 0
> 
> I don't think we have a properly defined 'not applicable' value for
> phy_address, though I've used 0xff in sfc.
> 
[...]
> 
> You must use ethtool_cmd_speed_set() instead of assigning just
> ecmd->speed.
> 
> (bonding appears to be the first driver that could report a value >
> 65535.)
> 

Thanks for the review, Ben.  I'll take a look at these suggestions for v2.

> 
> -- 
> Ben Hutchings, Staff Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
> 
Andy Gospodarek March 8, 2013, 3:35 p.m. UTC | #9
On Thu, Mar 07, 2013 at 10:41:57PM +1030, Glen Turner wrote:
> Hi Andy,
> 
> How does this relate to the bandwidth metric for an interface as seen by routing daemons?
> 
> An interface bandwidth as seen by routing daemons cannot change once carrier is up. Otherwise flaps of a link within a bond will cause routing protocol flaps; which they do not do at the moment, and which would be deeply, deeply undesirable.
> 

That is a great point.  Are you saying that quagga or other
outside-the-kernel routing daemons actually query ethtool settings to
determine link speed and weight?  How do they handle interfaces like
bonds and vlans that do not return a speed?

> I'd just add the maximum possible bandwidth of the interfaces, rather than the negotiated value. That's enough to get SNMP graphs working reasonably. It's not clear that a SNMP grapher will have reasonable behaviour with a changing bandwidth -- you'll end up with graphs where absolute throughput appears to fall, but what has happened is that a link has come online and utilisation has fallen. That's not a useful graph, as it's very much the "shape" of the traffic which is useful.
> 
> Checked IOS and JUNOS. In the absence of nerd knobs, both report the sum of the maximum possible interface bandwidths. So that's probably the behaviour SNMP graphing tools expect in any case.
> 

Do they look at the maximum capabilities of the interfaces to determine
this maximum speed?  Right now bonding just queries the speed that is
actually connected.  I'm not sure I want to get into the business of
reporting possible maximum bandwidth as that seems invalid -- even when
considering the routing case.

Glen Turner March 14, 2013, 6:57 a.m. UTC | #10
On 09/03/2013, at 2:05 AM, Andy Gospodarek <andy@greyhouse.net> wrote:

> 
> That is a great point.  Are you saying that quagga or other
> outside-the-kernel routing daemons actually query ethtool settings to
> determine link speed and weight?

Wasn't sure. I was hinting that if it did happen it would be serious, as bonding is commonly used in network designs to reduce L3 flapping.

I downloaded Quagga for a look.

Quagga uses the netlink protocol to talk with the kernel, and it doesn't seem to be possible to retrieve an interface's bandwidth that way. That's possibly because bandwidth is a hairy concept (eg, you can argue that it's more an attribute of links than of interfaces). Netlink does return a metric for links, but not a bandwidth.

So Quagga can retrieve each link's metric, that metric is then interpreted by the various routing daemons. But in Linux metrics are usually 1, not based on link speeds like they are with routers.

So Quagga in practice relies on manual setting of bandwidth in zebra.conf:

  interface eth0
    ! 1000Base-T
    bandwidth 1000000

If the interface bandwidth is nonzero then Quagga uses that (adjusted by "auto-cost reference-bandwidth") to set the link metric used by the routing protocol. Otherwise it seems to adjust the Linux metric (I didn't check how or why) and use that.

So, at least in the Quagga instance, ethtool's reported bandwidth isn't used for routing.


The more I think about the look of the graphs the more I think that they are not going to work as you hope. The graphed value shouldn't suddenly halve just because a link has come back online.

I really think you'll be happier reporting the sum of the maximum bandwidths of all the interfaces in the bond, whatever their carrier state or negotiated value. Maybe you could change the bonding driver so that if this results in poor scaling then you can set a bond's speed with "ethtool -s bond0 speed 400" and "ethtool bond0" will report that. After all, it's not like setting a bond's speed could have any physical meaning.
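
A minimal sketch of that idea ('admin_speed' is an assumed new field in
struct bond_params; real code would need validation and locking):

	static int bond_ethtool_set_settings(struct net_device *bond_dev,
					     struct ethtool_cmd *ecmd)
	{
		struct bonding *bond = netdev_priv(bond_dev);

		/* Remember the administratively set speed; get_settings
		 * would then report it in preference to the computed sum. */
		bond->params.admin_speed = ethtool_cmd_speed(ecmd);
		return 0;
	}

wired up as .set_settings = bond_ethtool_set_settings in bond_ethtool_ops.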

Best wishes, glen

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 7bd068a..6e70ff0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4224,6 +4224,52 @@  void bond_set_mode_ops(struct bonding *bond, int mode)
 	}
 }
 
+static int bond_ethtool_get_settings(struct net_device *bond_dev,
+				     struct ethtool_cmd *ecmd)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	int i;
+	unsigned long speed = 0;
+
+	ecmd->speed = SPEED_UNKNOWN;
+	ecmd->duplex = DUPLEX_UNKNOWN;
+
+	read_lock(&bond->lock);
+	switch (bond->params.mode) {
+	case BOND_MODE_ACTIVEBACKUP:
+		read_lock(&bond->curr_slave_lock);
+		if (bond->curr_active_slave &&
+		    bond->curr_active_slave->speed != SPEED_UNKNOWN) {
+			ecmd->speed = bond->curr_active_slave->speed;
+			ecmd->duplex = bond->curr_active_slave->duplex;
+		}
+		read_unlock(&bond->curr_slave_lock);
+		break;
+	case BOND_MODE_BROADCAST:
+		bond_for_each_slave(bond, slave, i) {
+			if (slave->speed != SPEED_UNKNOWN) {
+				ecmd->speed = slave->speed;
+				ecmd->duplex = slave->duplex;
+				break;
+			}
+		}
+		break;
+	default:
+		bond_for_each_slave(bond, slave, i) {
+			if (slave->speed != SPEED_UNKNOWN) {
+				speed += slave->speed;
+			}
+			if (ecmd->duplex == DUPLEX_UNKNOWN &&
+			    slave->duplex != DUPLEX_UNKNOWN)
+				ecmd->duplex = slave->duplex;
+		}
+		ecmd->speed = speed;
+	}
+	read_unlock(&bond->lock);
+	return 0;
+}
+
 static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
 				     struct ethtool_drvinfo *drvinfo)
 {
@@ -4235,6 +4281,7 @@  static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
 
 static const struct ethtool_ops bond_ethtool_ops = {
 	.get_drvinfo		= bond_ethtool_get_drvinfo,
+	.get_settings		= bond_ethtool_get_settings,
 	.get_link		= ethtool_op_get_link,
 };