diff mbox

bonding: If IP route look-up to send an ARP fails, mark in bonding structure as no ARP sent.

Message ID 528D5980.3040309@oracle.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Rama Nichanamatlu Nov. 21, 2013, 12:53 a.m. UTC
During the creation of VLAN's atop bonding the underlying interfaces are 
made part of VLAN's, and at the same bonding driver gets aware that 
VLAN's exists above it and hence would consult IP routing for every ARP 
to  be sent to determine the route which tells bonding driver the 
correct VLAN tag to attach to the outgoing ARP packet. But, during the 
VLAN creation when vlan driver puts the underlying interface into 
default vlan and then actual vlan, in-between this if bonding driver 
consults the IP for a route, IP fails to provide a correct route and 
upon which bonding driver drops the ARP packet. ARP monitor when it
comes around next time, sees no ARP response and fails-over to the next 
available slave. Consulting for a IP route, ip_route_output(),happens in 
bond_arp_send_all().

To prevent this false fail-over, when bonding driver fails to send an 
ARP out it marks in its private structure, bonding{},  not to expect an 
ARP response, when ARP monitor comes around next time ARP sending will 
be tried again.

Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan 
intf. All virtual interfaces created at boot time.

Orabug: 17172660
Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
---
  drivers/net/bonding/bond_main.c | 13 ++++++++-----
  drivers/net/bonding/bonding.h   |  1 +
  2 files changed, 9 insertions(+), 5 deletions(-)

Comments

Ding Tianhong Nov. 21, 2013, 1:10 a.m. UTC | #1
On 2013/11/21 8:53, rama nichanamatlu wrote:
> During the creation of VLAN's atop bonding the underlying interfaces are made part of VLAN's, and at the same bonding driver gets aware that VLAN's exists above it and hence would consult IP routing for every ARP to  be sent to determine the route which tells bonding driver the correct VLAN tag to attach to the outgoing ARP packet. But, during the VLAN creation when vlan driver puts the underlying interface into default vlan and then actual vlan, in-between this if bonding driver consults the IP for a route, IP fails to provide a correct route and upon which bonding driver drops the ARP packet. ARP monitor when it
> comes around next time, sees no ARP response and fails-over to the next available slave. Consulting for a IP route, ip_route_output(),happens in bond_arp_send_all().
> 
> To prevent this false fail-over, when bonding driver fails to send an ARP out it marks in its private structure, bonding{},  not to expect an ARP response, when ARP monitor comes around next time ARP sending will be tried again.
> 
> Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan intf. All virtual interfaces created at boot time.
> 

please reorganize the changelog, it is too log for a line, please make it in 80 chats.

> Orabug: 17172660
> Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
> Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
> ---
>  drivers/net/bonding/bond_main.c | 13 ++++++++-----
>  drivers/net/bonding/bonding.h   |  1 +
>  2 files changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index dde6b4a..d475161 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding *bond, __be32 ip)
>   * switches in VLAN mode (especially if ports are configured as
>   * "native" to a VLAN) might not pass non-tagged frames.
>   */
> -static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short vlan_id)
> +static void bond_arp_send(struct bonding *bond, struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short vlan_id)

no need to add bond, the bond could be get from slave, see slave->bond

Ding
>  {
>      struct sk_buff *skb;
>  @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_
>          }
>      }
>      arp_xmit(skb);
> +    bond->arp_sent=true;
>  }
>   @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
>          pr_debug("basa: target %x\n", targets[i]);
>          if (!bond->vlgrp) {
>              pr_debug("basa: empty vlan: arp_send\n");
> -            bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
> +            bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
>                        bond->master_ip, 0);
>              continue;
>          }
> @@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
>          if (rt->dst.dev == bond->dev) {
>              ip_rt_put(rt);
>              pr_debug("basa: rtdev == bond->dev: arp_send\n");
> -            bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
> +            bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
>                        bond->master_ip, 0);
>              continue;
>          }
> @@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
>           if (vlan_id) {
>              ip_rt_put(rt);
> -            bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
> +            bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
>                        vlan->vlan_ip, vlan_id);
>              continue;
>          }
> @@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct work_struct *work)
>       should_notify_peers = bond_should_notify_peers(bond);
>  -    if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
> +    if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) {
>          read_unlock(&bond->lock);
>          rtnl_lock();
>          read_lock(&bond->lock);
> @@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct work_struct *work)
>          read_lock(&bond->lock);
>      }
>  +    bond->arp_sent=false;
>      bond_ab_arp_probe(bond);
>   re_arm:
> @@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev)
>       bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM);
>      bond_dev->features |= bond_dev->hw_features;
> +    bond->arp_sent=false;
>  }
>   static void bond_work_cancel_all(struct bonding *bond)
> diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
> index e9a3c56..3878bbd 100644
> --- a/drivers/net/bonding/bonding.h
> +++ b/drivers/net/bonding/bonding.h
> @@ -253,6 +253,7 @@ struct bonding {
>      /* debugging suport via debugfs */
>      struct     dentry *debug_dir;
>  #endif /* CONFIG_DEBUG_FS */
> +        bool arp_sent;
>  };
>   #define bond_slave_get_rcu(dev) \


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Vosburgh Nov. 21, 2013, 1:18 a.m. UTC | #2
rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:

>During the creation of VLAN's atop bonding the underlying interfaces are
>made part of VLAN's, and at the same bonding driver gets aware that VLAN's
>exists above it and hence would consult IP routing for every ARP to  be
>sent to determine the route which tells bonding driver the correct VLAN
>tag to attach to the outgoing ARP packet. But, during the VLAN creation
>when vlan driver puts the underlying interface into default vlan and then
>actual vlan, in-between this if bonding driver consults the IP for a
>route, IP fails to provide a correct route and upon which bonding driver
>drops the ARP packet. ARP monitor when it
>comes around next time, sees no ARP response and fails-over to the next
>available slave. Consulting for a IP route, ip_route_output(),happens in
>bond_arp_send_all().
>
>To prevent this false fail-over, when bonding driver fails to send an ARP
>out it marks in its private structure, bonding{},  not to expect an ARP
>response, when ARP monitor comes around next time ARP sending will be
>tried again.
>
>Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan
>intf. All virtual interfaces created at boot time.

	First, this patch appears to be for an older kernel, as the
current mainline code is substantially different (e.g., master_ip is no
longer used).

	Second, won't this methodology mask legitimate failures, such as
when a single arp_ip_target specifies a destination that is not ever
reachable?  I.e., would specifying a permanently unreachable IP address
as the arp_ip_target cause all slaves to always stay up (because no ARPs
will ever be sent), even if no ARP replies are ever received?

	-J

>Orabug: 17172660
>Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
>Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
>---
> drivers/net/bonding/bond_main.c | 13 ++++++++-----
> drivers/net/bonding/bonding.h   |  1 +
> 2 files changed, 9 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c
>b/drivers/net/bonding/bond_main.c
>index dde6b4a..d475161 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding *bond,
>__be32 ip)
>  * switches in VLAN mode (especially if ports are configured as
>  * "native" to a VLAN) might not pass non-tagged frames.
>  */
>-static void bond_arp_send(struct net_device *slave_dev, int arp_op,
>__be32 dest_ip, __be32 src_ip, unsigned short vlan_id)
>+static void bond_arp_send(struct bonding *bond, struct net_device
>*slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short
>vlan_id)
> {
> 	struct sk_buff *skb;
> @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device
>*slave_dev, int arp_op, __be32 dest_
> 		}
> 	}
> 	arp_xmit(skb);
>+	bond->arp_sent=true;
> }
>  @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding
>*bond, struct slave *slave)
> 		pr_debug("basa: target %x\n", targets[i]);
> 		if (!bond->vlgrp) {
> 			pr_debug("basa: empty vlan: arp_send\n");
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
> 				      bond->master_ip, 0);
> 			continue;
> 		}
>@@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding *bond,
>struct slave *slave)
> 		if (rt->dst.dev == bond->dev) {
> 			ip_rt_put(rt);
> 			pr_debug("basa: rtdev == bond->dev: arp_send\n");
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
> 				      bond->master_ip, 0);
> 			continue;
> 		}
>@@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding *bond,
>struct slave *slave)
>  		if (vlan_id) {
> 			ip_rt_put(rt);
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
> 				      vlan->vlan_ip, vlan_id);
> 			continue;
> 		}
>@@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct work_struct
>*work)
>  	should_notify_peers = bond_should_notify_peers(bond);
> -	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
>+	if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) {
> 		read_unlock(&bond->lock);
> 		rtnl_lock();
> 		read_lock(&bond->lock);
>@@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct work_struct
>*work)
> 		read_lock(&bond->lock);
> 	}
> +	bond->arp_sent=false;
> 	bond_ab_arp_probe(bond);
>  re_arm:
>@@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev)
>  	bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM);
> 	bond_dev->features |= bond_dev->hw_features;
>+	bond->arp_sent=false;
> }
>  static void bond_work_cancel_all(struct bonding *bond)
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index e9a3c56..3878bbd 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -253,6 +253,7 @@ struct bonding {
> 	/* debugging suport via debugfs */
> 	struct	 dentry *debug_dir;
> #endif /* CONFIG_DEBUG_FS */
>+        bool arp_sent;
> };
>  #define bond_slave_get_rcu(dev) \
>-- 
>1.8.2.1

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rama Nichanamatlu Nov. 21, 2013, 2:23 a.m. UTC | #3
On 11/20/2013 5:18 PM, Jay Vosburgh wrote:
> rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:
> 
>> During the creation of VLAN's atop bonding the underlying interfaces are
>> made part of VLAN's, and at the same bonding driver gets aware that VLAN's
>> exists above it and hence would consult IP routing for every ARP to  be
>> sent to determine the route which tells bonding driver the correct VLAN
>> tag to attach to the outgoing ARP packet. But, during the VLAN creation
>> when vlan driver puts the underlying interface into default vlan and then
>> actual vlan, in-between this if bonding driver consults the IP for a
>> route, IP fails to provide a correct route and upon which bonding driver
>> drops the ARP packet. ARP monitor when it
>> comes around next time, sees no ARP response and fails-over to the next
>> available slave. Consulting for a IP route, ip_route_output(),happens in
>> bond_arp_send_all().
>>
>> To prevent this false fail-over, when bonding driver fails to send an ARP
>> out it marks in its private structure, bonding{},  not to expect an ARP
>> response, when ARP monitor comes around next time ARP sending will be
>> tried again.
>>
>> Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan
>> intf. All virtual interfaces created at boot time.
> 
> 	First, this patch appears to be for an older kernel, as the
> current mainline code is substantially different (e.g., master_ip is no
> longer used).
> 
> 	Second, won't this methodology mask legitimate failures, such as
> when a single arp_ip_target specifies a destination that is not ever
> reachable?  I.e., would specifying a permanently unreachable IP address
> as the arp_ip_target cause all slaves to always stay up (because no ARPs
> will ever be sent), even if no ARP replies are ever received?
> 
> 	-J
> 
Thank U.
I agree with your rationale. Would keep a slave falsely up but traffic
might flow. And true that, it is not what we are looking for.
We can try a different approach too, which we used to fix a false
fail-over in MTU changing case where the device interface takes time to
change the device MTU. And in the mean time bonding was failing over.
What we did to fix was to stop the ARP monitoring, bond_change_mtu(),
and restart it when NETDEV_CHANGE from slave is handled,
bond_slave_netdev_event(). Not sure if this can used for vlan case, as
mtu thing is event driven.


>> Orabug: 17172660
>> Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
>> Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rama Nichanamatlu Nov. 21, 2013, 6:01 a.m. UTC | #4
Thought will write to you again Jay.

On 11/20/2013 5:18 PM, Jay Vosburgh wrote:
> rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:
> 
>> During the creation of VLAN's atop bonding the underlying interfaces are
>> made part of VLAN's, and at the same bonding driver gets aware that VLAN's
>> exists above it and hence would consult IP routing for every ARP to  be
>> sent to determine the route which tells bonding driver the correct VLAN
>> tag to attach to the outgoing ARP packet. But, during the VLAN creation
>> when vlan driver puts the underlying interface into default vlan and then
>> actual vlan, in-between this if bonding driver consults the IP for a
>> route, IP fails to provide a correct route and upon which bonding driver
>> drops the ARP packet. ARP monitor when it
>> comes around next time, sees no ARP response and fails-over to the next
>> available slave. Consulting for a IP route, ip_route_output(),happens in
>> bond_arp_send_all().
>>
>> To prevent this false fail-over, when bonding driver fails to send an ARP
>> out it marks in its private structure, bonding{},  not to expect an ARP
>> response, when ARP monitor comes around next time ARP sending will be
>> tried again.
>>
>> Extensively tested in a VM environment; sr-iov intf->bonding intf->vlan
>> intf. All virtual interfaces created at boot time.
> 
> 	First, this patch appears to be for an older kernel, as the
> current mainline code is substantially different (e.g., master_ip is no
> longer used).
> 
> 	Second, won't this methodology mask legitimate failures, such as
> when a single arp_ip_target specifies a destination that is not ever
> reachable?  I.e., would specifying a permanently unreachable IP address
> as the arp_ip_target cause all slaves to always stay up (because no ARPs
> will ever be sent), even if no ARP replies are ever received?
> 
> 
An unreachable arp_target, a.b.c.d in a way that ip_route_output() fails
 as it cant even know which local interface to broadcast arp out.
Is that what you mean? Either with this change or not, bonding interface
is dead.Right? Without this change, it flip-flops (cycling) between
slaves and with this change stays on one slave.

>	-J
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Veaceslav Falico Nov. 21, 2013, 11:10 a.m. UTC | #5
On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote:
>During the creation of VLAN's atop bonding the underlying interfaces 
>are made part of VLAN's, and at the same bonding driver gets aware 
>that VLAN's exists above it and hence would consult IP routing for 
>every ARP to  be sent to determine the route which tells bonding 
>driver the correct VLAN tag to attach to the outgoing ARP packet. 
>But, during the VLAN creation when vlan driver puts the underlying 
>interface into default vlan and then actual vlan, in-between this if 
>bonding driver consults the IP for a route, IP fails to provide a 
>correct route and upon which bonding driver drops the ARP packet. ARP 
>monitor when it
>comes around next time, sees no ARP response and fails-over to the 
>next available slave. Consulting for a IP route, 
>ip_route_output(),happens in bond_arp_send_all().

bonding works as expected - nothing to fix here. And even as a
workaround/hack - I'm not sure we need that to suppress one failover *only*
when vlan is added on top.

>
>To prevent this false fail-over, when bonding driver fails to send an 
>ARP out it marks in its private structure, bonding{},  not to expect 
>an ARP response, when ARP monitor comes around next time ARP sending 
>will be tried again.
>
>Extensively tested in a VM environment; sr-iov intf->bonding 
>intf->vlan intf. All virtual interfaces created at boot time.
>
>Orabug: 17172660
>Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
>Signed-off-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
>---
> drivers/net/bonding/bond_main.c | 13 ++++++++-----
> drivers/net/bonding/bonding.h   |  1 +
> 2 files changed, 9 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c 
>b/drivers/net/bonding/bond_main.c
>index dde6b4a..d475161 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2661,7 +2661,7 @@ static int bond_has_this_ip(struct bonding 
>*bond, __be32 ip)
>  * switches in VLAN mode (especially if ports are configured as
>  * "native" to a VLAN) might not pass non-tagged frames.
>  */
>-static void bond_arp_send(struct net_device *slave_dev, int arp_op, 
>__be32 dest_ip, __be32 src_ip, unsigned short vlan_id)
>+static void bond_arp_send(struct bonding *bond, struct net_device 
>*slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short 
>vlan_id)
> {
> 	struct sk_buff *skb;
> @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device 
>*slave_dev, int arp_op, __be32 dest_
> 		}
> 	}
> 	arp_xmit(skb);
>+	bond->arp_sent=true;
> }
>  @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding 
>*bond, struct slave *slave)
> 		pr_debug("basa: target %x\n", targets[i]);
> 		if (!bond->vlgrp) {
> 			pr_debug("basa: empty vlan: arp_send\n");
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
> 				      bond->master_ip, 0);
> 			continue;
> 		}
>@@ -2726,7 +2727,7 @@ static void bond_arp_send_all(struct bonding 
>*bond, struct slave *slave)
> 		if (rt->dst.dev == bond->dev) {
> 			ip_rt_put(rt);
> 			pr_debug("basa: rtdev == bond->dev: arp_send\n");
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
> 				      bond->master_ip, 0);
> 			continue;
> 		}
>@@ -2744,7 +2745,7 @@ static void bond_arp_send_all(struct bonding 
>*bond, struct slave *slave)
>  		if (vlan_id) {
> 			ip_rt_put(rt);
>-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
>+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
> 				      vlan->vlan_ip, vlan_id);
> 			continue;
> 		}
>@@ -3206,7 +3207,7 @@ void bond_activebackup_arp_mon(struct 
>work_struct *work)
>  	should_notify_peers = bond_should_notify_peers(bond);
> -	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
>+	if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) {
> 		read_unlock(&bond->lock);
> 		rtnl_lock();
> 		read_lock(&bond->lock);
>@@ -3218,6 +3219,7 @@ void bond_activebackup_arp_mon(struct 
>work_struct *work)
> 		read_lock(&bond->lock);
> 	}
> +	bond->arp_sent=false;
> 	bond_ab_arp_probe(bond);
>  re_arm:
>@@ -4425,6 +4427,7 @@ static void bond_setup(struct net_device *bond_dev)
>  	bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM);
> 	bond_dev->features |= bond_dev->hw_features;
>+	bond->arp_sent=false;
> }
>  static void bond_work_cancel_all(struct bonding *bond)
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index e9a3c56..3878bbd 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -253,6 +253,7 @@ struct bonding {
> 	/* debugging suport via debugfs */
> 	struct	 dentry *debug_dir;
> #endif /* CONFIG_DEBUG_FS */
>+        bool arp_sent;
> };
>  #define bond_slave_get_rcu(dev) \
>-- 
>1.8.2.1
>


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rama Nichanamatlu Nov. 21, 2013, 8:34 p.m. UTC | #6
On 11/21/2013 3:10 AM, Veaceslav Falico wrote:
> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote:
>> During the creation of VLAN's atop bonding the underlying interfaces
>> are made part of VLAN's, and at the same bonding driver gets aware
>> that VLAN's exists above it and hence would consult IP routing for
>> every ARP to  be sent to determine the route which tells bonding
>> driver the correct VLAN tag to attach to the outgoing ARP packet. But,
>> during the VLAN creation when vlan driver puts the underlying
>> interface into default vlan and then actual vlan, in-between this if
>> bonding driver consults the IP for a route, IP fails to provide a
>> correct route and upon which bonding driver drops the ARP packet. ARP
>> monitor when it
>> comes around next time, sees no ARP response and fails-over to the
>> next available slave. Consulting for a IP route,
>> ip_route_output(),happens in bond_arp_send_all().
> 
> bonding works as expected - nothing to fix here. And even as a
> workaround/hack - I'm not sure we need that to suppress one failover *only*
> when vlan is added on top.
> 
>>
Thank U.
With *out* this change our systems failed system testing, to
consistently be on designated primary interface on *every* single
reboot. With this change the behavior was as expected even after a few
thousand reboots & System testing could move to next level catching an
another bug in sr-iov :). And Without, the outcome was less predictable
after a reboot and bonding was on a different slave each time.
-Rama
>> To prevent this false fail-over, when bonding driver fails to send an
>> ARP out it marks in its private structure, bonding{},  not to expect
>> an ARP response, when ARP monitor comes around next time ARP sending
>> will be tried again.
>>
>> Extensively tested in a VM environment; sr-iov intf->bonding
>> intf->vlan intf. All virtual interfaces created at boot time.
>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Vosburgh Nov. 21, 2013, 9:12 p.m. UTC | #7
rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:

>On 11/21/2013 3:10 AM, Veaceslav Falico wrote:
>> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote:
>>> During the creation of VLAN's atop bonding the underlying interfaces
>>> are made part of VLAN's, and at the same bonding driver gets aware
>>> that VLAN's exists above it and hence would consult IP routing for
>>> every ARP to  be sent to determine the route which tells bonding
>>> driver the correct VLAN tag to attach to the outgoing ARP packet. But,
>>> during the VLAN creation when vlan driver puts the underlying
>>> interface into default vlan and then actual vlan, in-between this if
>>> bonding driver consults the IP for a route, IP fails to provide a
>>> correct route and upon which bonding driver drops the ARP packet. ARP
>>> monitor when it
>>> comes around next time, sees no ARP response and fails-over to the
>>> next available slave. Consulting for a IP route,
>>> ip_route_output(),happens in bond_arp_send_all().
>> 
>> bonding works as expected - nothing to fix here. And even as a
>> workaround/hack - I'm not sure we need that to suppress one failover *only*
>> when vlan is added on top.
>> 
>>>
>Thank U.
>With *out* this change our systems failed system testing, to
>consistently be on designated primary interface on *every* single
>reboot. With this change the behavior was as expected even after a few
>thousand reboots & System testing could move to next level catching an
>another bug in sr-iov :). And Without, the outcome was less predictable
>after a reboot and bonding was on a different slave each time.
>-Rama

	By "designated primary" you mean the bonding primary option,
correct?  If not, does setting primary resolve the problem?  If so,
you're saying that during the bringup, bonding would end up with a
non-primary slave as the active slave?  Or that there would be a
failover / failback cycle during the bringup due to the lack of VLAN
availability?

	There is already a mechanism in bond_ab_arp_inspect() to give
new slaves a grace period before applying link failures:

                /*
                 * Give slaves 2*delta after being enslaved or made
                 * active.  This avoids bouncing, as the last receive
                 * times need a full ARP monitor cycle to be updated.
                 */
                if (bond_time_in_interval(bond, slave->jiffies, 2))
                        continue;

	If you extend that grace period (the "2", which is in units of
the arp_interval), does the problem resolve itself, or is the time
window here longer than that?

	How is the configuration of bonding and the VLANs taking place?

	I don't think this patch is suitable (because it can mask
legitimate failures), but I'm not entirely sure I understand the details
of the problem.  Is this simply that the arp_ip_target is specified as a
VLAN destination signficantly before (meaning perhaps many seconds of
real time) the VLAN is configured above bonding, or is it some kind of
race condition in the VLAN code?

	-J

>>> To prevent this false fail-over, when bonding driver fails to send an
>>> ARP out it marks in its private structure, bonding{},  not to expect
>>> an ARP response, when ARP monitor comes around next time ARP sending
>>> will be tried again.
>>>
>>> Extensively tested in a VM environment; sr-iov intf->bonding
>>> intf->vlan intf. All virtual interfaces created at boot time.

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rama Nichanamatlu Nov. 22, 2013, 12:34 a.m. UTC | #8
On 11/21/2013 1:12 PM, Jay Vosburgh wrote:
> rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:
> 
>> On 11/21/2013 3:10 AM, Veaceslav Falico wrote:
>>> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote:
>>>> During the creation of VLAN's atop bonding the underlying interfaces
>>>> are made part of VLAN's, and at the same bonding driver gets aware
>>>> that VLAN's exists above it and hence would consult IP routing for
>>>> every ARP to  be sent to determine the route which tells bonding
>>>> driver the correct VLAN tag to attach to the outgoing ARP packet. But,
>>>> during the VLAN creation when vlan driver puts the underlying
>>>> interface into default vlan and then actual vlan, in-between this if
>>>> bonding driver consults the IP for a route, IP fails to provide a
>>>> correct route and upon which bonding driver drops the ARP packet. ARP
>>>> monitor when it
>>>> comes around next time, sees no ARP response and fails-over to the
>>>> next available slave. Consulting for a IP route,
>>>> ip_route_output(),happens in bond_arp_send_all().
>>>
>>> bonding works as expected - nothing to fix here. And even as a
>>> workaround/hack - I'm not sure we need that to suppress one failover *only*
>>> when vlan is added on top.
>>>
>>>>
>> Thank U.
>> With *out* this change our systems failed system testing, to
>> consistently be on designated primary interface on *every* single
>> reboot. With this change the behavior was as expected even after a few
>> thousand reboots & System testing could move to next level catching an
>> another bug in sr-iov :). And Without, the outcome was less predictable
>> after a reboot and bonding was on a different slave each time.
>> -Rama
> 
> 	By "designated primary" you mean the bonding primary option,
> correct?  
Yes correct. Bonding primary param is set.
ex: primary=eth1 and primary_reselect=2.
Hence it is expected to be on primary on every reboot.
-Rama
>If not, 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Vosburgh Nov. 22, 2013, 2:43 a.m. UTC | #9
rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:

>On 11/21/2013 1:12 PM, Jay Vosburgh wrote:
>> rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:
>> 
>>> On 11/21/2013 3:10 AM, Veaceslav Falico wrote:
>>>> On Wed, Nov 20, 2013 at 04:53:20PM -0800, rama nichanamatlu wrote:
>>>>> During the creation of VLAN's atop bonding the underlying interfaces
>>>>> are made part of VLAN's, and at the same bonding driver gets aware
>>>>> that VLAN's exists above it and hence would consult IP routing for
>>>>> every ARP to  be sent to determine the route which tells bonding
>>>>> driver the correct VLAN tag to attach to the outgoing ARP packet. But,
>>>>> during the VLAN creation when vlan driver puts the underlying
>>>>> interface into default vlan and then actual vlan, in-between this if
>>>>> bonding driver consults the IP for a route, IP fails to provide a
>>>>> correct route and upon which bonding driver drops the ARP packet. ARP
>>>>> monitor when it
>>>>> comes around next time, sees no ARP response and fails-over to the
>>>>> next available slave. Consulting for a IP route,
>>>>> ip_route_output(),happens in bond_arp_send_all().
>>>>
>>>> bonding works as expected - nothing to fix here. And even as a
>>>> workaround/hack - I'm not sure we need that to suppress one failover *only*
>>>> when vlan is added on top.
>>>>
>>>>>
>>> Thank U.
>>> With *out* this change our systems failed system testing, to
>>> consistently be on designated primary interface on *every* single
>>> reboot. With this change the behavior was as expected even after a few
>>> thousand reboots & System testing could move to next level catching an
>>> another bug in sr-iov :). And Without, the outcome was less predictable
>>> after a reboot and bonding was on a different slave each time.
>>> -Rama
>> 
>> 	By "designated primary" you mean the bonding primary option,
>> correct?  
>Yes correct. Bonding primary param is set.
>ex: primary=eth1 and primary_reselect=2.
>Hence it is expected to be on primary on every reboot.

	If I set up a basic bonding configuration like:

[ eth3, eth4 ] -> bond0 -> bond0.66, with primary=eth3 primary_reselect=2

	Then look at dmesg, I see this sequence:

	The bond is set up first, with an arp_ip_target on a VLAN
destination.  The slaves are added to the bond.

	The VLAN interface is configured above the bond, and brought up.

	The slaves become link up after autonegotiation, the ARP monitor
commences, and eth3 is made the active slave.  Even if eth4 is set by
the bond to be "link status up," eth3 becomes the active slave when it
becomes "link status up."

	What network device are you using for the slaves?  Are they
virtualized devices of some kind?  My suspicion is that Ethernet
autonegotiation either does not take place or occurs so quickly that the
slaves are carrier up before the VLAN is even added.

	Can you check your dmesg output for the sequence of events?  In
my test, I do not see the slaves go "NIC Link is Up 1000 Mbps Full
Duplex" until about 3 seconds after the VLAN interface has been
configured.


	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rama Nichanamatlu Nov. 22, 2013, 8:28 a.m. UTC | #10
On 11/21/2013 6:43 PM, Jay Vosburgh wrote:
> rama nichanamatlu <rama.nichanamatlu@oracle.com> wrote:

>> Yes correct. Bonding primary param is set.
>> ex: primary=eth1 and primary_reselect=2.
>> Hence it is expected to be on primary on every reboot.
> 
> 	If I set up a basic bonding configuration like:
> 
Extremely thankful to U for investigating it to this extent.
Need to admit our test requirement here. Reboot test is done 300 times
and requirement is all 300 should see bond intf on primary upon boot.
What is observed is, reboot test never could achieve this until we put
this change.

> [ eth3, eth4 ] -> bond0 -> bond0.66, with primary=eth3 primary_reselect=2
> 
Our config is complex. Here (please ask if not clear):
First niC card  Intel 82599 sr-iov PF0 -> eth0 [dom0]
                                   VF0 -> eth1 [domU] ->
                                                        bond0 -> bond0.1
                                   VF0 -> eth2 [domU] ->
Second niC card Intel 82599 sr-iov PF0 -> eth1 [dom0]

PF = Physical Func
VF = Virtual  Func [are bond slaves]
XEN based OVM [Oracle]

2 arp targets configured.
A lan switch connected to *each* PF hosts a vlan arp target.

AFAIK sr-iov code (can get clarification from Donald/Keller sr-iov
owning team, as we did temp fix of another issue in intel sr-iov driver,
so we are connected) addition of vlan's to sr-iov functions is *as*
simple adding an entry into a table. That is they don't go thru a
re-init like an mtu change. Meaning the intf stays UP during that time.
*But* takes a little while as this table addition is done by PF driver
in dom0 upon request by domU VF driver as there is MBox communication.

> 	Then look at dmesg, I see this sequence:
> 
> 	The bond is set up first, with an arp_ip_target on a VLAN
> destination.  The slaves are added to the bond.
> 
> 	The VLAN interface is configured above the bond, and brought up.
> 
> 	The slaves become link up after autonegotiation, the ARP monitor
> commences, and eth3 is made the active slave.  Even if eth4 is set by
> the bond to be "link status up," eth3 becomes the active slave when it
> becomes "link status up."
> 
> 	What network device are you using for the slaves?  Are they
> virtualized devices of some kind?  My suspicion is that Ethernet
> autonegotiation either does not take place or occurs so quickly that the
> slaves are carrier up before the VLAN is even added.
> 
That is so correct of a guess.
We never saw a link loss message, neither nic driver nor bonding
reporting during vlan addition. Because as carrier is up bond has a
curr_active_slave hence tries to arp but fails as ip_route_output() fails.

> 	Can you check your dmesg output for the sequence of events?  In
> my test, I do not see the slaves go "NIC Link is Up 1000 Mbps Full
> Duplex" until about 3 seconds after the VLAN interface has been
> configured.
> 
so it means bonding driver would not have had a curr_active_slave for
almost 3 secs as none slave would have qualified to become so, hence
bonding never even would attempted to send an arp out.

What we observed is "no route to arp_ip_target" message.
We see 10 of them (5 pairs, as there 2 arp targets and arp interval is
200 msecs) lasting for a 1 sec, then no more error messages. With out
this fix, we would see that bonding failover messages in between those
messages. Eventually when arp'ing is working bonding would be on some
interface, need not be primary.  With this fix, we would not see those
bonding failover messages, and when arp'ing is working bond is on primary.

Requirement of this product being built is to achieve an extremely quick
failover, hence 200 msecs arp interval.

> 
> 	-J
> 
> ---
> 	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/bonding/bond_main.c 
b/drivers/net/bonding/bond_main.c
index dde6b4a..d475161 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2661,7 +2661,7 @@  static int bond_has_this_ip(struct bonding *bond, 
__be32 ip)
   * switches in VLAN mode (especially if ports are configured as
   * "native" to a VLAN) might not pass non-tagged frames.
   */
-static void bond_arp_send(struct net_device *slave_dev, int arp_op, 
__be32 dest_ip, __be32 src_ip, unsigned short vlan_id)
+static void bond_arp_send(struct bonding *bond, struct net_device 
*slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, unsigned short 
vlan_id)
  {
  	struct sk_buff *skb;
  @@ -2683,6 +2683,7 @@ static void bond_arp_send(struct net_device 
*slave_dev, int arp_op, __be32 dest_
  		}
  	}
  	arp_xmit(skb);
+	bond->arp_sent=true;
  }
   @@ -2700,7 +2701,7 @@ static void bond_arp_send_all(struct bonding 
*bond, struct slave *slave)
  		pr_debug("basa: target %x\n", targets[i]);
  		if (!bond->vlgrp) {
  			pr_debug("basa: empty vlan: arp_send\n");
-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
  				      bond->master_ip, 0);
  			continue;
  		}
@@ -2726,7 +2727,7 @@  static void bond_arp_send_all(struct bonding 
*bond, struct slave *slave)
  		if (rt->dst.dev == bond->dev) {
  			ip_rt_put(rt);
  			pr_debug("basa: rtdev == bond->dev: arp_send\n");
-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
  				      bond->master_ip, 0);
  			continue;
  		}
@@ -2744,7 +2745,7 @@  static void bond_arp_send_all(struct bonding 
*bond, struct slave *slave)
   		if (vlan_id) {
  			ip_rt_put(rt);
-			bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i],
+			bond_arp_send(bond, slave->dev, ARPOP_REQUEST, targets[i],
  				      vlan->vlan_ip, vlan_id);
  			continue;
  		}
@@ -3206,7 +3207,7 @@  void bond_activebackup_arp_mon(struct work_struct 
*work)
   	should_notify_peers = bond_should_notify_peers(bond);
  -	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
+	if (bond->arp_sent && bond_ab_arp_inspect(bond, delta_in_ticks)) {
  		read_unlock(&bond->lock);
  		rtnl_lock();
  		read_lock(&bond->lock);
@@ -3218,6 +3219,7 @@  void bond_activebackup_arp_mon(struct work_struct 
*work)
  		read_lock(&bond->lock);
  	}
  +	bond->arp_sent=false;
  	bond_ab_arp_probe(bond);
   re_arm:
@@ -4425,6 +4427,7 @@  static void bond_setup(struct net_device *bond_dev)
   	bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM);
  	bond_dev->features |= bond_dev->hw_features;
+	bond->arp_sent=false;
  }
   static void bond_work_cancel_all(struct bonding *bond)
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index e9a3c56..3878bbd 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -253,6 +253,7 @@  struct bonding {
  	/* debugging suport via debugfs */
  	struct	 dentry *debug_dir;
  #endif /* CONFIG_DEBUG_FS */
+        bool arp_sent;
  };
   #define bond_slave_get_rcu(dev) \
-- 
1.8.2.1