diff mbox

Experimental new bonding driver mode=batman

Message ID 555AD3A7.7050202@netscape.net
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Patrick Simmons May 19, 2015, 6:09 a.m. UTC
Hello,

I've written a new mode for the kernel bonding driver.  It's similar to
the round-robin mode, but it keeps statistics on TCP resends so as to
favor slave devices with more bandwidth when choosing where to send
packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
to work okay, but it should be considered experimental.

Still, it should be safe to apply this patch as it doesn't do anything
unless you specify "mode=batman" on the driver command line.

Signed-off-by: Patrick Simmons <linuxrocks123@netscape.net>

Comments

Michal Kubecek May 19, 2015, 7:49 a.m. UTC | #1
On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote:
> 
> I've written a new mode for the kernel bonding driver.  It's similar to
> the round-robin mode, but it keeps statistics on TCP resends so as to
> favor slave devices with more bandwidth when choosing where to send
> packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
> to work okay, but it should be considered experimental.

A description of how the mode is supposed to work would definitely be
helpful.

Below are some comments from a quick look over the patch, I didn't think
too much about the logic of the operation.

> +struct batnode
> +{
> +     struct batnode* next_batnode;
> +     char slvname[IFNAMSIZ];
> +     unsigned long jiffiestamp;
> +     u32 tcp_seq_num;
> +     u16 port_id;
> +};
> +
> +struct slave_congestion_node
> +{
> +     struct slave_congestion_node* next_slave;
> +     char this_slave[IFNAMSIZ];
> +     u32 congestion_counter;
> +};

Referencing the slave by its name doesn't seem very efficient,
especially if you are going to do it in the xmit handler.

> +#define BATTABLE_SIZE 256
> +static struct batnode battable[BATTABLE_SIZE];
> +static struct slave_congestion_node* slavelist_head_ptr = NULL;

Why one global list rather than one list per bond? And why do you need
your own list in addition to already existing slave structures?

> +static struct slave_congestion_node* get_slave_congestion_node(char* slvname)
> +{
> +     //printk("In get_slave_congestion_node: 0x%zx\n",the_slave);
> +     struct slave_congestion_node* cur_slave = slavelist_head_ptr;
> +     while(cur_slave)
> +     {
> +          if(strcmp(cur_slave->this_slave,slvname)==0)
> +          {
> +               //printk("Found a match: 0x%zx\n",cur_slave);
> +               return cur_slave;
> +          }
> +
> +          if(!cur_slave->next_slave)
> +               break;
> +          cur_slave = cur_slave->next_slave;
> +     }     
> +
> +     //Create new slave node
> +     if(!cur_slave) //special case head
> +     {
> +          cur_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC));
> +          slavelist_head_ptr = cur_slave;
> +     }
> +     else
> +     {
> +          cur_slave->next_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC));
> +          cur_slave = cur_slave->next_slave;
> +     }

You don't handle kmalloc() failure.

> +
> +     //Initialize new slave node
> +     cur_slave->next_slave = NULL;
> +     strlcpy(cur_slave->this_slave,slvname,IFNAMSIZ);
> +     cur_slave->congestion_counter = 1;
> +     
> +     //printk("Created new congestion node: 0x%zx\n",cur_slave);
> +     return cur_slave;
> +}
> +
> +static void slave_congestion_increment(char* the_slave)
> +{
> +     get_slave_congestion_node(the_slave)->congestion_counter++;
> +}
> +
> +static void slave_congestion_decrement(char* the_slave)
> +{
> +     get_slave_congestion_node(the_slave)->congestion_counter--;
> +}

Should these three really be in a header file?

> +static DEFINE_SPINLOCK(batlock);

A lot of effort has been done recently to get rid of per-bond spinlocks.
Adding a global one kind of goes against this work.

> @@ -1082,6 +1088,69 @@ static bool bond_should_deliver_exact_ma
>  	return false;
>  }
>  
> +static void batman_normalize_congestion(struct bonding* bond, int* congestion)
> +{
> +     static long unsigned int jiffiestamp = 0;
> +     long unsigned int jiffie_temp = jiffies;
> +     struct slave* slave;
> +     struct list_head* i;
> +     if(jiffie_temp!=jiffiestamp && jiffie_temp%1000==0) //every thousand packets, normalize congestion

Looks more like every 1000 jiffies, not packets. And what if this
function doesn't get called in that one jiffy when jiffies is a multiple
of 1000? (Similar problem later in bond_xmit_batman().)

> +          printk(KERN_DEBUG "batman: congestion normalization has occurred.\n");

Use pr_debug() or netdev_dbg().

> +static int batman_timer_invoke(void* bond_)
> +{
> +     struct bonding* bond = (struct bonding*)(bond_);
> +     struct slave* slave;
> +     struct list_head* i;
> +
> +     while(!kthread_should_stop())
> +     {
> +          unsigned long batflags;
> +          long unsigned int jiffiestamp = jiffies;
> +          
> +          msleep(10*1000);
> +          rcu_read_lock();
> +          spin_lock_irqsave(&batlock,batflags);
> +          
> +          long unsigned int jiffie_temp = jiffies;
> +          if(!(jiffie_temp - jiffiestamp < 5 * HZ))
> +               bond_for_each_slave_rcu(bond,slave,i)
> +               {
> +                    struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
> +                    if(congest_current->congestion_counter >= 2)
> +                         congest_current->congestion_counter/=2;
> +               }
> +
> +          spin_unlock_irqrestore(&batlock,batflags);
> +          rcu_read_unlock();
> +     }
> +}

This seems to only run something periodically, why do you need a (per
bond) kthread instead of a simple timer?

> @@ -1100,7 +1169,67 @@ static rx_handler_result_t bond_handle_f
>  	slave = bond_slave_get_rcu(skb->dev);
>  	bond = slave->bond;
>  
> -	recv_probe = ACCESS_ONCE(bond->recv_probe);
> +    //This seems as good a place as any to put this.
> +    if(skb->protocol == htons(ETH_P_IP))
> +    {
...
> +    }

I would suggest to handle TCP over IPv6 as well.

> @@ -4068,6 +4419,12 @@ void bond_setup(struct net_device *bond_
>  	bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM);
>  	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>  	bond_dev->features |= bond_dev->hw_features;
> +
> +    if(bond_mode == BOND_MODE_BATMAN)
> +    {
> +         printk(KERN_INFO "%s: I'm Batman.\n",bond_dev->name);
> +         bond->battimer_task = kthread_run(batman_timer_invoke,bond,"%s_batman",bond_dev->name);
> +    }    
>  }
>  
>  /* Destroy a bonding device.

You only start the kthread in bond_setup() and stop it in
bond_destructor(). This doesn't handle the situation when the mode is
changed after the bond is created.

One general note: the patch breaks kernel coding style in a lot of
points. Running checkpatch.pl on it, I got

  total: 562 errors, 466 warnings, 5 checks, 585 lines checked

The script isn't perfect but this is too much.

                                                       Michal Kubecek

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jonathan Toppins May 19, 2015, 7:58 a.m. UTC | #2
On 5/19/15 2:09 AM, Patrick Simmons wrote:
> Hello,
>
> I've written a new mode for the kernel bonding driver.  It's similar to
> the round-robin mode, but it keeps statistics on TCP resends so as to
> favor slave devices with more bandwidth when choosing where to send
> packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
> to work okay, but it should be considered experimental.
>
> Still, it should be safe to apply this patch as it doesn't do anything
> unless you specify "mode=batman" on the driver command line.
>
> Signed-off-by: Patrick Simmons <linuxrocks123@netscape.net>
>

It is usually a good idea to add the bonding maintainers directly, have 
added them. See <kernel>/MAINTAINERS for list.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Patrick Simmons May 19, 2015, 8:29 a.m. UTC | #3
On 05/19/2015 03:49 AM, Michal Kubecek wrote:
> On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote:
>>
>> I've written a new mode for the kernel bonding driver.  It's similar to
>> the round-robin mode, but it keeps statistics on TCP resends so as to
>> favor slave devices with more bandwidth when choosing where to send
>> packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
>> to work okay, but it should be considered experimental.
>
> A description of how is the mode supposed to work would be definitely
> helpful.
>

Rationale: It's helpful for cases where the slave devices have 
significantly different or varying bandwidth.  The reason I wrote it is 
to bond powerline networking and wireless networking adapters into a 
single interface for use with connecting to a MythTV server.  Neither of 
these systems is particularly reliable with bandwidth, but mode=batman 
can adaptively figure out which network has more available bandwidth at 
any given moment.  This is better than mode=round-robin which always 
balances everything 50/50.

Regarding your analysis, I appreciate your comments, and I know it's 
rough, but I'm sorry to say I'm not really interested in doing much to 
improve its polish past where it is.  If it fails some way when I try to 
deploy it, then I'll fix that, and maybe I'll play around with the 
balancing heuristics, but the code quality is what it is unless someone 
else wants to improve it.  I would fix the indentation if that would 
make it acceptable for you to merge it, but not much more.  My argument 
for merging it is basically "it doesn't do anything unless you pass 
mode=batman, so what's the harm?".

So, if you guys decide you don't want to merge it because of the global 
spinlock etc., that's cool and I understand, but I thought I should at 
least post to this list so you and any other potentially interested 
people know it exists.  Oh, and, if you're not going to merge it, please 
let me know so I can post the patch to GitHub or somewhere.  And, 
if you could include a note in the comments at the top of bond_main.c or 
somewhere pointing people to the patch, I'd very much appreciate that. 
I don't want anyone else to have to endure hours of kernel rebuilds with 
KASAN enabled if they want this functionality :)

Cheers and have a good one!

--Patrick
Michal Kubecek May 20, 2015, 7:32 a.m. UTC | #4
On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote:
> On 05/19/2015 03:49 AM, Michal Kubecek wrote:
> >On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote:
> >>
> >>I've written a new mode for the kernel bonding driver.  It's similar to
> >>the round-robin mode, but it keeps statistics on TCP resends so as to
> >>favor slave devices with more bandwidth when choosing where to send
> >>packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
> >>to work okay, but it should be considered experimental.
> >
> >A description of how is the mode supposed to work would be definitely
> >helpful.
> >
> 
> Rationale: It's helpful for cases where the slave devices have
> significantly different or varying bandwidth.  The reason I wrote it
> is to bond powerline networking and wireless networking adapters
> into a single interface for use with connecting to a MythTV server.
> Neither of these systems is particularly reliable with bandwidth,
> but mode=batman can adaptively figure out which network has more
> available bandwidth at any given moment.  This is better than
> mode=round-robin which always balances everything 50/50.

Thank you. But I rather meant some basic description of the algorithm
used to achieve this goal. Both should be IMHO part of the commit
message.

> Regarding your analysis, I appreciate your comments, and I know it's
> rough, but I'm sorry to say I'm not really interested in doing much
> to improve its polish past where it is.  If it fails some way when I
> try to deploy it, then I'll fix that, and maybe I'll play around
> with the balancing heuristics, but the code quality is what it is
> unless someone else wants to improve it.  I would fix the
> indentation if that would make it acceptable for you to merge it,
> but not much more.  My argument for merging it is basically "it
> doesn't do anything unless you pass mode=batman, so what's the
> harm?".
> 
> So, if you guys decide you don't want to merge it because of the
> global spinlock etc., that's cool and I understand, but I thought I
> should at least post to this list so you and any other potentially
> interested people know it exists.  Oh, and, if you're not going to
> merge it, please let me know so I can know post the patch to GitHub
> or somewhere.  And, if you could include a note in the comments at
> the top of bond_main.c or somewhere pointing people to the patch,
> I'd very much appreciate that. I don't want anyone else to have to
> endure hours of kernel rebuilds with KASAN enabled if they want this
> functionality :)

Well, it's not my call, I'm not a bonding maintainer. But I believe at
least some of the objections would be shared by them. Of course, it's up
to you if you want to dedicate your time to improving the code to be
acceptable for mainline or rather maintain it out of tree (which may end
up taking even more time in the long term).

                                                          Michal Kubecek

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jay Vosburgh May 20, 2015, 5:45 p.m. UTC | #5
Michal Kubecek <mkubecek@suse.cz> wrote:

>On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote:
>> On 05/19/2015 03:49 AM, Michal Kubecek wrote:
>> >On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote:
>> >>
>> >>I've written a new mode for the kernel bonding driver.  It's similar to
>> >>the round-robin mode, but it keeps statistics on TCP resends so as to
>> >>favor slave devices with more bandwidth when choosing where to send
>> >>packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
>> >>to work okay, but it should be considered experimental.
>> >
>> >A description of how is the mode supposed to work would be definitely
>> >helpful.
>> >
>> 
>> Rationale: It's helpful for cases where the slave devices have
>> significantly different or varying bandwidth.  The reason I wrote it
>> is to bond powerline networking and wireless networking adapters
>> into a single interface for use with connecting to a MythTV server.
>> Neither of these systems is particularly reliable with bandwidth,
>> but mode=batman can adaptively figure out which network has more
>> available bandwidth at any given moment.  This is better than
>> mode=round-robin which always balances everything 50/50.
>
>Thank you. But I rather meant some basic description of the algorithm
>used to achieve this goal. Both should be IMHO part of the commit
>message.

	Agreed; the concept sounds interesting, but without a detailed
description of how it works it is difficult to evaluate its value.

>> Regarding your analysis, I appreciate your comments, and I know it's
>> rough, but I'm sorry to say I'm not really interested in doing much
>> to improve its polish past where it is.  If it fails some way when I
>> try to deploy it, then I'll fix that, and maybe I'll play around
>> with the balancing heuristics, but the code quality is what it is
>> unless someone else wants to improve it.  I would fix the
>> indentation if that would make it acceptable for you to merge it,
>> but not much more.  My argument for merging it is basically "it
>> doesn't do anything unless you pass mode=batman, so what's the
>> harm?".
>> 
>> So, if you guys decide you don't want to merge it because of the
>> global spinlock etc., that's cool and I understand, but I thought I
>> should at least post to this list so you and any other potentially
>> interested people know it exists.  Oh, and, if you're not going to
>> merge it, please let me know so I can know post the patch to GitHub
>> or somewhere.  And, if you could include a note in the comments at
>> the top of bond_main.c or somewhere pointing people to the patch,
>> I'd very much appreciate that. I don't want anyone else to have to
>> endure hours of kernel rebuilds with KASAN enabled if they want this
>> functionality :)
>
>Well, it's not my call, I'm not a bonding maintainer. But I believe at
>least some of the objections would be shared by them. Of course, it's up
>to you if you want to dedicate your time to improving the code to be
>acceptable for mainline or rather maintain it out of tree (which may end
>up taking even more time in the long term).

	Well, I am a bonding maintainer, and I can say that the patch in
its current state is not suitable for inclusion.

	At a minimum, there are many coding style issues, commented out
debug statements, etc, along with design issues (e.g., the batman mode
handling in bond_handle_frame is unconditional and takes place for all
modes, not just the new batman mode).

	If you (Patrick) or someone else wishes to contribute this to
mainline, I'd suggest that the first step is to read and follow the
instructions in Documentation/SubmittingPatches in the kernel source
code.

	It is also not feasible to add pointers in the kernel source
code to out-of-tree patches; sorry.

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Gospodarek May 20, 2015, 8:04 p.m. UTC | #6
On Wed, May 20, 2015 at 10:45:33AM -0700, Jay Vosburgh wrote:
> Michal Kubecek <mkubecek@suse.cz> wrote:
> 
> >On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote:
> >> On 05/19/2015 03:49 AM, Michal Kubecek wrote:
> >> >On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote:
> >> >>
> >> >>I've written a new mode for the kernel bonding driver.  It's similar to
> >> >>the round-robin mode, but it keeps statistics on TCP resends so as to
> >> >>favor slave devices with more bandwidth when choosing where to send
> >> >>packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
> >> >>to work okay, but it should be considered experimental.
> >> >
> >> >A description of how is the mode supposed to work would be definitely
> >> >helpful.
> >> >
> >> 
> >> Rationale: It's helpful for cases where the slave devices have
> >> significantly different or varying bandwidth.  The reason I wrote it
> >> is to bond powerline networking and wireless networking adapters
> >> into a single interface for use with connecting to a MythTV server.
> >> Neither of these systems is particularly reliable with bandwidth,
> >> but mode=batman can adaptively figure out which network has more
> >> available bandwidth at any given moment.  This is better than
> >> mode=round-robin which always balances everything 50/50.
> >
> >Thank you. But I rather meant some basic description of the algorithm
> >used to achieve this goal. Both should be IMHO part of the commit
> >message.
> 
> 	Agreed; the concept sounds interesting, but without a detailed
> description of how it works it is difficult to evaluate its value.
> 
> >> Regarding your analysis, I appreciate your comments, and I know it's
> >> rough, but I'm sorry to say I'm not really interested in doing much
> >> to improve its polish past where it is.  If it fails some way when I
> >> try to deploy it, then I'll fix that, and maybe I'll play around
> >> with the balancing heuristics, but the code quality is what it is
> >> unless someone else wants to improve it.  I would fix the
> >> indentation if that would make it acceptable for you to merge it,
> >> but not much more.  My argument for merging it is basically "it
> >> doesn't do anything unless you pass mode=batman, so what's the
> >> harm?".
> >> 
> >> So, if you guys decide you don't want to merge it because of the
> >> global spinlock etc., that's cool and I understand, but I thought I
> >> should at least post to this list so you and any other potentially
> >> interested people know it exists.  Oh, and, if you're not going to
> >> merge it, please let me know so I can know post the patch to GitHub
> >> or somewhere.  And, if you could include a note in the comments at
> >> the top of bond_main.c or somewhere pointing people to the patch,
> >> I'd very much appreciate that. I don't want anyone else to have to
> >> endure hours of kernel rebuilds with KASAN enabled if they want this
> >> functionality :)
> >
> >Well, it's not my call, I'm not a bonding maintainer. But I believe at
> >least some of the objections would be shared by them. Of course, it's up
> >to you if you want to dedicate your time to improving the code to be
> >acceptable for mainline or rather maintain it out of tree (which may end
> >up taking even more time in the long term).
> 
> 	Well, I am a bonding maintainer, and I can say that the patch in
> its current state is not suitable for inclusion.
> 
> 	At a minimum, there are many coding style issues, commented out
> debug statements, etc, along with design issues (e.g., the batman mode
> handling in bond_handle_frame is unconditional and takes place for all
> modes, not just the new batman mode).
> 
> 	If you (Patrick) or someone else wishes to contribute this to
> mainline, I'd suggest that the first step is to read and follow the
> instructions in Documentation/SubmittingPatches in the kernel source
> code.
> 
> 	It is also not feasible to add pointers in the kernel source
> code to out-of-tree patches; sorry.

Well I can now delete most of my initial response.  :)

Overall I would say this is really cool functionality.  Even if you do
not want it merged, I think it is great that you shared it with the
community this way.  I got a chance to look at it a bit this morning and 
I agree additional explanation of the algorithm you are using would
probably be nice for those checking this out for the first time.

I also think if you would be able to leverage the existing bonding infra 
for using skb->queue_mapping you could probably add the same
functionality (though it might be higher in the stack), but I totally
understand if you want to just keep using what you are using as-is.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Patrick Simmons May 20, 2015, 10:37 p.m. UTC | #7
On 05/20/2015 01:45 PM, Jay Vosburgh wrote:
> Michal Kubecek <mkubecek@suse.cz> wrote:
>
>> On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote:
>>> On 05/19/2015 03:49 AM, Michal Kubecek wrote:
>>>> On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote:
>>>>>
>>>>> I've written a new mode for the kernel bonding driver.  It's similar to
>>>>> the round-robin mode, but it keeps statistics on TCP resends so as to
>>>>> favor slave devices with more bandwidth when choosing where to send
>>>>> packets.  I've tested it on two laptops using WiFi/Ethernet and it seems
>>>>> to work okay, but it should be considered experimental.
>>>>
>>>> A description of how is the mode supposed to work would be definitely
>>>> helpful.
>>>>
>>>
>>> Rationale: It's helpful for cases where the slave devices have
>>> significantly different or varying bandwidth.  The reason I wrote it
>>> is to bond powerline networking and wireless networking adapters
>>> into a single interface for use with connecting to a MythTV server.
>>> Neither of these systems is particularly reliable with bandwidth,
>>> but mode=batman can adaptively figure out which network has more
>>> available bandwidth at any given moment.  This is better than
>>> mode=round-robin which always balances everything 50/50.
>>
>> Thank you. But I rather meant some basic description of the algorithm
>> used to achieve this goal. Both should be IMHO part of the commit
>> message.
>
> 	Agreed; the concept sounds interesting, but without a detailed
> description of how it works it is difficult to evaluate its value.
>

The basic algorithm is that each device starts out with a single 
"point".  When a packet drop occurs and another device has been selected 
to retransmit the packet, that device gets another point and, if the 
first device has more than one point, the first device (the one that 
failed to successfully get the packet across) loses a point.  At every 
packet send, all the points of all devices are added up, and each device 
has a (num_points)/(sum of all points) chance of being chosen to send 
the packet.

Every 10 seconds, all devices points are halved (unless the device only 
has one point).  If the bonding device is approximately idle for 60 
seconds, all devices' points are reset to 1 and the algorithm starts over.

Additionally, no slave ever retransmits a packet it itself sent unless 
it is the only active slave; but, if the slave WOULD HAVE retransmitted 
its own packet according to the algorithm, it doesn't lose any points 
and the other slave doesn't get any.

>>> Regarding your analysis, I appreciate your comments, and I know it's
>>> rough, but I'm sorry to say I'm not really interested in doing much
>>> to improve its polish past where it is.  If it fails some way when I
>>> try to deploy it, then I'll fix that, and maybe I'll play around
>>> with the balancing heuristics, but the code quality is what it is
>>> unless someone else wants to improve it.  I would fix the
>>> indentation if that would make it acceptable for you to merge it,
>>> but not much more.  My argument for merging it is basically "it
>>> doesn't do anything unless you pass mode=batman, so what's the
>>> harm?".
>>>
>>> So, if you guys decide you don't want to merge it because of the
>>> global spinlock etc., that's cool and I understand, but I thought I
>>> should at least post to this list so you and any other potentially
>>> interested people know it exists.  Oh, and, if you're not going to
>>> merge it, please let me know so I can know post the patch to GitHub
>>> or somewhere.  And, if you could include a note in the comments at
>>> the top of bond_main.c or somewhere pointing people to the patch,
>>> I'd very much appreciate that. I don't want anyone else to have to
>>> endure hours of kernel rebuilds with KASAN enabled if they want this
>>> functionality :)
>>
>> Well, it's not my call, I'm not a bonding maintainer. But I believe at
>> least some of the objections would be shared by them. Of course, it's up
>> to you if you want to dedicate your time to improving the code to be
>> acceptable for mainline or rather maintain it out of tree (which may end
>> up taking even more time in the long term).

It'll probably just bit-rot :(  The next time it gets looked at will 
probably be in 4-5 years when one of the MythTV computers dies and I 
have to replace it and use a newer kernel because it has new hardware 
that isn't supported.

But it is now and will remain indefinitely at

http://moongate.ydns.eu/bond_mode_batman.patch

for anyone who wants it.

>
> 	Well, I am a bonding maintainer, and I can say that the patch in
> its current state is not suitable for inclusion.
>
> 	At a minimum, there are many coding style issues, commented out
> debug statements, etc, along with design issues (e.g., the batman mode
> handling in bond_handle_frame is unconditional and takes place for all
> modes, not just the new batman mode).
>
> 	If you (Patrick) or someone else wishes to contribute this to
> mainline, I'd suggest that the first step is to read and follow the
> instructions in Documentation/SubmittingPatches in the kernel source
> code.
>
> 	It is also not feasible to add pointers in the kernel source
> code to out-of-tree patches; sorry.
>
> 	-J
>
> ---
> 	-Jay Vosburgh, jay.vosburgh@canonical.com
>

Fair enough.  I actually did read SubmittingPatches already, but thank 
you for pointing it out.  I could easily fix the coding style issues (I 
imagine someone's written a "Linux kernel indentation mode" for emacs by 
now), and I could of course put an if statement around the stuff in 
bond_handle_frame (that's a bug).  What I wouldn't be interested in 
doing would be things like getting rid of the mode-specific spinlocks, 
which someone mentioned earlier as a potential issue.

Basically, I'll do bug fixes etc. if that would make it suitable, but 
not anything like design changes that would be likely to make me have to 
debug the thing again.  So, I guess my question is, would design changes 
be necessary, or would it be suitable if I fixed the indentation, 
cleaned up the debug comments, fixed the bug in bond_handle_frame, and 
did something to handle kmalloc failures?

By the way, I do appreciate the time you all have taken to look at this, 
and thank you for CCing me and please continue to do so because I didn't 
realize the list was so heavy traffic, and so I unsubscribed.

--Patrick Simmons
Patrick Simmons May 20, 2015, 10:48 p.m. UTC | #8
On 05/20/2015 04:04 PM, Andy Gospodarek wrote:
>
> Well I can now delete most of my initial response.  :)
>
> Overall I would say this is really cool functionality.  Even if you do
> not want it merged, I think it is great that you shared it with the
> community this way.  I got a chance to look at is a bit this morning and
> I agree additional explanation of the algorithm you are using would
> probably be nice for those checking this out for the first time.

Thanks :)  I explained the algorithm in my last email to the list, so I 
won't repeat it here.

>
> I also think if you would be able to leverage the exiting bonding infra
> for using skb->queue_mapping you could probably add the same
> functionality (though it might be higher in the stack), but I totally
> understand if you want to just keep using what you are using as-is.
>
>

I didn't know about the queue_mapping thing.  Thanks for the pointer.

One thing I considered doing that I didn't ultimately do was try to 
implement this as a TUN device.  Userspace coding is always easier and, 
in retrospect, finding some way to manually parse IP and TCP packet 
headers would likely have been easier than trying to debug kernel memory corruption 
bugs.  But another downside would have been that I'm pretty sure all the 
slaves would have had to have been set to promiscuous mode to make it work.

--Patrick Simmons
diff mbox

Patch

--- tree_a/include/uapi/linux/if_bonding.h	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/include/uapi/linux/if_bonding.h	2015-05-18 14:49:16.916558030 -0400
@@ -70,6 +70,7 @@ 
 #define BOND_MODE_8023AD        4
 #define BOND_MODE_TLB           5
 #define BOND_MODE_ALB		6 /* TLB + RLB (receive load balancing) */
+#define BOND_MODE_BATMAN        7
 
 /* each slave's link has 4 states */
 #define BOND_LINK_UP    0           /* link is up and running */
--- tree_a/include/net/bonding.h	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/include/net/bonding.h	2015-05-18 16:54:46.452900354 -0400
@@ -15,6 +15,7 @@ 
 #ifndef _NET_BONDING_H
 #define _NET_BONDING_H
 
+#include <linux/kthread.h>
 #include <linux/timer.h>
 #include <linux/proc_fs.h>
 #include <linux/if_bonding.h>
@@ -241,6 +242,7 @@  struct bonding {
 	struct	 dentry *debug_dir;
 #endif /* CONFIG_DEBUG_FS */
 	struct rtnl_link_stats64 bond_stats;
+    struct task_struct* battimer_task;
 };
 
 #define bond_slave_get_rcu(dev) \
@@ -650,6 +652,22 @@  static inline int bond_get_targets_ip(__
 	return -1;
 }
 
+/* One record of a transmitted TCP segment, chained into a battable hash
+ * bucket (keyed by port_id) so that a later retransmit of the same
+ * (port, seq) pair can be charged against the slave that carried it. */
+struct batnode
+{
+     struct batnode* next_batnode;   /* next node in this hash bucket's chain */
+     char slvname[IFNAMSIZ];         /* name of the slave that sent the segment */
+     unsigned long jiffiestamp;      /* jiffies at send time; 0 marks an unused node */
+     u32 tcp_seq_num;                /* TCP sequence number of the segment */
+     u16 port_id;                    /* max(source, dest) port; also the hash key */
+};
+
+/* Per-slave score node, kept in a singly linked list keyed by interface
+ * name.  Despite the name, a HIGHER congestion_counter means the slave is
+ * MORE favored by the transmit path (counters are incremented for slaves
+ * that avoid retransmits and decremented for slaves whose packets were
+ * retransmitted). */
+struct slave_congestion_node
+{
+     struct slave_congestion_node* next_slave;  /* next list node */
+     char this_slave[IFNAMSIZ];                 /* slave interface name (lookup key) */
+     u32 congestion_counter;                    /* selection weight; kept >= 1 */
+};
+
 /* exported from bond_main.c */
 extern int bond_net_id;
 extern const struct bond_parm_tbl bond_lacp_tbl[];
@@ -660,6 +678,58 @@  extern const struct bond_parm_tbl fail_o
 extern const struct bond_parm_tbl pri_reselect_tbl[];
 extern struct bond_parm_tbl ad_select_tbl[];
 
+#define BATTABLE_SIZE 256
+/* NOTE(review): these are object *definitions* with static linkage placed
+ * in a header -- every .c file that includes bonding.h gets its own
+ * private copy of the table and list head.  This only works while a
+ * single translation unit uses them; they belong in bond_main.c. */
+static struct batnode battable[BATTABLE_SIZE];
+static struct slave_congestion_node* slavelist_head_ptr = NULL;
+
+/* Look up the congestion node for slave @slvname, creating a fresh node
+ * (counter initialized to 1) at the tail of the list if none exists yet.
+ * Caller must hold batlock.  Returns NULL if the GFP_ATOMIC allocation
+ * of a new node fails; callers must check for that. */
+static struct slave_congestion_node* get_slave_congestion_node(char* slvname)
+{
+     struct slave_congestion_node* cur_slave = slavelist_head_ptr;
+     struct slave_congestion_node* new_slave;
+
+     while(cur_slave)
+     {
+          if(strcmp(cur_slave->this_slave,slvname)==0)
+               return cur_slave;
+
+          if(!cur_slave->next_slave)
+               break; /* remember the tail so we can append below */
+          cur_slave = cur_slave->next_slave;
+     }
+
+     /* Not found: allocate and initialize a new node. */
+     new_slave = kmalloc(sizeof(*new_slave),GFP_ATOMIC);
+     if(!new_slave) /* atomic allocations can fail; never dereference NULL */
+          return NULL;
+
+     new_slave->next_slave = NULL;
+     strlcpy(new_slave->this_slave,slvname,IFNAMSIZ);
+     new_slave->congestion_counter = 1;
+
+     if(!cur_slave) /* list was empty: new node becomes the head */
+          slavelist_head_ptr = new_slave;
+     else
+          cur_slave->next_slave = new_slave;
+
+     return new_slave;
+}
+
+/* Raise @the_slave's selection weight by one.  A NULL lookup result
+ * (GFP_ATOMIC allocation failure inside the getter) is silently ignored
+ * rather than dereferenced. */
+static void slave_congestion_increment(char* the_slave)
+{
+     struct slave_congestion_node* node = get_slave_congestion_node(the_slave);
+     if(node)
+          node->congestion_counter++;
+}
+
+/* Lower @the_slave's selection weight by one, but never below 1: every
+ * other decrement in this driver is guarded by "> 1", and a counter of 0
+ * could make the total congestion reach 0, which the transmit path uses
+ * as a modulus ("jiffies % congestion").  NULL lookup results (atomic
+ * allocation failure) are ignored. */
+static void slave_congestion_decrement(char* the_slave)
+{
+     struct slave_congestion_node* node = get_slave_congestion_node(the_slave);
+     if(node && node->congestion_counter > 1)
+          node->congestion_counter--;
+}
+
 /* exported from bond_netlink.c */
 extern struct rtnl_link_ops bond_link_ops;
 
--- tree_a/drivers/net/bonding/bond_main.c	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/drivers/net/bonding/bond_main.c	2015-05-18 20:17:46.914627467 -0400
@@ -71,6 +71,8 @@ 
 #include <linux/if_bonding.h>
 #include <linux/jiffies.h>
 #include <linux/preempt.h>
+#include <linux/spinlock_types.h>
+#include <linux/spinlock.h>
 #include <net/route.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
@@ -135,7 +137,7 @@  module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
 		       "1 for active-backup, 2 for balance-xor, "
 		       "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
-		       "6 for balance-alb");
+		 "6 for balance-alb, 7 for batman");
 module_param(primary, charp, 0);
 MODULE_PARM_DESC(primary, "Primary network device to use");
 module_param(primary_reselect, charp, 0);
@@ -205,6 +207,8 @@  static int bond_mode	= BOND_MODE_ROUNDRO
 static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
 static int lacp_fast;
 
+static DEFINE_SPINLOCK(batlock);
+
 /*-------------------------- Forward declarations ---------------------------*/
 
 static int bond_init(struct net_device *bond_dev);
@@ -225,9 +229,10 @@  const char *bond_mode_name(int mode)
 		[BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
 		[BOND_MODE_TLB] = "transmit load balancing",
 		[BOND_MODE_ALB] = "adaptive load balancing",
+        [BOND_MODE_BATMAN] = "load balancing (Batman)",
 	};
 
-	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
+	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_BATMAN)
 		return "unknown";
 
 	return names[mode];
@@ -856,7 +861,8 @@  void bond_change_active_slave(struct bon
 	 */
 	if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
 	    ((bond_uses_primary(bond) && new_active) ||
-	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
+	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN ||
+	     BOND_MODE(bond) == BOND_MODE_BATMAN)) {
 		bond->igmp_retrans = bond->params.resend_igmp;
 		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
 	}
@@ -1082,6 +1088,69 @@  static bool bond_should_deliver_exact_ma
 	return false;
 }
 
+/* Decay every slave's congestion counter by one (never below 1) at most
+ * once per jiffies%1000 boundary, keeping *congestion (the caller's
+ * running sum of all counters) consistent with the per-slave values.
+ * If more than 60 seconds elapse without a normalization, reset every
+ * counter to 1 and rebuild *congestion from scratch.  Caller must hold
+ * batlock and be inside rcu_read_lock (bond_for_each_slave_rcu).
+ * NOTE(review): the static jiffiestamp is shared across all bond devices
+ * -- confirm that is intended for multi-bond setups. */
+static void batman_normalize_congestion(struct bonding* bond, int* congestion)
+{
+     static long unsigned int jiffiestamp = 0; /* jiffies of the last normalization */
+     long unsigned int jiffie_temp = jiffies;
+     struct slave* slave;
+     struct list_head* i;
+     if(jiffie_temp!=jiffiestamp && jiffie_temp%1000==0) //NOTE(review): this triggers on a jiffies boundary, i.e. time-based -- not "every thousand packets" as originally commented; confirm intent
+     {
+          printk(KERN_DEBUG "batman: congestion normalization has occurred.\n");
+          bond_for_each_slave_rcu(bond,slave,i)
+          {
+               struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
+               if(congest_current->congestion_counter > 1)
+               {
+                    (*congestion)--;
+                    congest_current->congestion_counter--;
+               }
+          }
+          jiffiestamp = jiffie_temp;
+     }
+     else if((jiffie_temp - jiffiestamp) / HZ > 60) /* failsafe: too long since the last normalization */
+     {
+          printk(KERN_DEBUG "batman: failsafe reset invoked: %lu, %lu, %lu\n",jiffie_temp, jiffiestamp, (jiffie_temp - jiffiestamp) / HZ);
+          *congestion = 0;
+          bond_for_each_slave_rcu(bond,slave,i)
+          {
+               struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
+               congest_current->congestion_counter = 1;
+               (*congestion)++;
+          }
+          jiffiestamp = jiffie_temp;
+     }
+}
+
+/* Background decay thread (started by kthread_run in bond_setup): about
+ * every 10 seconds, halve each slave's congestion counter (never below 1)
+ * so stale history fades.  Loops until kthread_stop() is called, then
+ * returns 0 to the joiner. */
+static int batman_timer_invoke(void* bond_)
+{
+     struct bonding* bond = (struct bonding*)(bond_);
+     struct slave* slave;
+     struct list_head* i;
+
+     while(!kthread_should_stop())
+     {
+          unsigned long batflags;
+          long unsigned int jiffiestamp = jiffies;
+          long unsigned int jiffie_temp;
+
+          msleep(10*1000);
+          rcu_read_lock();
+          spin_lock_irqsave(&batlock,batflags);
+
+          jiffie_temp = jiffies;
+          /* Only decay if at least 5 seconds really elapsed (e.g. guard
+           * against a sleep that was cut short). */
+          if(!(jiffie_temp - jiffiestamp < 5 * HZ))
+               bond_for_each_slave_rcu(bond,slave,i)
+               {
+                    struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
+                    if(congest_current && congest_current->congestion_counter >= 2)
+                         congest_current->congestion_counter/=2;
+               }
+
+          spin_unlock_irqrestore(&batlock,batflags);
+          rcu_read_unlock();
+     }
+
+     /* kthread functions must return an int; falling off the end of a
+      * non-void function is undefined behavior. */
+     return 0;
+}
+
 static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
@@ -1100,7 +1169,67 @@  static rx_handler_result_t bond_handle_f
 	slave = bond_slave_get_rcu(skb->dev);
 	bond = slave->bond;
 
-	recv_probe = ACCESS_ONCE(bond->recv_probe);
+    //This seems as good a place as any to put this.
+    if(skb->protocol == htons(ETH_P_IP))
+    {
+         struct iphdr* iph = ip_hdr(skb);
+         if(iph->protocol == IPPROTO_TCP)
+         {
+              //printk(KERN_EMERG "batman recv handler Chekpoint 1\n");
+              
+              //Headers
+              struct tcphdr* tcp_header = tcp_hdr(skb);
+              u16 port_id = max(ntohs(tcp_header->source),ntohs(tcp_header->dest)); 
+              u32 ack_seq = ntohl(tcp_header->ack_seq);
+
+              unsigned long batflags;
+              spin_lock_irqsave(&batlock,batflags);
+              
+              //Iterate over hash table, deleting old congestion nodes along the way
+              struct batnode* backptr = NULL;
+              struct batnode* battable_ptr = battable + (port_id % BATTABLE_SIZE);
+              while(battable_ptr && battable_ptr->jiffiestamp)
+              {
+                   if(battable_ptr->port_id == port_id && battable_ptr->tcp_seq_num < ack_seq ||
+                      (jiffies - battable_ptr->jiffiestamp) / HZ > 60)
+                   {
+                        //Special case delete from head
+                        if(!backptr)
+                             if(battable_ptr->next_batnode)
+                             {
+                                  struct batnode* tofree = battable_ptr->next_batnode;
+                                  memcpy(battable_ptr,battable_ptr->next_batnode,sizeof(struct batnode));
+                                  kfree(tofree);
+                             }
+                             else
+                             {
+                                  battable_ptr->jiffiestamp = 0;
+                                  break;
+                             }
+                        else
+                        {
+                             backptr->next_batnode = battable_ptr->next_batnode;
+                             kfree(battable_ptr);
+                             battable_ptr = backptr->next_batnode;
+                        }
+
+                        continue;
+                   }
+                        
+                   //Go to next in sequence
+                   backptr = battable_ptr;
+                   battable_ptr = battable_ptr->next_batnode;
+              }
+
+              int ignored=0;
+              batman_normalize_congestion(bond,&ignored);
+
+              spin_unlock_irqrestore(&batlock,batflags);
+              //printk(KERN_EMERG "batman recv handler Chekpoint 2\n");
+         }
+    }
+    
+    recv_probe = ACCESS_ONCE(bond->recv_probe);
 	if (recv_probe) {
 		ret = recv_probe(skb, bond, slave);
 		if (ret == RX_HANDLER_CONSUMED) {
@@ -3485,7 +3614,6 @@  static int bond_set_mac_address(struct n
 	if (BOND_MODE(bond) == BOND_MODE_ALB)
 		return bond_alb_set_mac_address(bond_dev, addr);
 
-
 	netdev_dbg(bond_dev, "bond=%p\n", bond);
 
 	/* If fail_over_mac is enabled, do nothing and return success.
@@ -3538,6 +3666,224 @@  unwind:
 	return res;
 }
 
+static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int slave_id);
+
+/* Transmit entry point for mode=batman.  IGMP goes out on the current
+ * active slave for failover consistency.  Everything else is steered by
+ * the per-slave congestion counters: non-TCP traffic uses the slave with
+ * the highest counter; TCP traffic is spread across slaves with
+ * probability proportional to each counter, and each TCP segment's
+ * (port, seq) is recorded in battable so a later retransmit can be
+ * charged against the slave that carried the original. */
+static int bond_xmit_batman(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave = NULL, *backup_slave = NULL;
+	struct list_head* i;
+	struct iphdr *iph = ip_hdr(skb);
+
+	/*
+	 * Start with the curr_active_slave that joined the bond as the
+	 * default for sending IGMP traffic.  For failover purposes one
+	 * needs to maintain some consistency for the interface that will
+	 * send the join/membership reports.  The curr_active_slave found
+	 * will send all of this type of traffic.
+	 */
+	if ((iph->protocol == IPPROTO_IGMP) &&
+	    (skb->protocol == htons(ETH_P_IP))) {
+
+		slave = rcu_dereference(bond->curr_active_slave);
+		if (slave)
+			bond_dev_queue_xmit(bond, skb, slave->dev);
+		else
+			bond_xmit_slave_id(bond, skb, 0);
+	} else {
+         u32 congestion = 0;
+         struct slave_congestion_node* congest_node = NULL;
+
+         unsigned long flags;
+         spin_lock_irqsave(&batlock,flags);
+
+         /* Sum every slave's counter and remember the most-favored slave
+          * (highest counter).  NULL nodes (atomic allocation failure in
+          * the getter) are skipped. */
+         bond_for_each_slave_rcu(bond,slave,i)
+         {
+              struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
+              if(!congest_current)
+                   continue;
+              congestion+=congest_current->congestion_counter;
+              if(!congest_node || congest_current->congestion_counter > congest_node->congestion_counter)
+                   congest_node = congest_current;
+         }
+
+         /* Default (non-TCP) choice: the most-favored slave.  The braces
+          * here are required -- without them the "else" binds to the
+          * strcmp test inside the loop, not to "if(congest_node)", and
+          * slave gets clobbered on every non-matching iteration. */
+         if(congest_node)
+         {
+              bond_for_each_slave_rcu(bond,slave,i)
+                   if(strcmp(slave->dev->name,congest_node->this_slave)==0)
+                        break;
+         }
+         else
+              slave = rcu_dereference(bond->curr_active_slave);
+
+         //If we're TCP, update our congestion table
+         if(skb->protocol == htons(ETH_P_IP) && iph->protocol == IPPROTO_TCP)
+         {
+              //Headers
+              struct tcphdr *tcp_header = tcp_hdr(skb);
+              u16 port_id = max(ntohs(tcp_header->source),ntohs(tcp_header->dest));
+              int batidx = port_id % BATTABLE_SIZE;
+              u32 seqnum = ntohl(tcp_header->seq);
+              struct batnode* backptr;
+              struct batnode* battable_ptr;
+              u32 ratio_pos, walk_pos;
+
+              batman_normalize_congestion(bond,&congestion);
+
+              if(!congestion) /* no usable slaves; avoid division by zero below */
+                   goto end;
+
+              /* Pick a slave with probability proportional to its counter:
+               * walk the slaves until the cumulative weight passes a
+               * pseudo-random position in [0, congestion). */
+              ratio_pos = jiffies % congestion;
+              walk_pos = 0;
+              bond_for_each_slave_rcu(bond,slave,i)
+              {
+                   struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
+                   if(!congest_current)
+                        continue;
+                   walk_pos+=congest_current->congestion_counter;
+                   if(walk_pos > ratio_pos)
+                        break;
+                   backup_slave = slave;
+              }
+
+              if(!slave)
+                   slave = backup_slave;
+              if(!slave)
+              {
+                   printk(KERN_DEBUG "HOLY 13TH AMENDMENT, BATMAN!\n");
+                   goto end;
+              }
+
+              /* Walk this port's hash chain: penalize the slave whose
+               * earlier copy of this exact (port, seq) is being resent,
+               * expire stale entries, and end on a free node where this
+               * packet gets recorded. */
+              backptr = NULL;
+              battable_ptr = battable + batidx;
+              while(battable_ptr->jiffiestamp)
+              {
+                   if(battable_ptr->port_id==port_id && battable_ptr->tcp_seq_num == seqnum) //penalize slave that sent packet later retransmitted
+                   {
+                        struct slave_congestion_node* sptr = get_slave_congestion_node(battable_ptr->slvname);
+                        if(sptr && sptr->congestion_counter > 1 && strcmp(battable_ptr->slvname,slave->dev->name)!=0)
+                             sptr->congestion_counter--;
+                        if(strcmp(battable_ptr->slvname,slave->dev->name)!=0)
+                             slave_congestion_increment(slave->dev->name);
+                        else //never retransmit on same slave, unless we have no choice
+                        {
+                             struct slave* other_slave = slave;
+                             congest_node = NULL;
+                             bond_for_each_slave_rcu(bond,slave,i)
+                             {
+                                  struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name);
+                                  if(congest_current && strcmp(congest_current->this_slave,other_slave->dev->name)!=0 && (!congest_node || congest_current->congestion_counter > congest_node->congestion_counter))
+                                       congest_node = congest_current;
+                             }
+                             if(congest_node)
+                             {
+                                  bond_for_each_slave_rcu(bond,slave,i)
+                                       if(strcmp(congest_node->this_slave,slave->dev->name)==0)
+                                            break;
+                             }
+                             else
+                                  slave = other_slave;
+                        }
+
+                        battable_ptr->port_id = 0; /* mark matched entry for deletion below */
+                   }
+
+                   //Delete old failures
+                   if(!battable_ptr->port_id || (jiffies - battable_ptr->jiffiestamp) / HZ > 60)
+                   {
+                        //Special case delete from head
+                        if(!backptr)
+                        {
+                             if(battable_ptr->next_batnode)
+                             {
+                                  struct batnode* tofree = battable_ptr->next_batnode;
+                                  memcpy(battable_ptr,battable_ptr->next_batnode,sizeof(struct batnode));
+                                  kfree(tofree);
+                             }
+                             else
+                             {
+                                  battable_ptr->jiffiestamp = 0;
+                                  break;
+                             }
+
+                             continue;
+                        }
+                        else
+                        {
+                             backptr->next_batnode = battable_ptr->next_batnode;
+                             kfree(battable_ptr);
+                             battable_ptr = backptr;
+                        }
+                   }
+
+                   //Go to next in sequence
+                   if(battable_ptr->next_batnode)
+                   {
+                        backptr = battable_ptr;
+                        battable_ptr = battable_ptr->next_batnode;
+                   }
+                   else
+                   {
+                        struct batnode* empty = kmalloc(sizeof(*empty),GFP_ATOMIC);
+                        if(!empty) /* can't extend the chain; skip recording this packet */
+                             goto end;
+                        empty->jiffiestamp = 0;
+                        empty->next_batnode = NULL;
+                        battable_ptr->next_batnode = empty;
+                        backptr = battable_ptr; //We currently won't use backptr again, but just in case.
+                        battable_ptr = empty;
+                   }
+              }
+
+              /* Record which slave carried this (port, seq) so a future
+               * retransmit can be charged against it. */
+              strlcpy(battable_ptr->slvname,slave->dev->name,IFNAMSIZ);
+              battable_ptr->jiffiestamp = jiffies;
+              if(!battable_ptr->jiffiestamp) /* 0 means "unused node"; avoid it */
+                   battable_ptr->jiffiestamp++;
+              battable_ptr->tcp_seq_num = seqnum;
+              battable_ptr->port_id = port_id;
+
+              if(jiffies%100==0) /* occasional debug dump of all counters */
+              {
+                   struct slave_congestion_node* cur_slave = slavelist_head_ptr;
+                   while(cur_slave)
+                   {
+                        printk("slave %p: %s has congestion %u\n",
+                               cur_slave,cur_slave->this_slave,cur_slave->congestion_counter);
+                        cur_slave = cur_slave->next_slave;
+                   }
+              }
+         }
+
+         end:
+         spin_unlock_irqrestore(&batlock,flags);
+
+         if(slave /*&& bond_slave_is_up(slave)*/)
+              bond_dev_queue_xmit(bond, skb, slave->dev);
+         else
+              bond_xmit_slave_id(bond, skb, 0);
+    }
+
+	return NETDEV_TX_OK;
+}
+
 /**
  * bond_xmit_slave_id - transmit skb through slave with slave_id
  * @bond: bonding device that is transmitting
@@ -3896,6 +4242,8 @@  static netdev_tx_t __bond_start_xmit(str
 	switch (BOND_MODE(bond)) {
 	case BOND_MODE_ROUNDROBIN:
 		return bond_xmit_roundrobin(skb, dev);
+    case BOND_MODE_BATMAN:
+         return bond_xmit_batman(skb,dev);
 	case BOND_MODE_ACTIVEBACKUP:
 		return bond_xmit_activebackup(skb, dev);
 	case BOND_MODE_8023AD:
@@ -4020,6 +4368,9 @@  static void bond_destructor(struct net_d
 	if (bond->wq)
 		destroy_workqueue(bond->wq);
 	free_netdev(bond_dev);
+
+    if(bond_mode==BOND_MODE_BATMAN)
+         kthread_stop(bond->battimer_task);
 }
 
 void bond_setup(struct net_device *bond_dev)
@@ -4068,6 +4419,12 @@  void bond_setup(struct net_device *bond_
 	bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM);
 	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
 	bond_dev->features |= bond_dev->hw_features;
+
+    if(bond_mode == BOND_MODE_BATMAN)
+    {
+         printk(KERN_INFO "%s: I'm Batman.\n",bond_dev->name);
+         bond->battimer_task = kthread_run(batman_timer_invoke,bond,"%s_batman",bond_dev->name);
+    }    
 }
 
 /* Destroy a bonding device.
@@ -4615,6 +4972,9 @@  err_link:
 
 static void __exit bonding_exit(void)
 {
+    int i;
+    struct slave_congestion_node* next_slave;
+    
 	unregister_netdevice_notifier(&bond_netdev_notifier);
 
 	bond_destroy_debugfs();
@@ -4622,6 +4982,27 @@  static void __exit bonding_exit(void)
 	bond_netlink_fini();
 	unregister_pernet_subsys(&bond_net_ops);
 
+    //Batman doesn't leak memory
+    for(i=0; i<BATTABLE_SIZE; i++)
+    {
+         struct batnode* next_batnode = battable[i].next_batnode;
+         while(next_batnode)
+         {
+              struct batnode* next_next_batnode = next_batnode->next_batnode;
+              kfree(next_batnode);
+              next_batnode = next_next_batnode;
+         }
+    }
+    memset(battable,0,sizeof(struct batnode)*BATTABLE_SIZE);
+
+    next_slave = slavelist_head_ptr;
+    while(next_slave)
+    {
+         struct slave_congestion_node* next_next_slave = next_slave->next_slave;
+         kfree(next_slave);
+         next_slave = next_next_slave;
+    }
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	/* Make sure we don't have an imbalance on our netpoll blocking */
 	WARN_ON(atomic_read(&netpoll_block_tx));
--- tree_a/drivers/net/bonding/bond_options.c	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/drivers/net/bonding/bond_options.c	2015-05-08 23:33:24.850987473 -0400
@@ -80,6 +80,7 @@  static const struct bond_opt_value bond_
 	{ "802.3ad",       BOND_MODE_8023AD,       0},
 	{ "balance-tlb",   BOND_MODE_TLB,          0},
 	{ "balance-alb",   BOND_MODE_ALB,          0},
+    { "batman",        BOND_MODE_BATMAN,       0},
 	{ NULL,            -1,                     0},
 };