Message ID | 555AD3A7.7050202@netscape.net |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: > > I've written a new mode for the kernel bonding driver. It's similar to > the round-robin mode, but it keeps statistics on TCP resends so as to > favor slave devices with more bandwidth when choosing where to send > packets. I've tested it on two laptops using WiFi/Ethernet and it seems > to work okay, but it should be considered experimental. A description of how is the mode supposed to work would be definitely helpful. Below are some comments from a quick look over the patch, I didn't think too much about the logic of the operation. > +struct batnode > +{ > + struct batnode* next_batnode; > + char slvname[IFNAMSIZ]; > + unsigned long jiffiestamp; > + u32 tcp_seq_num; > + u16 port_id; > +}; > + > +struct slave_congestion_node > +{ > + struct slave_congestion_node* next_slave; > + char this_slave[IFNAMSIZ]; > + u32 congestion_counter; > +}; Referencing the slave by its name doesn't seem very efficient, especially if you are going to do it in the xmit handler. > +#define BATTABLE_SIZE 256 > +static struct batnode battable[BATTABLE_SIZE]; > +static struct slave_congestion_node* slavelist_head_ptr = NULL; Why one global list rather than one list per bond? And why do you need your own list in addition to already existing slave structures? > +static struct slave_congestion_node* get_slave_congestion_node(char* slvname) > +{ > + //printk("In get_slave_congestion_node: 0x%zx\n",the_slave); > + struct slave_congestion_node* cur_slave = slavelist_head_ptr; > + while(cur_slave) > + { > + if(strcmp(cur_slave->this_slave,slvname)==0) > + { > + //printk("Found a match: 0x%zx\n",cur_slave); > + return cur_slave; > + } > + > + if(!cur_slave->next_slave) > + break; > + cur_slave = cur_slave->next_slave; > + } > + > + //Create new slave node > + if(!cur_slave) //special case head > + { > + cur_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC)); > + slavelist_head_ptr = cur_slave; > + } > + else > + { > + cur_slave->next_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC)); > + cur_slave = cur_slave->next_slave; > + } You don't handle kmalloc() failure. > + > + //Initialize new slave node > + cur_slave->next_slave = NULL; > + strlcpy(cur_slave->this_slave,slvname,IFNAMSIZ); > + cur_slave->congestion_counter = 1; > + > + //printk("Created new congestion node: 0x%zx\n",cur_slave); > + return cur_slave; > +} > + > +static void slave_congestion_increment(char* the_slave) > +{ > + get_slave_congestion_node(the_slave)->congestion_counter++; > +} > + > +static void slave_congestion_decrement(char* the_slave) > +{ > + get_slave_congestion_node(the_slave)->congestion_counter--; > +} Should these three really be in a header file? > +static DEFINE_SPINLOCK(batlock); A lot of effort has been done recently to get rid of per-bond spinlocks. Adding a global one kind of goes against this work. > @@ -1082,6 +1088,69 @@ static bool bond_should_deliver_exact_ma > return false; > } > > +static void batman_normalize_congestion(struct bonding* bond, int* congestion) > +{ > + static long unsigned int jiffiestamp = 0; > + long unsigned int jiffie_temp = jiffies; > + struct slave* slave; > + struct list_head* i; > + if(jiffie_temp!=jiffiestamp && jiffie_temp%1000==0) //every thousand packets, normalize congestion Looks more like every 1000 jiffies, not packets. And what if this function doesn't get called in that one jiffy when jiffies is a multiple of 1000? (Similar problem later in bond_xmit_batman().) > + printk(KERN_DEBUG "batman: congestion normalization has occurred.\n"); Use pr_debug() or netdev_dbg(). > +static int batman_timer_invoke(void* bond_) > +{ > + struct bonding* bond = (struct bonding*)(bond_); > + struct slave* slave; > + struct list_head* i; > + > + while(!kthread_should_stop()) > + { > + unsigned long batflags; > + long unsigned int jiffiestamp = jiffies; > + > + msleep(10*1000); > + rcu_read_lock(); > + spin_lock_irqsave(&batlock,batflags); > + > + long unsigned int jiffie_temp = jiffies; > + if(!(jiffie_temp - jiffiestamp < 5 * HZ)) > + bond_for_each_slave_rcu(bond,slave,i) > + { > + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); > + if(congest_current->congestion_counter >= 2) > + congest_current->congestion_counter/=2; > + } > + > + spin_unlock_irqrestore(&batlock,batflags); > + rcu_read_unlock(); > + } > +} This seems to only run something periodically, why do you need a (per bond) kthread instead of a simple timer? > @@ -1100,7 +1169,67 @@ static rx_handler_result_t bond_handle_f > slave = bond_slave_get_rcu(skb->dev); > bond = slave->bond; > > - recv_probe = ACCESS_ONCE(bond->recv_probe); > + //This seems as good a place as any to put this. > + if(skb->protocol == htons(ETH_P_IP)) > + { ... > + } I would suggest to handle TCP over IPv6 as well. > @@ -4068,6 +4419,12 @@ void bond_setup(struct net_device *bond_ > bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM); > bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; > bond_dev->features |= bond_dev->hw_features; > + > + if(bond_mode == BOND_MODE_BATMAN) > + { > + printk(KERN_INFO "%s: I'm Batman.\n",bond_dev->name); > + bond->battimer_task = kthread_run(batman_timer_invoke,bond,"%s_batman",bond_dev->name); > + } > } > > /* Destroy a bonding device. You only start the kthread in bond_setup() and stop it in bond_destructor(). This doesn't handle the situation when the mode is changed after the bond is created. One general note: the patch breaks kernel coding style in a lot of points. Running checkpatch.pl on it, I got total: 562 errors, 466 warnings, 5 checks, 585 lines checked The script isn't perfect but this is too much. Michal Kubecek -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 5/19/15 2:09 AM, Patrick Simmons wrote: > Hello, > > I've written a new mode for the kernel bonding driver. It's similar to > the round-robin mode, but it keeps statistics on TCP resends so as to > favor slave devices with more bandwidth when choosing where to send > packets. I've tested it on two laptops using WiFi/Ethernet and it seems > to work okay, but it should be considered experimental. > > Still, it should be safe to apply this patch as it doesn't do anything > unless you specify "mode=batman" on the driver command line. > > Signed-off-by: Patrick Simmons <linuxrocks123@netscape.net> > It is usually a good idea to add the bonding maintainers directly, have added them. See <kernel>/MAINTAINERS for list. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/19/2015 03:49 AM, Michal Kubecek wrote: > On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: >> >> I've written a new mode for the kernel bonding driver. It's similar to >> the round-robin mode, but it keeps statistics on TCP resends so as to >> favor slave devices with more bandwidth when choosing where to send >> packets. I've tested it on two laptops using WiFi/Ethernet and it seems >> to work okay, but it should be considered experimental. > > A description of how is the mode supposed to work would be definitely > helpful. > Rationale: It's helpful for cases where the slave devices have significantly different or varying bandwidth. The reason I wrote it is to bond powerline networking and wireless networking adapters into a single interface for use with connecting to a MythTV server. Neither of these systems is particularly reliable with bandwidth, but mode=batman can adaptively figure out which network has more available bandwidth at any given moment. This is better than mode=round-robin which always balances everything 50/50. Regarding your analysis, I appreciate your comments, and I know it's rough, but I'm sorry to say I'm not really interested in doing much to improve its polish past where it is. If it fails some way when I try to deploy it, then I'll fix that, and maybe I'll play around with the balancing heuristics, but the code quality is what it is unless someone else wants to improve it. I would fix the indentation if that would make it acceptable for you to merge it, but not much more. My argument for merging it is basically "it doesn't do anything unless you pass mode=batman, so what's the harm?". So, if you guys decide you don't want to merge it because of the global spinlock etc., that's cool and I understand, but I thought I should at least post to this list so you and any other potentially interested people know it exists. Oh, and, if you're not going to merge it, please let me know so I can know post the patch to GitHub or somewhere. And, if you could include a note in the comments at the top of bond_main.c or somewhere pointing people to the patch, I'd very much appreciate that. I don't want anyone else to have to endure hours of kernel rebuilds with KASAN enabled if they want this functionality :) Cheers and have a good one! --Patrick
On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote: > On 05/19/2015 03:49 AM, Michal Kubecek wrote: > >On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: > >> > >>I've written a new mode for the kernel bonding driver. It's similar to > >>the round-robin mode, but it keeps statistics on TCP resends so as to > >>favor slave devices with more bandwidth when choosing where to send > >>packets. I've tested it on two laptops using WiFi/Ethernet and it seems > >>to work okay, but it should be considered experimental. > > > >A description of how is the mode supposed to work would be definitely > >helpful. > > > > Rationale: It's helpful for cases where the slave devices have > significantly different or varying bandwidth. The reason I wrote it > is to bond powerline networking and wireless networking adapters > into a single interface for use with connecting to a MythTV server. > Neither of these systems is particularly reliable with bandwidth, > but mode=batman can adaptively figure out which network has more > available bandwidth at any given moment. This is better than > mode=round-robin which always balances everything 50/50. Thank you. But I rather meant some basic description of the algorithm used to achieve this goal. Both should be IMHO part of the commit message. > Regarding your analysis, I appreciate your comments, and I know it's > rough, but I'm sorry to say I'm not really interested in doing much > to improve its polish past where it is. If it fails some way when I > try to deploy it, then I'll fix that, and maybe I'll play around > with the balancing heuristics, but the code quality is what it is > unless someone else wants to improve it. I would fix the > indentation if that would make it acceptable for you to merge it, > but not much more. My argument for merging it is basically "it > doesn't do anything unless you pass mode=batman, so what's the > harm?". > > So, if you guys decide you don't want to merge it because of the > global spinlock etc., that's cool and I understand, but I thought I > should at least post to this list so you and any other potentially > interested people know it exists. Oh, and, if you're not going to > merge it, please let me know so I can know post the patch to GitHub > or somewhere. And, if you could include a note in the comments at > the top of bond_main.c or somewhere pointing people to the patch, > I'd very much appreciate that. I don't want anyone else to have to > endure hours of kernel rebuilds with KASAN enabled if they want this > functionality :) Well, it's not my call, I'm not a bonding maintainer. But I believe at least some of the objections would be shared by them. Of course, it's up to you if you want to dedicate your time to improving the code to be acceptable for mainline or rather maintain it out of tree (which may end up taking even more time in the long term). Michal Kubecek -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Michal Kubecek <mkubecek@suse.cz> wrote: >On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote: >> On 05/19/2015 03:49 AM, Michal Kubecek wrote: >> >On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: >> >> >> >>I've written a new mode for the kernel bonding driver. It's similar to >> >>the round-robin mode, but it keeps statistics on TCP resends so as to >> >>favor slave devices with more bandwidth when choosing where to send >> >>packets. I've tested it on two laptops using WiFi/Ethernet and it seems >> >>to work okay, but it should be considered experimental. >> > >> >A description of how is the mode supposed to work would be definitely >> >helpful. >> > >> >> Rationale: It's helpful for cases where the slave devices have >> significantly different or varying bandwidth. The reason I wrote it >> is to bond powerline networking and wireless networking adapters >> into a single interface for use with connecting to a MythTV server. >> Neither of these systems is particularly reliable with bandwidth, >> but mode=batman can adaptively figure out which network has more >> available bandwidth at any given moment. This is better than >> mode=round-robin which always balances everything 50/50. > >Thank you. But I rather meant some basic description of the algorithm >used to achieve this goal. Both should be IMHO part of the commit >message. Agreed; the concept sounds interesting, but without a detailed description of how it works it is difficult to evaluate its value. >> Regarding your analysis, I appreciate your comments, and I know it's >> rough, but I'm sorry to say I'm not really interested in doing much >> to improve its polish past where it is. If it fails some way when I >> try to deploy it, then I'll fix that, and maybe I'll play around >> with the balancing heuristics, but the code quality is what it is >> unless someone else wants to improve it. I would fix the >> indentation if that would make it acceptable for you to merge it, >> but not much more. My argument for merging it is basically "it >> doesn't do anything unless you pass mode=batman, so what's the >> harm?". >> >> So, if you guys decide you don't want to merge it because of the >> global spinlock etc., that's cool and I understand, but I thought I >> should at least post to this list so you and any other potentially >> interested people know it exists. Oh, and, if you're not going to >> merge it, please let me know so I can know post the patch to GitHub >> or somewhere. And, if you could include a note in the comments at >> the top of bond_main.c or somewhere pointing people to the patch, >> I'd very much appreciate that. I don't want anyone else to have to >> endure hours of kernel rebuilds with KASAN enabled if they want this >> functionality :) > >Well, it's not my call, I'm not a bonding maintainer. But I believe at >least some of the objections would be shared by them. Of course, it's up >to you if you want to dedicate your time to improving the code to be >acceptable for mainline or rather maintain it out of tree (which may end >up taking even more time in the long term). Well, I am a bonding maintainer, and I can say that the patch in its current state is not suitable for inclusion. At a minimum, there are many coding style issues, commented out debug statements, etc, along with design issues (e.g., the batman mode handling in bond_handle_frame is unconditional and takes place for all modes, not just the new batman mode). If you (Patrick) or someone else wishes to contribute this to mainline, I'd suggest that the first step is to read and follow the instructions in Documentation/SubmittingPatches in the kernel source code. It is also not feasible to add pointers in the kernel source code to out-of-tree patches; sorry. -J --- -Jay Vosburgh, jay.vosburgh@canonical.com -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, May 20, 2015 at 10:45:33AM -0700, Jay Vosburgh wrote: > Michal Kubecek <mkubecek@suse.cz> wrote: > > >On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote: > >> On 05/19/2015 03:49 AM, Michal Kubecek wrote: > >> >On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: > >> >> > >> >>I've written a new mode for the kernel bonding driver. It's similar to > >> >>the round-robin mode, but it keeps statistics on TCP resends so as to > >> >>favor slave devices with more bandwidth when choosing where to send > >> >>packets. I've tested it on two laptops using WiFi/Ethernet and it seems > >> >>to work okay, but it should be considered experimental. > >> > > >> >A description of how is the mode supposed to work would be definitely > >> >helpful. > >> > > >> > >> Rationale: It's helpful for cases where the slave devices have > >> significantly different or varying bandwidth. The reason I wrote it > >> is to bond powerline networking and wireless networking adapters > >> into a single interface for use with connecting to a MythTV server. > >> Neither of these systems is particularly reliable with bandwidth, > >> but mode=batman can adaptively figure out which network has more > >> available bandwidth at any given moment. This is better than > >> mode=round-robin which always balances everything 50/50. > > > >Thank you. But I rather meant some basic description of the algorithm > >used to achieve this goal. Both should be IMHO part of the commit > >message. > > Agreed; the concept sounds interesting, but without a detailed > description of how it works it is difficult to evaluate its value. > > >> Regarding your analysis, I appreciate your comments, and I know it's > >> rough, but I'm sorry to say I'm not really interested in doing much > >> to improve its polish past where it is. If it fails some way when I > >> try to deploy it, then I'll fix that, and maybe I'll play around > >> with the balancing heuristics, but the code quality is what it is > >> unless someone else wants to improve it. I would fix the > >> indentation if that would make it acceptable for you to merge it, > >> but not much more. My argument for merging it is basically "it > >> doesn't do anything unless you pass mode=batman, so what's the > >> harm?". > >> > >> So, if you guys decide you don't want to merge it because of the > >> global spinlock etc., that's cool and I understand, but I thought I > >> should at least post to this list so you and any other potentially > >> interested people know it exists. Oh, and, if you're not going to > >> merge it, please let me know so I can know post the patch to GitHub > >> or somewhere. And, if you could include a note in the comments at > >> the top of bond_main.c or somewhere pointing people to the patch, > >> I'd very much appreciate that. I don't want anyone else to have to > >> endure hours of kernel rebuilds with KASAN enabled if they want this > >> functionality :) > > > >Well, it's not my call, I'm not a bonding maintainer. But I believe at > >least some of the objections would be shared by them. Of course, it's up > >to you if you want to dedicate your time to improving the code to be > >acceptable for mainline or rather maintain it out of tree (which may end > >up taking even more time in the long term). > > Well, I am a bonding maintainer, and I can say that the patch in > its current state is not suitable for inclusion. > > At a minimum, there are many coding style issues, commented out > debug statements, etc, along with design issues (e.g., the batman mode > handling in bond_handle_frame is unconditional and takes place for all > modes, not just the new batman mode). > > If you (Patrick) or someone else wishes to contribute this to > mainline, I'd suggest that the first step is to read and follow the > instructions in Documentation/SubmittingPatches in the kernel source > code. > > It is also not feasible to add pointers in the kernel source > code to out-of-tree patches; sorry. Well I can now delete most of my initial response. :) Overall I would say this is really cool functionality. Even if you do not want it merged, I think it is great that you shared it with the community this way. I got a chance to look at is a bit this morning and I agree additional explanation of the algorithm you are using would probably be nice for those checking this out for the first time. I also think if you would be able to leverage the exiting bonding infra for using skb->queue_mapping you could probably add the same functionality (though it might be higher in the stack), but I totally understand if you want to just keep using what you are using as-is. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/20/2015 01:45 PM, Jay Vosburgh wrote: > Michal Kubecek <mkubecek@suse.cz> wrote: > >> On Tue, May 19, 2015 at 04:29:45AM -0400, Patrick Simmons wrote: >>> On 05/19/2015 03:49 AM, Michal Kubecek wrote: >>>> On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: >>>>> >>>>> I've written a new mode for the kernel bonding driver. It's similar to >>>>> the round-robin mode, but it keeps statistics on TCP resends so as to >>>>> favor slave devices with more bandwidth when choosing where to send >>>>> packets. I've tested it on two laptops using WiFi/Ethernet and it seems >>>>> to work okay, but it should be considered experimental. >>>> >>>> A description of how is the mode supposed to work would be definitely >>>> helpful. >>>> >>> >>> Rationale: It's helpful for cases where the slave devices have >>> significantly different or varying bandwidth. The reason I wrote it >>> is to bond powerline networking and wireless networking adapters >>> into a single interface for use with connecting to a MythTV server. >>> Neither of these systems is particularly reliable with bandwidth, >>> but mode=batman can adaptively figure out which network has more >>> available bandwidth at any given moment. This is better than >>> mode=round-robin which always balances everything 50/50. >> >> Thank you. But I rather meant some basic description of the algorithm >> used to achieve this goal. Both should be IMHO part of the commit >> message. > > Agreed; the concept sounds interesting, but without a detailed > description of how it works it is difficult to evaluate its value. > The basic algorithm is that each device starts out with a single "point". When a packet drop occurs and another device has been selected to retransmit the packet, that device gets another point and, if the first device has more than one point, the first device (the one that failed to successfully get the packet across) loses a point. At every packet send, all the points of all devices are added up, and each device has a (num_points)/(sum of all points) chance of being chosen to send the packet. Every 10 seconds, all devices points are halved (unless the device only has one point). If the bonding device is approximately idle for 60 seconds, all devices' points are reset to 1 and the algorithm starts over. Additionally, no slave ever retransmits a packet it itself sent unless it is the only active slave; but, if the slave WOULD HAVE retransmitted its own packet according to the algorithm, it doesn't lose any points and the other slave doesn't get any. >>> Regarding your analysis, I appreciate your comments, and I know it's >>> rough, but I'm sorry to say I'm not really interested in doing much >>> to improve its polish past where it is. If it fails some way when I >>> try to deploy it, then I'll fix that, and maybe I'll play around >>> with the balancing heuristics, but the code quality is what it is >>> unless someone else wants to improve it. I would fix the >>> indentation if that would make it acceptable for you to merge it, >>> but not much more. My argument for merging it is basically "it >>> doesn't do anything unless you pass mode=batman, so what's the >>> harm?". >>> >>> So, if you guys decide you don't want to merge it because of the >>> global spinlock etc., that's cool and I understand, but I thought I >>> should at least post to this list so you and any other potentially >>> interested people know it exists. Oh, and, if you're not going to >>> merge it, please let me know so I can know post the patch to GitHub >>> or somewhere. And, if you could include a note in the comments at >>> the top of bond_main.c or somewhere pointing people to the patch, >>> I'd very much appreciate that. I don't want anyone else to have to >>> endure hours of kernel rebuilds with KASAN enabled if they want this >>> functionality :) >> >> Well, it's not my call, I'm not a bonding maintainer. But I believe at >> least some of the objections would be shared by them. Of course, it's up >> to you if you want to dedicate your time to improving the code to be >> acceptable for mainline or rather maintain it out of tree (which may end >> up taking even more time in the long term). It'll probably just bit-rot :( The next time it gets looked at will probably be in 4-5 years when one of the MythTV computers dies and I have to replace it and use a newer kernel because it has new hardware that isn't supported. But it is now and will remain indefinitely at http://moongate.ydns.eu/bond_mode_batman.patch for anyone who wants it. > > Well, I am a bonding maintainer, and I can say that the patch in > its current state is not suitable for inclusion. > > At a minimum, there are many coding style issues, commented out > debug statements, etc, along with design issues (e.g., the batman mode > handling in bond_handle_frame is unconditional and takes place for all > modes, not just the new batman mode). > > If you (Patrick) or someone else wishes to contribute this to > mainline, I'd suggest that the first step is to read and follow the > instructions in Documentation/SubmittingPatches in the kernel source > code. > > It is also not feasible to add pointers in the kernel source > code to out-of-tree patches; sorry. > > -J > > --- > -Jay Vosburgh, jay.vosburgh@canonical.com > Fair enough. I actually did read SubmittingPatches already, but thank you for pointing it out. I could easily fix the coding style issues (I imagine someone's written a "Linux kernel indentation mode" for emacs by now), and I could of course put an if statement around the stuff in bond_handle_frame (that's a bug). What I wouldn't be interested in doing would be things like getting rid of the mode-specific spinlocks, which someone mentioned earlier as a potential issue. Basically, I'll do bug fixes etc. if that would make it suitable, but not anything like design changes that would be likely to make me have to debug the thing again. So, I guess my question is, would design changes be necessary, or would it be suitable if I fixed the indentation, cleaned up the debug comments, fixed the bug in bond_handle_frame, and did something to handle kmalloc failures? By the way, I do appreciate the time you all have taken to look at this, and thank you for CCing me and please continue to do so because I didn't realize the list was so heavy traffic, and so I unsubscribed. --Patrick Simmons
On 05/20/2015 04:04 PM, Andy Gospodarek wrote: > > Well I can now delete most of my initial response. :) > > Overall I would say this is really cool functionality. Even if you do > not want it merged, I think it is great that you shared it with the > community this way. I got a chance to look at is a bit this morning and > I agree additional explanation of the algorithm you are using would > probably be nice for those checking this out for the first time. Thanks :) I explained the algorithm in my last email to the list, so I won't repeat it here. > > I also think if you would be able to leverage the exiting bonding infra > for using skb->queue_mapping you could probably add the same > functionality (though it might be higher in the stack), but I totally > understand if you want to just keep using what you are using as-is. > > I didn't know about the queue_mapping thing. Thanks for the pointer. One thing I considered doing that I didn't ultimately do was try to implement this as a TUN device. Userspace coding is always easier and, in retrospect, finding some way to manually parse IP and TCP packet headers would likely have been trying to debug kernel memory corruption bugs. But another downside would have been that I'm pretty sure all the slaves would have had to have been set to promiscuous mode to make it work. --Patrick Simmons
--- tree_a/include/uapi/linux/if_bonding.h 2015-05-06 16:04:23.000000000 -0400 +++ tree_b/include/uapi/linux/if_bonding.h 2015-05-18 14:49:16.916558030 -0400 @@ -70,6 +70,7 @@ #define BOND_MODE_8023AD 4 #define BOND_MODE_TLB 5 #define BOND_MODE_ALB 6 /* TLB + RLB (receive load balancing) */ +#define BOND_MODE_BATMAN 7 /* each slave's link has 4 states */ #define BOND_LINK_UP 0 /* link is up and running */ --- tree_a/include/net/bonding.h 2015-05-06 16:04:23.000000000 -0400 +++ tree_b/include/net/bonding.h 2015-05-18 16:54:46.452900354 -0400 @@ -15,6 +15,7 @@ #ifndef _NET_BONDING_H #define _NET_BONDING_H +#include <linux/kthread.h> #include <linux/timer.h> #include <linux/proc_fs.h> #include <linux/if_bonding.h> @@ -241,6 +242,7 @@ struct bonding { struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; + struct task_struct* battimer_task; }; #define bond_slave_get_rcu(dev) \ @@ -650,6 +652,22 @@ static inline int bond_get_targets_ip(__ return -1; } +struct batnode +{ + struct batnode* next_batnode; + char slvname[IFNAMSIZ]; + unsigned long jiffiestamp; + u32 tcp_seq_num; + u16 port_id; +}; + +struct slave_congestion_node +{ + struct slave_congestion_node* next_slave; + char this_slave[IFNAMSIZ]; + u32 congestion_counter; +}; + /* exported from bond_main.c */ extern int bond_net_id; extern const struct bond_parm_tbl bond_lacp_tbl[]; @@ -660,6 +678,58 @@ extern const struct bond_parm_tbl fail_o extern const struct bond_parm_tbl pri_reselect_tbl[]; extern struct bond_parm_tbl ad_select_tbl[]; +#define BATTABLE_SIZE 256 +static struct batnode battable[BATTABLE_SIZE]; +static struct slave_congestion_node* slavelist_head_ptr = NULL; + +static struct slave_congestion_node* get_slave_congestion_node(char* slvname) +{ + //printk("In get_slave_congestion_node: 0x%zx\n",the_slave); + struct slave_congestion_node* cur_slave = slavelist_head_ptr; + while(cur_slave) + { + if(strcmp(cur_slave->this_slave,slvname)==0) + { + //printk("Found a match: 0x%zx\n",cur_slave); + return cur_slave; + } + + if(!cur_slave->next_slave) + break; + cur_slave = cur_slave->next_slave; + } + + //Create new slave node + if(!cur_slave) //special case head + { + cur_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC)); + slavelist_head_ptr = cur_slave; + } + else + { + cur_slave->next_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC)); + cur_slave = cur_slave->next_slave; + } + + //Initialize new slave node + cur_slave->next_slave = NULL; + strlcpy(cur_slave->this_slave,slvname,IFNAMSIZ); + cur_slave->congestion_counter = 1; + + //printk("Created new congestion node: 0x%zx\n",cur_slave); + return cur_slave; +} + +static void slave_congestion_increment(char* the_slave) +{ + get_slave_congestion_node(the_slave)->congestion_counter++; +} + +static void slave_congestion_decrement(char* the_slave) +{ + get_slave_congestion_node(the_slave)->congestion_counter--; +} + /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; --- tree_a/drivers/net/bonding/bond_main.c 2015-05-06 16:04:23.000000000 -0400 +++ tree_b/drivers/net/bonding/bond_main.c 2015-05-18 20:17:46.914627467 -0400 @@ -71,6 +71,8 @@ #include <linux/if_bonding.h> #include <linux/jiffies.h> #include <linux/preempt.h> +#include <linux/spinlock_types.h> +#include <linux/spinlock.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -135,7 +137,7 @@ module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, " "1 for active-backup, 2 for balance-xor, " "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, " - "6 for balance-alb"); + "6 for balance-alb, 7 for batman"); module_param(primary, charp, 0); MODULE_PARM_DESC(primary, "Primary network device to use"); module_param(primary_reselect, charp, 0); @@ -205,6 +207,8 @@ static int bond_mode = BOND_MODE_ROUNDRO static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; static int lacp_fast; +static DEFINE_SPINLOCK(batlock); + /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); @@ -225,9 +229,10 @@ const char *bond_mode_name(int mode) [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation", [BOND_MODE_TLB] = "transmit load balancing", [BOND_MODE_ALB] = "adaptive load balancing", + [BOND_MODE_BATMAN] = "load balancing (Batman)", }; - if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB) + if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_BATMAN) return "unknown"; return names[mode]; @@ -856,7 +861,8 @@ void bond_change_active_slave(struct bon */ if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) && ((bond_uses_primary(bond) && new_active) || - BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) { + BOND_MODE(bond) == BOND_MODE_ROUNDROBIN || + BOND_MODE(bond) == BOND_MODE_BATMAN)) { bond->igmp_retrans = bond->params.resend_igmp; queue_delayed_work(bond->wq, &bond->mcast_work, 1); } @@ -1082,6 +1088,69 @@ static bool bond_should_deliver_exact_ma return false; } +static void batman_normalize_congestion(struct bonding* bond, int* congestion) +{ + static long unsigned int jiffiestamp = 0; + long unsigned int jiffie_temp = jiffies; + struct slave* slave; + struct list_head* i; + if(jiffie_temp!=jiffiestamp && jiffie_temp%1000==0) //every thousand packets, normalize congestion + { + printk(KERN_DEBUG "batman: congestion normalization has occurred.\n"); + bond_for_each_slave_rcu(bond,slave,i) + { + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); + if(congest_current->congestion_counter > 1) + { + (*congestion)--; + congest_current->congestion_counter--; + } + } + jiffiestamp = jiffie_temp; + } + else if((jiffie_temp - jiffiestamp) / HZ > 60) + { + printk(KERN_DEBUG "batman: failsafe reset invoked: %lu, %lu, %lu\n",jiffie_temp, jiffiestamp, (jiffie_temp - jiffiestamp) / HZ); + *congestion = 0; + bond_for_each_slave_rcu(bond,slave,i) + { + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); + congest_current->congestion_counter = 1; + (*congestion)++; + } + jiffiestamp = jiffie_temp; + } +} + +static int batman_timer_invoke(void* bond_) +{ + struct bonding* bond = (struct bonding*)(bond_); + struct slave* slave; + struct list_head* i; + + while(!kthread_should_stop()) + { + unsigned long batflags; + long unsigned int jiffiestamp = jiffies; + + msleep(10*1000); + rcu_read_lock(); + spin_lock_irqsave(&batlock,batflags); + + long unsigned int jiffie_temp = jiffies; + if(!(jiffie_temp - jiffiestamp < 5 * HZ)) + bond_for_each_slave_rcu(bond,slave,i) + { + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); + if(congest_current->congestion_counter >= 2) + congest_current->congestion_counter/=2; + } + + spin_unlock_irqrestore(&batlock,batflags); + rcu_read_unlock(); + } +} + static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; @@ -1100,7 +1169,67 @@ static rx_handler_result_t bond_handle_f slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; - recv_probe = ACCESS_ONCE(bond->recv_probe); + //This seems as good a place as any to put this. + if(skb->protocol == htons(ETH_P_IP)) + { + struct iphdr* iph = ip_hdr(skb); + if(iph->protocol == IPPROTO_TCP) + { + //printk(KERN_EMERG "batman recv handler Chekpoint 1\n"); + + //Headers + struct tcphdr* tcp_header = tcp_hdr(skb); + u16 port_id = max(ntohs(tcp_header->source),ntohs(tcp_header->dest)); + u32 ack_seq = ntohl(tcp_header->ack_seq); + + unsigned long batflags; + spin_lock_irqsave(&batlock,batflags); + + //Iterate over hash table, deleting old congestion nodes along the way + struct batnode* backptr = NULL; + struct batnode* battable_ptr = battable + (port_id % BATTABLE_SIZE); + while(battable_ptr && battable_ptr->jiffiestamp) + { + if(battable_ptr->port_id == port_id && battable_ptr->tcp_seq_num < ack_seq || + (jiffies - battable_ptr->jiffiestamp) / HZ > 60) + { + //Special case delete from head + if(!backptr) + if(battable_ptr->next_batnode) + { + struct batnode* tofree = battable_ptr->next_batnode; + memcpy(battable_ptr,battable_ptr->next_batnode,sizeof(struct batnode)); + kfree(tofree); + } + else + { + battable_ptr->jiffiestamp = 0; + break; + } + else + { + backptr->next_batnode = battable_ptr->next_batnode; + kfree(battable_ptr); + battable_ptr = backptr->next_batnode; + } + + continue; + } + + //Go to next in sequence + backptr = battable_ptr; + battable_ptr = battable_ptr->next_batnode; + } + + int ignored=0; + batman_normalize_congestion(bond,&ignored); + + spin_unlock_irqrestore(&batlock,batflags); + //printk(KERN_EMERG "batman recv handler Chekpoint 2\n"); + } + } + + recv_probe = ACCESS_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { @@ -3485,7 +3614,6 @@ static int bond_set_mac_address(struct n if (BOND_MODE(bond) == BOND_MODE_ALB) return bond_alb_set_mac_address(bond_dev, addr); - netdev_dbg(bond_dev, "bond=%p\n", bond); /* If fail_over_mac is enabled, do nothing and return success. @@ -3538,6 +3666,224 @@ unwind: return res; } +static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int slave_id); + +static int bond_xmit_batman(struct sk_buff *skb, struct net_device *bond_dev) +{ + struct bonding *bond = netdev_priv(bond_dev); + struct slave *slave = NULL, *backup_slave = NULL; + struct list_head* i; + int res = 1; + struct iphdr *iph = ip_hdr(skb); + + /* + * Start with the curr_active_slave that joined the bond as the + * default for sending IGMP traffic. For failover purposes one + * needs to maintain some consistency for the interface that will + * send the join/membership reports. The curr_active_slave found + * will send all of this type of traffic. + */ + if ((iph->protocol == IPPROTO_IGMP) && + (skb->protocol == htons(ETH_P_IP))) { + + slave = rcu_dereference(bond->curr_active_slave); + if (slave) + bond_dev_queue_xmit(bond, skb, slave->dev); + else + bond_xmit_slave_id(bond, skb, 0); + } else { + //printk(KERN_DEBUG "xmit_batman Checkpoint 1\n"); + + u32 congestion = 0; + struct slave_congestion_node* congest_node = NULL; + + unsigned long flags; + spin_lock_irqsave(&batlock,flags); + + //printk(KERN_DEBUG "xmit_batman Checkpoint 2\n"); + + //Choose slave to send + bond_for_each_slave_rcu(bond,slave,i) + { + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); + congestion+=congest_current->congestion_counter; + if(!congest_node || congest_current->congestion_counter > congest_node->congestion_counter) + congest_node = congest_current; + } + + //printk(KERN_DEBUG "xmit_batman Checkpoint 3\n"); + + //If we're not TCP, we do this + if(congest_node) + bond_for_each_slave_rcu(bond,slave,i) + if(strcmp(slave->dev->name,congest_node->this_slave)==0) + break; + else + slave = rcu_dereference(bond->curr_active_slave); + + //printk(KERN_DEBUG "xmit_batman Checkpoint 4\n"); + + //If we're TCP, update our congestion table + if(skb->protocol == htons(ETH_P_IP) && iph->protocol == IPPROTO_TCP) + { + //Headers + struct tcphdr *tcp_header = tcp_hdr(skb); + u16 port_id = max(ntohs(tcp_header->source),ntohs(tcp_header->dest)); + int batidx = port_id % BATTABLE_SIZE; + u32 seqnum = ntohl(tcp_header->seq); + + //printk(KERN_DEBUG "xmit_batman Checkpoint 5\n"); + + batman_normalize_congestion(bond,&congestion); + + //printk(KERN_DEBUG "xmit_batman Checkpoint 6\n"); + + //Find slave + u32 ratio_pos = jiffies % congestion; + u32 walk_pos = 0; + bond_for_each_slave_rcu(bond,slave,i) + { + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); + walk_pos+=congest_current->congestion_counter; + if(walk_pos > ratio_pos) + break; + backup_slave = slave; + } + + if(!slave) + slave = backup_slave; + if(!slave) + { + printk(KERN_DEBUG "HOLY 13TH AMENDMENT, BATMAN!"); + goto end; + } + + //printk(KERN_DEBUG "xmit_batman Checkpoint 7\n"); + + //Add to hash table + struct batnode* backptr = NULL; + struct batnode* battable_ptr = battable + batidx; + //printk(KERN_DEBUG "battable: 0x%zx\nbattable_ptr: 0x%zx", battable, battable_ptr); + while(battable_ptr->jiffiestamp) + { + if(battable_ptr->port_id==port_id && battable_ptr->tcp_seq_num == seqnum) //penalize slave that sent packet later retransmitted + { + //printk(KERN_DEBUG "Penalizing %s and favoring %s.\n",battable_ptr->slvname,slave->dev->name); + struct slave_congestion_node* sptr = get_slave_congestion_node(battable_ptr->slvname); + if(sptr->congestion_counter > 1 && strcmp(battable_ptr->slvname,slave->dev->name)!=0) + sptr->congestion_counter--; + if(strcmp(battable_ptr->slvname,slave->dev->name)!=0) + slave_congestion_increment(slave->dev->name); + else //never retransmit on same slave, unless we have no choice + { + struct slave* other_slave = slave; + congest_node = NULL; + bond_for_each_slave_rcu(bond,slave,i) + { + struct slave_congestion_node* congest_current = get_slave_congestion_node(slave->dev->name); + if(strcmp(congest_current->this_slave,other_slave->dev->name)!=0 && (!congest_node || congest_current->congestion_counter > congest_node->congestion_counter)) + congest_node = congest_current; + } + if(congest_node) + { + bond_for_each_slave_rcu(bond,slave,i) + if(strcmp(congest_node->this_slave,slave->dev->name)==0) + break; + } + else + slave = other_slave; + } + + battable_ptr->port_id = 0; + } + + //Delete old failures + if(!battable_ptr->port_id || (jiffies - battable_ptr->jiffiestamp) / HZ > 60) + { + //Special case delete from head + if(!backptr) + { + if(battable_ptr->next_batnode) + { + struct batnode* tofree = battable_ptr->next_batnode; + memcpy(battable_ptr,battable_ptr->next_batnode,sizeof(struct batnode)); + kfree(tofree); + } + else + { + battable_ptr->jiffiestamp = 0; + break; + } + + continue; + } + else + { + backptr->next_batnode = battable_ptr->next_batnode; + kfree(battable_ptr); + battable_ptr = backptr; + } + } + + //Go to next in sequence + if(battable_ptr->next_batnode) + { + backptr = battable_ptr; + battable_ptr = battable_ptr->next_batnode; + } + else + { + struct batnode* empty = (struct batnode*)(kmalloc(sizeof(struct batnode),GFP_ATOMIC)); + empty->jiffiestamp = 0; + empty->next_batnode = NULL; + battable_ptr->next_batnode = empty; + backptr = battable_ptr; //We currently won't use backptr again, but just in case. + battable_ptr = empty; + } + + //printk(KERN_DEBUG "Old failures natural end: battable_ptr: 0x%zx", battable_ptr); + } + + //printk(KERN_DEBUG "xmit_batman Checkpoint 8\n"); + + //Create congestion packet + strlcpy(battable_ptr->slvname,slave->dev->name,IFNAMSIZ); + battable_ptr->jiffiestamp = jiffies; + if(!battable_ptr->jiffiestamp) + battable_ptr->jiffiestamp++; + battable_ptr->tcp_seq_num = seqnum; + battable_ptr->port_id = port_id; + + //printk(KERN_DEBUG "xmit_batman Checkpoint 9\n"); + + if(jiffies%100==0) + { + struct slave_congestion_node* cur_slave = slavelist_head_ptr; + while(cur_slave) + { + printk("slave 0x%zx: %s has congestion %d\n", + cur_slave,cur_slave->this_slave,cur_slave->congestion_counter); + cur_slave = cur_slave->next_slave; + } + } + + //printk(KERN_DEBUG "xmit_batman Checkpoint 10\n"); + } + + end: + //printk(KERN_DEBUG "xmit_batman Checkpoint 11"); + spin_unlock_irqrestore(&batlock,flags); + + //printk(KERN_DEBUG "xmit_batman Checkpoint 12"); + if(slave /*&& bond_slave_is_up(slave)*/) + bond_dev_queue_xmit(bond, skb, slave->dev); + else + bond_xmit_slave_id(bond, skb, 0); + } + + return NETDEV_TX_OK; +} + /** * bond_xmit_slave_id - transmit skb through slave with slave_id * @bond: bonding device that is transmitting @@ -3896,6 +4242,8 @@ static netdev_tx_t __bond_start_xmit(str switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return bond_xmit_roundrobin(skb, dev); + case BOND_MODE_BATMAN: + return bond_xmit_batman(skb,dev); case BOND_MODE_ACTIVEBACKUP: return bond_xmit_activebackup(skb, dev); case BOND_MODE_8023AD: @@ -4020,6 +4368,9 @@ static void bond_destructor(struct net_d if (bond->wq) destroy_workqueue(bond->wq); free_netdev(bond_dev); + + if(bond_mode==BOND_MODE_BATMAN) + kthread_stop(bond->battimer_task); } void bond_setup(struct net_device *bond_dev) @@ -4068,6 +4419,12 @@ void bond_setup(struct net_device *bond_ bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM); bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; + + if(bond_mode == BOND_MODE_BATMAN) + { + printk(KERN_INFO "%s: I'm Batman.\n",bond_dev->name); + bond->battimer_task = kthread_run(batman_timer_invoke,bond,"%s_batman",bond_dev->name); + } } /* Destroy a bonding device. @@ -4615,6 +4972,9 @@ err_link: static void __exit bonding_exit(void) { + int i; + struct slave_congestion_node* next_slave; + unregister_netdevice_notifier(&bond_netdev_notifier); bond_destroy_debugfs(); @@ -4622,6 +4982,27 @@ static void __exit bonding_exit(void) bond_netlink_fini(); unregister_pernet_subsys(&bond_net_ops); + //Batman doesn't leak memory + for(i=0; i<BATTABLE_SIZE; i++) + { + struct batnode* next_batnode = battable[i].next_batnode; + while(next_batnode) + { + struct batnode* next_next_batnode = next_batnode->next_batnode; + kfree(next_batnode); + next_batnode = next_next_batnode; + } + } + memset(battable,0,sizeof(struct batnode)*BATTABLE_SIZE); + + next_slave = slavelist_head_ptr; + while(next_slave) + { + struct slave_congestion_node* next_next_slave = next_slave->next_slave; + kfree(next_slave); + next_slave = next_next_slave; + } + #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ WARN_ON(atomic_read(&netpoll_block_tx)); --- tree_a/drivers/net/bonding/bond_options.c 2015-05-06 16:04:23.000000000 -0400 +++ tree_b/drivers/net/bonding/bond_options.c 2015-05-08 23:33:24.850987473 -0400 @@ -80,6 +80,7 @@ static const struct bond_opt_value bond_ { "802.3ad", BOND_MODE_8023AD, 0}, { "balance-tlb", BOND_MODE_TLB, 0}, { "balance-alb", BOND_MODE_ALB, 0}, + { "batman", BOND_MODE_BATMAN, 0}, { NULL, -1, 0}, };
Hello, I've written a new mode for the kernel bonding driver. It's similar to the round-robin mode, but it keeps statistics on TCP resends so as to favor slave devices with more bandwidth when choosing where to send packets. I've tested it on two laptops using WiFi/Ethernet and it seems to work okay, but it should be considered experimental. Still, it should be safe to apply this patch as it doesn't do anything unless you specify "mode=batman" on the driver command line. Signed-off-by: Patrick Simmons <linuxrocks123@netscape.net>