
[v4,kernel,version,3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP

Message ID 3900291.2831326843194846.JavaMail.root@5-MeO-DMT.ynet.sk
State Rejected, archived
Delegated to: David Miller

Commit Message

Štefan Gula Jan. 17, 2012, 11:33 p.m. UTC
From: Stefan Gula <steweg@gmail.com>

This patch is an extension of the current Ethernet-over-GRE
implementation which allows the user to create a virtual bridge (multipoint
VPN) and forward traffic based on the Ethernet MAC address information
inside it. It simulates the bridge learning mechanism, but instead of
learning the port ID from which a given MAC address comes, it learns the
IP address of the peer that encapsulated the given packet. Multicast,
broadcast and unknown-unicast traffic is sent over the network as
multicast-encapsulated GRE packets, so one Ethernet multipoint GRE tunnel
can be represented as a single virtual switch at the logical level and
also as a single multicast IPv4 address at the network level.

Signed-off-by: Stefan Gula <steweg@gmail.com>

---

The code was merged with Eric Dumazet's proposal (all of it except the reordering of orig_source, as that needs to hold the previous value), tested, and fixed with additional lines in the ipgre_tap_netdev_ops struct; the orig_source line was moved before pskb_may_pull().
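In outline, the patch gives each multipoint gretap tunnel a small bridge-like forwarding database: the receive path learns a mapping from inner source MAC to the outer (peer) IPv4 source address, and the transmit path looks up the inner destination MAC to pick a unicast peer, falling back to the configured multicast address. A condensed sketch of the two paths (simplified from the diff below; update_or_create_entry() and lookup_raddr() are stand-in names for the patch's ipgre_tap_bridge_* helpers):

	/* receive path: learn inner source MAC -> outer IPv4 source
	 * (outer_saddr is saved before the Ethernet header is pulled,
	 * which is why the patch stores orig_source early)
	 */
	if (ipv4_is_multicast(tunnel->parms.iph.daddr) &&
	    !is_multicast_ether_addr(eth_hdr(skb)->h_source))
		update_or_create_entry(tunnel, eth_hdr(skb)->h_source,
				       outer_saddr);

	/* transmit path: unicast to the learned peer when known,
	 * otherwise fall back to the configured multicast destination
	 * (i.e. flood to all tunnel endpoints)
	 */
	dst = lookup_raddr(tunnel, ((struct ethhdr *)skb->data)->h_dest);
	if (dst == 0)
		dst = tiph->daddr;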


Comments

Štefan Gula Jan. 23, 2012, 1:12 p.m. UTC | #1
2012/1/18 Stefan Gula <steweg@ynet.sk>:
> From: Stefan Gula <steweg@gmail.com>
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <steweg@gmail.com>
>
> ---
>
> code was merged with Eric Dumazet proposal (all except the reordering of orig_source as that needed to be previous value), tested and fixed with additional lines in ipgre_tap_netdev_ops struct, orig_source line was moved before pskb_may_pull
>
> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
> --- linux-3.2.1-orig/include/net/ipip.h 2012-01-12 20:42:45.000000000 +0100
> +++ linux-3.2.1-my/include/net/ipip.h   2012-01-16 11:17:01.000000000 +0100
> @@ -27,6 +27,14 @@ struct ip_tunnel {
>        __u32                   o_seqno;        /* The last output seqno */
>        int                     hlen;           /* Precalculated GRE header length */
>        int                     mlink;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +#define GRETAP_BR_HASH_BITS 8
> +#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
> +       struct hlist_head       hash[GRETAP_BR_HASH_SIZE];
> +       spinlock_t              hash_lock;
> +       unsigned long           ageing_time;
> +       struct timer_list       gc_timer;
> +#endif
>
>        struct ip_tunnel_parm   parms;
>
> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
> --- linux-3.2.1-orig/net/ipv4/Kconfig   2012-01-12 20:42:45.000000000 +0100
> +++ linux-3.2.1-my/net/ipv4/Kconfig     2012-01-16 12:37:00.000000000 +0100
> @@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
>          Network), but can be distributed all over the Internet. If you want
>          to do that, say Y here and to "IP multicast routing" below.
>
> +config NET_IPGRE_BRIDGE
> +       bool "IP: Ethernet over multipoint GRE over IP"
> +       depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
> +       help
> +         Allows you to use multipoint GRE VPN as a virtual switch and interconnect
> +         several L2 endpoints over L3 routed infrastructure. It is useful for
> +         creating multipoint L2 VPNs which can later be used inside bridge
> +         interfaces. If you want to use the GRE multipoint L2 VPN feature, say Y.
> +
>  config IP_MROUTE
>        bool "IP: multicast routing"
>        depends on IP_MULTICAST
> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
> --- linux-3.2.1-orig/net/ipv4/ip_gre.c  2012-01-12 20:42:45.000000000 +0100
> +++ linux-3.2.1-my/net/ipv4/ip_gre.c    2012-01-18 00:33:55.000000000 +0100
> @@ -52,6 +52,11 @@
>  #include <net/ip6_route.h>
>  #endif
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +#include <linux/jhash.h>
> +#include <asm/unaligned.h>
> +#endif
> +
>  /*
>    Problems & solutions
>    --------------------
> @@ -134,6 +139,172 @@ struct ipgre_net {
>        struct net_device *fb_tunnel_dev;
>  };
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +       /*
> +        * This part of the code enables L2 Ethernet switch
> +        * virtualization over an IP routed infrastructure, with
> +        * utilization of multicast-capable endpoints, using Ethernet
> +        * over GRE
> +        *
> +        * Author: Stefan Gula
> +        * Signed-off-by: Stefan Gula <steweg@gmail.com>
> +        */
> +struct ipgre_tap_bridge_entry {
> +       struct hlist_node       hlist;
> +       __be32                  raddr;
> +       unsigned char           addr[ETH_ALEN];
> +       unsigned long           updated;
> +       struct rcu_head         rcu;
> +};
> +
> +static u32 ipgre_salt __read_mostly;
> +
> +static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
> +{
> +       u32 key = get_unaligned((u32 *)(mac + 2));
> +
> +       return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
> +}
> +
> +static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
> +                               const struct ipgre_tap_bridge_entry *entry)
> +{
> +       return time_before_eq(entry->updated + tunnel->ageing_time,
> +                               jiffies);
> +}
> +
> +static inline void ipgre_tap_bridge_delete(struct ipgre_tap_bridge_entry *entry)
> +{
> +       hlist_del_rcu(&entry->hlist);
> +       kfree_rcu(entry, rcu);
> +}
> +
> +static void ipgre_tap_bridge_cleanup(unsigned long _data)
> +{
> +       struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
> +       unsigned long delay = tunnel->ageing_time;
> +       unsigned long next_timer = jiffies + tunnel->ageing_time;
> +       int i;
> +
> +       spin_lock(&tunnel->hash_lock);
> +       for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> +               struct ipgre_tap_bridge_entry *entry;
> +               struct hlist_node *h, *n;
> +
> +               hlist_for_each_entry_safe(entry, h, n,
> +                       &tunnel->hash[i], hlist)
> +               {
> +                       unsigned long this_timer;
> +                       this_timer = entry->updated + delay;
> +                       if (time_before_eq(this_timer, jiffies))
> +                               ipgre_tap_bridge_delete(entry);
> +                       else if (time_before(this_timer, next_timer))
> +                               next_timer = this_timer;
> +               }
> +       }
> +       spin_unlock(&tunnel->hash_lock);
> +       mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer));
> +}
> +
> +static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
> +{
> +       int i;
> +
> +       spin_lock_bh(&tunnel->hash_lock);
> +       for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> +               struct ipgre_tap_bridge_entry *entry;
> +               struct hlist_node *h, *n;
> +
> +               hlist_for_each_entry_safe(entry, h, n,
> +                       &tunnel->hash[i], hlist)
> +               {
> +                       ipgre_tap_bridge_delete(entry);
> +               }
> +       }
> +       spin_unlock_bh(&tunnel->hash_lock);
> +}
> +
> +static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(
> +       struct ip_tunnel *tunnel, const unsigned char *addr)
> +{
> +       struct hlist_node *h;
> +       struct ipgre_tap_bridge_entry *entry;
> +
> +       hlist_for_each_entry_rcu(entry, h,
> +                       &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) {
> +               if (!compare_ether_addr(entry->addr, addr)) {
> +                       if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
> +                               entry)))
> +                               break;
> +                       return entry;
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
> +       struct hlist_head *head,
> +       const unsigned char *addr)
> +{
> +       struct hlist_node *h;
> +       struct ipgre_tap_bridge_entry *entry;
> +
> +       hlist_for_each_entry(entry, h, head, hlist) {
> +               if (!compare_ether_addr(entry->addr, addr))
> +                       return entry;
> +       }
> +       return NULL;
> +}
> +
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu(
> +       struct hlist_head *head,
> +       const unsigned char *addr)
> +{
> +       struct hlist_node *h;
> +       struct ipgre_tap_bridge_entry *entry;
> +
> +       hlist_for_each_entry_rcu(entry, h, head, hlist) {
> +               if (!compare_ether_addr(entry->addr, addr))
> +                       return entry;
> +       }
> +       return NULL;
> +}
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
> +       struct hlist_head *head,
> +       __be32 source,
> +       const unsigned char *addr)
> +{
> +       struct ipgre_tap_bridge_entry *entry;
> +
> +       entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
> +       if (entry) {
> +               memcpy(entry->addr, addr, ETH_ALEN);
> +               entry->raddr = source;
> +               entry->updated = jiffies;
> +               hlist_add_head_rcu(&entry->hlist, head);
> +       }
> +       return entry;
> +}
> +
> +static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
> +       const unsigned char *addr)
> +{
> +       __be32 raddr = 0;
> +       struct ipgre_tap_bridge_entry *entry;
> +
> +       rcu_read_lock();
> +       entry = __ipgre_tap_bridge_get(tunnel, addr);
> +       if (entry)
> +               raddr = entry->raddr;
> +       rcu_read_unlock();
> +
> +       return raddr;
> +}
> +
> +#endif
>  /* Tunnel hash table */
>
>  /*
> @@ -562,6 +733,12 @@ static int ipgre_rcv(struct sk_buff *skb
>        struct ip_tunnel *tunnel;
>        int    offset = 4;
>        __be16 gre_proto;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +       __be32 orig_source;
> +       struct hlist_head *head;
> +       struct ipgre_tap_bridge_entry *entry;
> +       const struct ethhdr *tethhdr;
> +#endif
>
>        if (!pskb_may_pull(skb, 16))
>                goto drop_nolock;
> @@ -654,6 +831,9 @@ static int ipgre_rcv(struct sk_buff *skb
>
>                /* Warning: All skb pointers will be invalidated! */
>                if (tunnel->dev->type == ARPHRD_ETHER) {
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +                       orig_source = iph->saddr;
> +#endif
>                        if (!pskb_may_pull(skb, ETH_HLEN)) {
>                                tunnel->dev->stats.rx_length_errors++;
>                                tunnel->dev->stats.rx_errors++;
> @@ -663,6 +843,32 @@ static int ipgre_rcv(struct sk_buff *skb
>                        iph = ip_hdr(skb);
>                        skb->protocol = eth_type_trans(skb, tunnel->dev);
>                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +                       if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
> +                               tethhdr = eth_hdr(skb);
> +                               if (!is_multicast_ether_addr(
> +                                       tethhdr->h_source)) {
> +                                       head = &tunnel->hash[
> +                                               ipgre_tap_bridge_hash(
> +                                                       tethhdr->h_source)];
> +                                       entry = ipgre_tap_bridge_find_rcu(head,
> +                                               tethhdr->h_source);
> +                                       if (likely(entry)) {
> +                                               entry->raddr = orig_source;
> +                                               entry->updated = jiffies;
> +                                       } else {
> +                                         spin_lock(&tunnel->hash_lock);
> +                                         if (!ipgre_tap_bridge_find(head,
> +                                               tethhdr->h_source))
> +                                               ipgre_tap_bridge_create(
> +                                                       head,
> +                                                       orig_source,
> +                                                       tethhdr->h_source);
> +                                         spin_unlock(&tunnel->hash_lock);
> +                                       }
> +                               }
> +                       }
> +#endif
>                }
>
>                tstats = this_cpu_ptr(tunnel->dev->tstats);
> @@ -702,7 +908,7 @@ static netdev_tx_t ipgre_tunnel_xmit(str
>        struct iphdr  *iph;                     /* Our new IP header */
>        unsigned int max_headroom;              /* The extra header space needed */
>        int    gre_hlen;
> -       __be32 dst;
> +       __be32 dst = 0;
>        int    mtu;
>
>        if (dev->type == ARPHRD_ETHER)
> @@ -716,7 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str
>                tiph = &tunnel->parms.iph;
>        }
>
> -       if ((dst = tiph->daddr) == 0) {
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +       if ((dev->type == ARPHRD_ETHER) &&
> +               ipv4_is_multicast(tunnel->parms.iph.daddr))
> +               dst = ipgre_tap_bridge_get_raddr(tunnel,
> +                       ((struct ethhdr *)skb->data)->h_dest);
> +#endif
> +       if (dst == 0)
> +               dst = tiph->daddr;
> +       if (dst == 0) {
>                /* NBMA tunnel */
>
>                if (skb_dst(skb) == NULL) {
> @@ -1209,6 +1423,16 @@ static int ipgre_open(struct net_device
>                        return -EADDRNOTAVAIL;
>                t->mlink = dev->ifindex;
>                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +               if (t->dev->type == ARPHRD_ETHER) {
> +                       INIT_HLIST_HEAD(t->hash);
> +                       spin_lock_init(&t->hash_lock);
> +                       t->ageing_time = 300 * HZ;
> +                       setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
> +                               (unsigned long) t);
> +                       mod_timer(&t->gc_timer, jiffies + t->ageing_time);
> +               }
> +#endif
>        }
>        return 0;
>  }
> @@ -1219,6 +1443,12 @@ static int ipgre_close(struct net_device
>
>        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
>                struct in_device *in_dev;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +               if (t->dev->type == ARPHRD_ETHER) {
> +                       ipgre_tap_bridge_flush(t);
> +                       del_timer_sync(&t->gc_timer);
> +               }
> +#endif
>                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
>                if (in_dev)
>                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
> @@ -1488,6 +1718,10 @@ static int ipgre_tap_init(struct net_dev
>  static const struct net_device_ops ipgre_tap_netdev_ops = {
>        .ndo_init               = ipgre_tap_init,
>        .ndo_uninit             = ipgre_tunnel_uninit,
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +       .ndo_open               = ipgre_open,
> +       .ndo_stop               = ipgre_close,
> +#endif
>        .ndo_start_xmit         = ipgre_tunnel_xmit,
>        .ndo_set_mac_address    = eth_mac_addr,
>        .ndo_validate_addr      = eth_validate_addr,
> @@ -1705,6 +1939,9 @@ static int __init ipgre_init(void)
>
>        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +       get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
> +#endif
>        err = register_pernet_device(&ipgre_net_ops);
>        if (err < 0)
>                return err;
Is there anything else needed from my side to get this code into the
kernel, or should I just wait for the maintainers to review it?
Eric Dumazet Jan. 23, 2012, 2:13 p.m. UTC | #2
On Monday 23 January 2012 at 14:12 +0100, Štefan Gula wrote:

> is there anything else needed from my side to get this code into the
> kernel or should I only wait for the maintainers to check it?

net-next is not open yet; you will have to wait for the right time frame.

Don't copy/paste a big message only to add one sentence at its end; it's
really a waste of time for many of us.



David Miller Jan. 23, 2012, 6:43 p.m. UTC | #3
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 23 Jan 2012 15:13:19 +0100

> On Monday 23 January 2012 at 14:12 +0100, Štefan Gula wrote:
> 
>> is there anything else needed from my side to get this code into the
>> kernel or should I only wait for the maintainers to check it?
> 
> net-next is/was not yet opened, you shall wait for the good time frame.
> 
> Dont copy/paste a big message only to add one sentence at its end, its
> really a waste of time for many of us.

Also you were told of other ways to achieve the things you want to do,
and I haven't seen any reasonable response that supports still adding
this new code.
Štefan Gula Jan. 23, 2012, 10:23 p.m. UTC | #4
2012/1/23 David Miller <davem@davemloft.net>:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 23 Jan 2012 15:13:19 +0100
>
>> On Monday 23 January 2012 at 14:12 +0100, Štefan Gula wrote:
>>
>>> is there anything else needed from my side to get this code into the
>>> kernel or should I only wait for the maintainers to check it?
>>
>> net-next is/was not yet opened, you shall wait for the good time frame.
>>
>> Dont copy/paste a big message only to add one sentence at its end, its
>> really a waste of time for many of us.
>
> Also you were told of other ways to achieve the things you want to do,
> and I haven't seen any reasonable response that supports still adding
> this new code.
I am sorry about the mistakes I made (I am still new to the kernel
submission process). I believe I have also explained, to everybody who
says something similar, that it is not entirely true and probably never
will be: none of the current software allows you to create an L2 Ethernet
multipoint VPN in such a simple way, as almost all of it requires running
a special kind of software in user space to provide the control plane for
such a feature to work. That software must be able to run on various
platforms/architectures, which sometimes makes it problematic. My
solution doesn't need this at all and extends the capability of gretap
interfaces in a proper manner, aligned with the other forwarding engines
of gretap interfaces. I already have at least one successful test done in
an environment other than my own, so I assume some people like this
approach better than other solutions. For them, and also for myself, I
believe it should be added.
Joseph Glanville Jan. 25, 2012, 3:48 a.m. UTC | #5
Hi,

I have conducted testing on this patch series and have found it to be stable.
It applies cleanly and doesn't introduce any noticeable performance
impact over standard gretap interfaces.

The reason this patch is useful is that it stands to be the only
true multipoint L2 VPN with a kernel-space forwarding plane.
Almost all other VPN solutions use the TAP/TUN module, resulting in
barely 300-400 Mbit/s of performance even on powerful cores.
Many of the solutions implemented in userspace still don't implement
MAC learning or any method of tunneling traffic directly to endpoints
rather than through some central server.
I measured this patch (and gretap in general) peaking at 4.7 Gbit/s on
my test hardware (reasonable i5 cores).
The iperf benchmark results are at the end of this email.

The merits of the solution are as follows:
- No userspace utilities required (other than ip)
- Stateless (GRE) tunnels don't cause issues like TCP-in-TCP and are
well supported in hardware (unlike custom UDP encapsulation)
- Forwarding table automatically updated via multicast learning
- If included, the Linux kernel is fully capable of being a multisite
Ethernet bridge with learning and unicast tunneling.

In contrast, regarding other software being able to implement this solution:
- Open vSwitch currently doesn't have NVGRE or VXLAN support (I'm
working on getting VXLAN support stable in the 1.4 branch, however)
    - as a subpoint, you can implement this using an OpenFlow
controller and OVS GRE interfaces, but that is well outside the effort
scope this patch is in
- Userspace solutions like OpenVPN that use the TUN interface are very
inefficient for high-speed tunneling and again are hard to
configure in multipoint mode without resulting in a star topology (very
inefficient for high-bandwidth use)
- Other kernel-space tunneling isn't capable of forming a loop-free
arbitrary topology without STP or another protocol to ensure an acyclic
forwarding graph

Having this in the kernel eliminates the need for much more
complicated userspace utilities, and the ip command makes for an
effective user interface.

Lastly, here are the promised benchmarks; I can post test rig stats if
anyone is interested.

------------------------------------------------------------
Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)
------------------------------------------------------------
[  4] local 192.168.1.2 port 5001 connected with 192.168.1.1 port 59819
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  5.50 GBytes  4.72 Gbits/sec

Kind regards,
Joseph.

2012/1/24 Štefan Gula <steweg@ynet.sk>:
> 2012/1/23 David Miller <davem@davemloft.net>:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Mon, 23 Jan 2012 15:13:19 +0100
>>
>>> On Monday 23 January 2012 at 14:12 +0100, Štefan Gula wrote:
>>>
>>>> is there anything else needed from my side to get this code into the
>>>> kernel or should I only wait for the maintainers to check it?
>>>
>>> net-next is/was not yet opened, you shall wait for the good time frame.
>>>
>>> Dont copy/paste a big message only to add one sentence at its end, its
>>> really a waste of time for many of us.
>>
>> Also you were told of other ways to achieve the things you want to do,
>> and I haven't seen any reasonable response that supports still adding
>> this new code.
> I am sorry about the mistakes I did (still new to kernel submitting
> processes). I believe that I also explain to everybody who says
> something similar that it is not entirely true and probably never will
> be as none of the current SW allows you to create L2 Ethernet
> Multipoint VPN solution in such simple way as almost all requires to
> run special kind of software in user-space to provide control-plane
> for such feature to work. This software must be able to run on various
> platforms/architectures which makes it sometimes problematic. My
> solution doesn't need this at all and extends the capability of gretap
> interfaces in proper manner to be aligned with other forwarding
> engines of gretap interfaces. I already have at least one successful
> test done in other environment other than my own. So I assume some
> people like this approach better than other solutions. For them and
> also for myself, I believe it should be added.
David Miller Jan. 25, 2012, 4:02 a.m. UTC | #6
From: Joseph Glanville <joseph.glanville@orionvm.com.au>
Date: Wed, 25 Jan 2012 14:48:37 +1100

> The reason why this patch is useful is that it stands to be the only
> true mulitpoint L2 VPN with a kernel space forwarding plane.

So what you're telling me is that I added this huge openvswitch
thing essentially for nothing?
Eric Dumazet Jan. 25, 2012, 5:53 a.m. UTC | #7
On Wednesday 18 January 2012 at 00:33 +0100, Stefan Gula wrote:
> From: Stefan Gula <steweg@gmail.com>
> 
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
> 
> Signed-off-by: Stefan Gula <steweg@gmail.com>
> 
> ---
> 

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

I guess the remaining performance point is to get a refcount-less
ip_route_output_gre(), aka ip_route_output_gre_noref() :)



Jesse Gross Jan. 25, 2012, 7:11 a.m. UTC | #8
On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote:
> From: Joseph Glanville <joseph.glanville@orionvm.com.au>
> Date: Wed, 25 Jan 2012 14:48:37 +1100
>
>> The reason why this patch is useful is that it stands to be the only
>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>
> So what you're telling me is that I added this huge openvswitch
> thing essentially for nothing?

I think it's actually the opposite - Open vSwitch can be used to
implement this type of thing as well as for many other use cases.  On
the other hand, even when implementing a multipoint L2 solution it can
be useful to have additional levels of control but you can't do that
with this patch because it essentially statically glues together
tunneling and bridging.
Štefan Gula Jan. 25, 2012, 7:38 a.m. UTC | #9
2012/1/25 Jesse Gross <jesse@nicira.com>:
> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote:
>> From: Joseph Glanville <joseph.glanville@orionvm.com.au>
>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>
>>> The reason why this patch is useful is that it stands to be the only
>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>
>> So what you're telling me is that I added this huge openvswitch
>> thing essentially for nothing?
>
> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases.  On
> the other hand, even when implementing a multipoint L2 solution it can
> be useful to have additional levels of control but you can't do that
> with this patch because it essentially statically glues together
> tunneling and bridging.
Yes, those methods are glued together. If you are speaking about
additional levels of control, what kind of control is missing?

Compared to a standard bridge, it is missing:
- STP code, which is not relevant here, as the topology inside the
gretap bridge never forms loops; from the STP point of view it
represents one shared link rather than a box with multiple links. On
the other hand, STP can be tunneled inside the tunnel by putting the
gretap interface into a bridge, e.g. "brctl addif br0 gretap0".
- IGMP/MLD snooping. IGMP/MLD snooping are useful features, but due to
the encapsulation rules only one optimization can be done, and that is
when ONLY ONE gretap-enabled node needs to join a given multicast group
inside the gretap and one node has the source behind it. In that case
those frames can be forwarded to only that gretap node. With two or
more, the encapsulation process will use multicast as the underlying
technology, so every gretap node will receive the frames regardless of
IGMP/MLD state. On the other hand, such multicast optimizations are
missing from the whole GRE tunnel code (PIM/MLD/IGMP snooping, using
something like Cisco Multicast Domain Trees/MDT...), so if somebody
wants to optimize that, feel free, but don't blame this patch for
missing it.
- Did I miss something?
Eric Dumazet Jan. 25, 2012, 8:37 a.m. UTC | #10
On Tuesday 24 January 2012 at 23:11 -0800, Jesse Gross wrote:

> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases.  On
> the other hand, even when implementing a multipoint L2 solution it can
> be useful to have additional levels of control but you can't do that
> with this patch because it essentially statically glues together
> tunneling and bridging.

Unless you can provide a working solution in a very short time, this
patch is a pragmatic one.

The code is not perfect and could be improved (for example, using a helper
function to keep ipgre_rcv() shorter and reduce the indentation level).

Stefan, could you move this code out of ipgre_rcv()?


+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                       if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+                               tethhdr = eth_hdr(skb);
+                               if (!is_multicast_ether_addr(
+                                       tethhdr->h_source)) {
+                                       head = &tunnel->hash[
+                                               ipgre_tap_bridge_hash(
+                                                       tethhdr->h_source)];
+                                       entry = ipgre_tap_bridge_find_rcu(head,
+                                               tethhdr->h_source);
+                                       if (likely(entry)) {
+                                               entry->raddr = orig_source;
+                                               entry->updated = jiffies;
+                                       } else {
+                                         spin_lock(&tunnel->hash_lock);
+                                         if (!ipgre_tap_bridge_find(head,
+                                               tethhdr->h_source))
+                                               ipgre_tap_bridge_create(
+                                                       head,
+                                                       orig_source,
+                                                       tethhdr->h_source);
+                                         spin_unlock(&tunnel->hash_lock);
+                                       }
+                               }
+                       }
+#endif
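For illustration, the block above could be factored into a helper along these lines (a sketch only; the name ipgre_tap_bridge_learn is hypothetical, and the call site in ipgre_rcv() would reduce to a single call):

	static void ipgre_tap_bridge_learn(struct ip_tunnel *tunnel,
					   const struct ethhdr *tethhdr,
					   __be32 orig_source)
	{
		struct hlist_head *head;
		struct ipgre_tap_bridge_entry *entry;

		/* only learn on multipoint (multicast-peer) gretap tunnels */
		if (!ipv4_is_multicast(tunnel->parms.iph.daddr))
			return;
		/* never learn multicast source MAC addresses */
		if (is_multicast_ether_addr(tethhdr->h_source))
			return;

		head = &tunnel->hash[ipgre_tap_bridge_hash(tethhdr->h_source)];
		entry = ipgre_tap_bridge_find_rcu(head, tethhdr->h_source);
		if (likely(entry)) {
			/* fast path: refresh the entry without taking the lock */
			entry->raddr = orig_source;
			entry->updated = jiffies;
		} else {
			spin_lock(&tunnel->hash_lock);
			/* recheck under the lock to avoid duplicate entries */
			if (!ipgre_tap_bridge_find(head, tethhdr->h_source))
				ipgre_tap_bridge_create(head, orig_source,
							tethhdr->h_source);
			spin_unlock(&tunnel->hash_lock);
		}
	}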



Štefan Gula Jan. 25, 2012, 9:17 a.m. UTC | #11
2012/1/25 Eric Dumazet <eric.dumazet@gmail.com>:
> On Tuesday 24 January 2012 at 23:11 -0800, Jesse Gross wrote:
>
>> I think it's actually the opposite - Open vSwitch can be used to
>> implement this type of thing as well as for many other use cases.  On
>> the other hand, even when implementing a multipoint L2 solution it can
>> be useful to have additional levels of control but you can't do that
>> with this patch because it essentially statically glues together
>> tunneling and bridging.
>
> Unless you can provide a working solution in a very short time, this
> patch is a pragmatic one.
>
> Code is not perfect and could be improved (for example using a helper
> function to keep ipgre_rcv() shorter and reduce indentation level)
>
> Stefan, could you move this code out of ipgre_rcv() ?
>
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +                       if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
> +                               tethhdr = eth_hdr(skb);
> +                               if (!is_multicast_ether_addr(
> +                                       tethhdr->h_source)) {
> +                                       head = &tunnel->hash[
> +                                               ipgre_tap_bridge_hash(
> +                                                       tethhdr->h_source)];
> +                                       entry = ipgre_tap_bridge_find_rcu(head,
> +                                               tethhdr->h_source);
> +                                       if (likely(entry)) {
> +                                               entry->raddr = orig_source;
> +                                               entry->updated = jiffies;
> +                                       } else {
> +                                         spin_lock(&tunnel->hash_lock);
> +                                         if (!ipgre_tap_bridge_find(head,
> +                                               tethhdr->h_source))
> +                                               ipgre_tap_bridge_create(
> +                                                       head,
> +                                                       orig_source,
> +                                                       tethhdr->h_source);
> +                                         spin_unlock(&tunnel->hash_lock);
> +                                       }
> +                               }
> +                       }
> +#endif
>
>
>
Ok
Joseph Glanville Jan. 25, 2012, 3:25 p.m. UTC | #12
Hi,

Sorry David, I think I might have failed to convey my meaning.

What I meant by the only true L2 multipoint VPN is that Open vSwitch
cannot fulfil this on its own at this time. Without NVGRE or VXLAN
support, it is necessary to implement some logic in an OpenFlow
controller to achieve this functionality. However, once NVGRE/VXLAN is
complete, advanced users can set up a VXLAN domain to accomplish this.

Open vSwitch fulfils a different role from this simplistic VPN. The point
of OVS is to be a generic L2 forwarding plane for Software Defined
Networking (SDN) solutions, an effective cornerstone of network
virtualisation.
If anything it's more analogous to Solaris's Crossbow or FreeBSD's
VIMAGE, in that it's more of a holistic solution to the problem,
capable of solving arbitrarily difficult forwarding problems with the
assistance of userspace logic.
As such, given a controller that implemented tunnel endpoint
learning, establishment of tunnels between every endpoint, and a
broadcast tunnel, you could build a solution that is on par with this
patch, if not better.

The two are fundamentally different use cases, however - many people
don't need or couldn't care less about SDN and just want a multipoint
VPN or L3 encapsulation of L2 between bridges; this is an ideal solution
for these simpler use cases.

The primary users of Open vSwitch are likely to be large virtual
environments with complex network topologies and well-defined virtual
networking needs.
For this class of users the Linux bridge module + static tunneling
isn't going to cut it without some very clever control software -
which is why OVS exists.

I think there is still quite some way to go in integrating OVS into the
kernel and making use of all of its advanced features, but I would say
it's far from useless and that in the long run its inclusion will be
very rewarding.

Kind regards,
Joseph.

On 25 January 2012 18:11, Jesse Gross <jesse@nicira.com> wrote:
> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote:
>> From: Joseph Glanville <joseph.glanville@orionvm.com.au>
>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>
>>> The reason why this patch is useful is that it stands to be the only
>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>
>> So what you're telling me is that I added this huge openvswitch
>> thing essentially for nothing?
>
> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases.  On
> the other hand, even when implementing a multipoint L2 solution it can
> be useful to have additional levels of control but you can't do that
> with this patch because it essentially statically glues together
> tunneling and bridging.
stephen hemminger Jan. 25, 2012, 4:57 p.m. UTC | #13
On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
Stefan Gula <steweg@ynet.sk> wrote:

> From: Stefan Gula <steweg@gmail.com>
> 
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
> 
> Signed-off-by: Stefan Gula <steweg@gmail.com>

Will this break normal usage of GRE?
Compile-time options are not acceptable for a standard distribution if
enabling them breaks things for the other case. It is fine to add the
additional data structure elements, but might the code in the receive
path get broken?


Štefan Gula Jan. 25, 2012, 9:01 p.m. UTC | #14
2012/1/25 Stephen Hemminger <shemminger@vyatta.com>:
> On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
> Stefan Gula <steweg@ynet.sk> wrote:
>
>> From: Stefan Gula <steweg@gmail.com>
>>
>> This patch is an extension for current Ethernet over GRE
>> implementation, which allows user to create virtual bridge (multipoint
>> VPN) and forward traffic based on Ethernet MAC address information in
>> it. It simulates the Bridge behavior learning mechanism, but instead
>> of learning port ID from which given MAC address comes, it learns IP
>> address of peer which encapsulated given packet. Multicast, Broadcast
>> and unknown-multicast traffic is send over network as multicast
>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>> represented as one single virtual switch on logical level and be also
>> represented as one multicast IPv4 address on network level.
>>
>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>
> Will this break normal usages of GRE?
> Compile time options are not acceptable for a standard distribution if
> it will break it for the other case. It is fine to add the additional data
> structure elements, but the code in the receive path might get broken?
>
>
No. If you don't compile with the given kernel option, nothing
happens and the original GRE code is left intact.
David Miller Jan. 25, 2012, 9:49 p.m. UTC | #15
From: Štefan Gula <steweg@ynet.sk>
Date: Wed, 25 Jan 2012 22:01:08 +0100

> 2012/1/25 Stephen Hemminger <shemminger@vyatta.com>:
>> On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
>> Stefan Gula <steweg@ynet.sk> wrote:
>>
>>> From: Stefan Gula <steweg@gmail.com>
>>>
>>> This patch is an extension for current Ethernet over GRE
>>> implementation, which allows user to create virtual bridge (multipoint
>>> VPN) and forward traffic based on Ethernet MAC address information in
>>> it. It simulates the Bridge behavior learning mechanism, but instead
>>> of learning port ID from which given MAC address comes, it learns IP
>>> address of peer which encapsulated given packet. Multicast, Broadcast
>>> and unknown-multicast traffic is send over network as multicast
>>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>>> represented as one single virtual switch on logical level and be also
>>> represented as one multicast IPv4 address on network level.
>>>
>>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>>
>> Will this break normal usages of GRE?
>> Compile time options are not acceptable for a standard distribution if
>> it will break it for the other case. It is fine to add the additional data
>> structure elements, but the code in the receive path might get broken?
>>
>>
> No, if you don't decide to compile with given kernel option.. nothing
> happens and original GRE code is leaved intact.

Every distribution is going to want to turn the option on to provide
the feature to their users, so this line of reasoning is not
acceptable.
David Miller Jan. 25, 2012, 9:55 p.m. UTC | #16
From: Jesse Gross <jesse@nicira.com>
Date: Tue, 24 Jan 2012 23:11:06 -0800

> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote:
>> From: Joseph Glanville <joseph.glanville@orionvm.com.au>
>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>
>>> The reason why this patch is useful is that it stands to be the only
>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>
>> So what you're telling me is that I added this huge openvswitch
>> thing essentially for nothing?
> 
> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases.

Then openvswitch is exactly where you should be prototyping and
implementing support for this sort of stuff.

And only if you cannot obtain reasonable performance using openvswitch
should you be even entertaining the notion of a static implementation.

That's the whole premise behind putting openvswitch into the tree, so
that guys like you can play around in userspace without having to make
any kernel changes at all.

I am not applying these patches; the more things you say, the more I am
convinced they are not appropriate.

Štefan Gula Jan. 25, 2012, 10:30 p.m. UTC | #17
2012/1/25 David Miller <davem@davemloft.net>:
> From: Štefan Gula <steweg@ynet.sk>
> Date: Wed, 25 Jan 2012 22:01:08 +0100
>
>> 2012/1/25 Stephen Hemminger <shemminger@vyatta.com>:
>>> On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
>>> Stefan Gula <steweg@ynet.sk> wrote:
>>>
>>>> From: Stefan Gula <steweg@gmail.com>
>>>>
>>>> This patch is an extension for current Ethernet over GRE
>>>> implementation, which allows user to create virtual bridge (multipoint
>>>> VPN) and forward traffic based on Ethernet MAC address information in
>>>> it. It simulates the Bridge behavior learning mechanism, but instead
>>>> of learning port ID from which given MAC address comes, it learns IP
>>>> address of peer which encapsulated given packet. Multicast, Broadcast
>>>> and unknown-multicast traffic is send over network as multicast
>>>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>>>> represented as one single virtual switch on logical level and be also
>>>> represented as one multicast IPv4 address on network level.
>>>>
>>>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>>>
>>> Will this break normal usages of GRE?
>>> Compile time options are not acceptable for a standard distribution if
>>> it will break it for the other case. It is fine to add the additional data
>>> structure elements, but the code in the receive path might get broken?
>>>
>>>
>> No, if you don't decide to compile with given kernel option.. nothing
>> happens and original GRE code is leaved intact.
>
> Every distribution is going to want to turn the option on to provide
> the feature to their users, so this line of reasoning is not
> acceptable.
On the other hand, even if you compile this option in, it modifies
the behavior only when you use this command to create the
interface:
ip link add vpn0 type gretap local 1.2.3.4 remote 239.192.0.1 key 12345 ttl 10
Please notice two major keywords:
 - gretap
 - 239.192.0.1
My patch modifies the code behavior only if you used "gretap" and the
remote IP is a valid multicast IP address. In every other
combination it still uses the plain original GRE code, so enabling
it doesn't break anything else.
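For reference, the guard in the transmit path of the patch above is:

	if ((dev->type == ARPHRD_ETHER) &&
	    ipv4_is_multicast(tunnel->parms.iph.daddr))
		dst = ipgre_tap_bridge_get_raddr(tunnel,
			((struct ethhdr *)skb->data)->h_dest);

so non-Ethernet GRE tunnels and gretap tunnels with a unicast peer never
enter the new code path.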
Štefan Gula Jan. 25, 2012, 10:57 p.m. UTC | #18
2012/1/25 David Miller <davem@davemloft.net>:
> From: Jesse Gross <jesse@nicira.com>
> Date: Tue, 24 Jan 2012 23:11:06 -0800
>
>> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote:
>>> From: Joseph Glanville <joseph.glanville@orionvm.com.au>
>>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>>
>>>> The reason why this patch is useful is that it stands to be the only
>>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>>
>>> So what you're telling me is that I added this huge openvswitch
>>> thing essentially for nothing?
>>
>> I think it's actually the opposite - Open vSwitch can be used to
>> implement this type of thing as well as for many other use cases.
>
> Then openvswitch is exactly where you should be prototyping and
> implementing support for this sort of stuff.
>
> And only if you cannot obtain reasonable performance using openvswitch
> should you be even entertaining the notion of a static implementation.
>
> That's the whole premise behind putting openvswitch into the tree, so
> that guys like you can play around in userspace without having to make
> any kernel changes at all.
>
> I am not applying these patches, the more things you say the more I am
> convinced they are not appropriate.
>
Performance is one of the most critical reasons why I chose to build a
kernel patch in the first place instead of some user-space app; had I
taken that approach, I would probably have ended up with a patch for the
OpenVPN project instead. I am not saying that openvswitch is not a good
place for prototyping, but I believe this patch is beyond that border,
as it has run successfully in an environment with more than 98
Linux-based APs, used by 4K+ users, with no issues for more than two
years. The performance results from Joseph Glanville add further value.
So I still don't get the point: why can't my patch and openvswitch
coexist in the kernel, letting the user/admin choose the correct
solution for them?
Dave Taht Jan. 25, 2012, 11:18 p.m. UTC | #19
2012/1/25 Štefan Gula <steweg@ynet.sk>:
> 2012/1/25 David Miller <davem@davemloft.net>:
>> From: Jesse Gross <jesse@nicira.com>
>> Date: Tue, 24 Jan 2012 23:11:06 -0800
>>
>>> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote:
>>>> From: Joseph Glanville <joseph.glanville@orionvm.com.au>
>>>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>>>
>>>>> The reason why this patch is useful is that it stands to be the only
>>>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>>>
>>>> So what you're telling me is that I added this huge openvswitch
>>>> thing essentially for nothing?
>>>
>>> I think it's actually the opposite - Open vSwitch can be used to
>>> implement this type of thing as well as for many other use cases.
>>
>> Then openvswitch is exactly where you should be prototyping and
>> implementing support for this sort of stuff.
>>
>> And only if you cannot obtain reasonable performance using openvswitch
>> should you be even entertaining the notion of a static implementation.
>>
>> That's the whole premise behind putting openvswitch into the tree, so
>> that guys like you can play around in userspace without having to make
>> any kernel changes at all.
>>
>> I am not applying these patches, the more things you say the more I am
>> convinced they are not appropriate.
>>
> The performance is one of the most critical thing why I have chosen to
> build kernel patch in the first place instead of some user-space app.

I am very interested in testing this patch for the kinds of environments
I care about (embedded, CPE, etc.), but I won't be able to get around to
it for a week or so.

I find the overall simplicity of this approach, versus the complexity of
the alternatives, appealing, and the performance numbers as well.
David Miller Jan. 26, 2012, 1:37 a.m. UTC | #20
From: Štefan Gula <steweg@ynet.sk>
Date: Wed, 25 Jan 2012 23:57:18 +0100

> The performance is one of the most critical thing why I have chosen to
> build kernel patch in the first place instead of some user-space app.
> If I used this approach, I would probably end up with patch for
> OpenVPN project instead in that time. I am not telling that
> openvswitch is not a good place for prototyping, but I believe that
> this patch is beyond that border as it successfully run in environment
> with more 98 linux-based APs, used for 4K+ users, with no issue for
> more than 2 years. The performance results from Joseph Glanville even
> adds value to it. So I still don't get the point, why my patch and
> openvswitch cannot coexists in the kernel together and let user/admin
> to choose to correct solution for him/her.

You don't even know if openvswitch could provide acceptable levels
of performance, because you haven't even tried.

I'm not applying your patch.
Štefan Gula Jan. 26, 2012, 10:57 a.m. UTC | #21
2012/1/26 David Miller <davem@davemloft.net>:
> From: Štefan Gula <steweg@ynet.sk>
> Date: Wed, 25 Jan 2012 23:57:18 +0100
>
>> The performance is one of the most critical thing why I have chosen to
>> build kernel patch in the first place instead of some user-space app.
>> If I used this approach, I would probably end up with patch for
>> OpenVPN project instead in that time. I am not telling that
>> openvswitch is not a good place for prototyping, but I believe that
>> this patch is beyond that border as it successfully run in environment
>> with more 98 linux-based APs, used for 4K+ users, with no issue for
>> more than 2 years. The performance results from Joseph Glanville even
>> adds value to it. So I still don't get the point, why my patch and
>> openvswitch cannot coexists in the kernel together and let user/admin
>> to choose to correct solution for him/her.
>
> You don't even know if openvswitch could provide acceptable levels
> of performance, because you haven't even tried.
>
> I'm not applying your patch.
The performance of any user-space application is lower than that of
something running purely in kernel space. So I still don't see any
valid reason why they simply cannot coexist, as my patch doesn't break
any existing functionality at all.
David Miller Jan. 26, 2012, 6:30 p.m. UTC | #22
From: Štefan Gula <steweg@ynet.sk>
Date: Thu, 26 Jan 2012 11:57:30 +0100

> 2012/1/26 David Miller <davem@davemloft.net>:
>> From: Štefan Gula <steweg@ynet.sk>
>> Date: Wed, 25 Jan 2012 23:57:18 +0100
>>
>>> The performance is one of the most critical thing why I have chosen to
>>> build kernel patch in the first place instead of some user-space app.
>>> If I used this approach, I would probably end up with patch for
>>> OpenVPN project instead in that time. I am not telling that
>>> openvswitch is not a good place for prototyping, but I believe that
>>> this patch is beyond that border as it successfully run in environment
>>> with more 98 linux-based APs, used for 4K+ users, with no issue for
>>> more than 2 years. The performance results from Joseph Glanville even
>>> adds value to it. So I still don't get the point, why my patch and
>>> openvswitch cannot coexists in the kernel together and let user/admin
>>> to choose to correct solution for him/her.
>>
>> You don't even know if openvswitch could provide acceptable levels
>> of performance, because you haven't even tried.
>>
>> I'm not applying your patch.
> Performance of any user-space application is lower than performance of
> something running purely inside the kernel-space only. So still don't
> see any valid reason, why it simply cannot coexists as it doesn't
> breaks any existing functionality at all?

The only userspace component is setting up the rules; the actual
packet processing occurs in the openvswitch kernel code.

Are you really unable to understand this?
Joseph Glanville Jan. 26, 2012, 10:24 p.m. UTC | #23
David is correct: the forwarding speed of Open vSwitch is at parity
with the Linux bridging module, and its tunneling speed is actually
slightly faster than the in-kernel GRE implementation. I have tested
this across a variety of configurations.
Joseph.

Eric Dumazet Jan. 27, 2012, 5:59 a.m. UTC | #24
On Friday 27 January 2012 at 09:24 +1100, Joseph Glanville wrote:
> David is correct: the forwarding speed of Open vSwitch is at parity
> with the Linux bridging module, and its tunneling speed is actually
> slightly faster than the in-kernel GRE implementation. I have tested
> this across a variety of configurations.

Thanks for this input! When exactly was this tested, and do you have
any "perf tool" reports to share?

GRE has been lockless for about a year now (modulo how the tunnel is
set up, as discovered recently).

Also please note that Open vSwitch is said to take the fast path only
on established flows.



Joseph Glanville Jan. 27, 2012, 10:54 a.m. UTC | #25
Hi Eric,

I have been testing on the 3.2.X series of kernels, and on 3.0.X and
3.1.X before that, but most of my results are for 3.2.1.
My test setup includes pairs of machines (two or more of each) based
on the following chips:

AMD 6140
Intel X5650
Intel Core i5 2400 (desktop)

I conducted throughput and PPS benchmarks using iperf and pktgen.
All tests were performed over a real IP network (IP over InfiniBand)
that can operate at a much greater speed than the software is capable
of forwarding at.
If the list is interested I will re-run and post all of my benchmarks,
or at least send them to you personally.
Unfortunately I didn't store the benchmarks (much to my own stupidity)
and I have since repurposed the hardware for other things.
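
For concreteness, here is a minimal userspace sketch of the kind of
small-packet rate probe that pktgen implements far more efficiently
in the kernel; this is an illustration rather than what I actually
ran, and the target address and port are placeholders:

/*
 * udp_pps.c - minimal userspace small-packet rate probe, in the
 * spirit of (but far slower than) the in-kernel pktgen.
 * Build: cc -O2 -o udp_pps udp_pps.c
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	char payload[18];	/* 18B UDP payload -> 64-byte frames */
	struct timespec start, now;
	long long sent = 0;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);		/* "discard" port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr); /* placeholder */
	memset(payload, 0, sizeof(payload));

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		if (sendto(fd, payload, sizeof(payload), 0,
			   (struct sockaddr *)&dst, sizeof(dst)) > 0)
			sent++;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while (now.tv_sec - start.tv_sec < 10);	/* run ~10s */

	printf("%lld packets in ~10s (~%lld pps)\n", sent, sent / 10);
	close(fd);
	return 0;
}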

Yes, I was going to mention this in my last email but deleted it for
lack of relevance at the time.
Under pathological load OVS suffers in benchmarks: continual
establishment of new flows is really not good for it, though I haven't
observed this personally.
It does worry me, however, that this could be used as a viable DoS. I
don't really know what could be done to mitigate it.
That aside, its GRE implementation using loopback (internal)
interfaces performs very well and is, as I said, on par with Linux
Bridge + GRE.

Are there any specific things you would like to see?
I'm not a networking guru and would welcome any assistance you could
provide on improving my test methodology.
Someone suggested netperf to me the other day; I intend to run some
benchmarks with it next week.

Joseph.

Jesse Gross Jan. 27, 2012, 9:50 p.m. UTC | #26
On Thu, Jan 26, 2012 at 9:59 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Friday 27 January 2012 at 09:24 +1100, Joseph Glanville wrote:
>> David is correct: the forwarding speed of Open vSwitch is at parity
>> with the Linux bridging module, and its tunneling speed is actually
>> slightly faster than the in-kernel GRE implementation. I have tested
>> this across a variety of configurations.
>
> Thanks for this input! When exactly was this tested, and do you have
> any "perf tool" reports to share?
>
> GRE has been lockless for about a year now (modulo how the tunnel is
> set up, as discovered recently).

The out-of-tree OVS GRE stack (which is separate for practical and
historical reasons, but which I would like to combine with the
in-kernel one in the future) uses a few dirty tricks to help
performance.  I don't know the parameters that Joseph used to test,
but one that he might be benefiting from here is that we cache the
full set of headers to be pushed (GRE/IP/Ethernet) as well as any
routing/switching decisions.  This turns a GRE encapsulation into a
simple memcpy if there is enough headroom.  It's not something that I
would want to propose for more general usage, but I think that GSO/GRO
for GRE would get at least some of that benefit and would also make
things easier when NICs with support for encapsulated offloads start
showing up.
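
To make that concrete, here is a rough sketch of the idea under my own
naming (gre_hdr_cache and push_cached_headers are illustrative, not
the actual OVS symbols; per-packet IP fields such as tot_len, id and
the checksum would still need fixing up afterwards):

#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>

/* Precomputed outer headers for one tunnel/flow: Ethernet + IP + the
 * 4-byte basic GRE header, cached alongside the routing decision. */
struct gre_hdr_cache {
	u8	hdr[ETH_HLEN + sizeof(struct iphdr) + 4];
	int	len;
};

static int push_cached_headers(struct sk_buff *skb,
			       const struct gre_hdr_cache *c)
{
	if (unlikely(skb_headroom(skb) < c->len))
		return -ENOMEM;	/* slow path: expand head, recompute */

	/* fast path: no route lookup, no header construction, just
	 * one copy of the precomputed header block */
	memcpy(skb_push(skb, c->len), c->hdr, c->len);
	return 0;
}
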
Jesse Gross Jan. 27, 2012, 9:55 p.m. UTC | #27
On Fri, Jan 27, 2012 at 2:54 AM, Joseph Glanville
<joseph.glanville@orionvm.com.au> wrote:
> Under pathological load OVS suffers in benchmarks: continual
> establishment of new flows is really not good for it, though I
> haven't observed this personally.
> It does worry me, however, that this could be used as a viable DoS. I
> don't really know what could be done to mitigate it.

OVS allows each input port to have a separate genl socket associated
with it for sending flow misses to userspace. Userspace currently
round-robins across these sockets when handling flow setup requests,
in order to prevent one port from DoSing the others.
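
As a rough illustration of the fairness scheme (this is not the actual
ovs-vswitchd code; handle_one_flow_miss and the port count are
placeholders):

#include <poll.h>

#define N_PORTS 4			/* placeholder port count */

extern void handle_one_flow_miss(int sock);	/* placeholder */

/* One upcall socket per input port; service at most one pending flow
 * miss per ready socket, rotating the starting point so a flooded
 * port cannot starve flow setups for the quiet ones. */
static void service_upcalls(const int sock[N_PORTS])
{
	struct pollfd pfd[N_PORTS];
	static int start;		/* rotation cursor */
	int i;

	for (i = 0; i < N_PORTS; i++) {
		pfd[i].fd = sock[i];
		pfd[i].events = POLLIN;
	}
	if (poll(pfd, N_PORTS, -1) <= 0)
		return;

	for (i = 0; i < N_PORTS; i++) {
		int p = (start + i) % N_PORTS;

		if (pfd[p].revents & POLLIN)
			handle_one_flow_miss(sock[p]);
	}
	start = (start + 1) % N_PORTS;
}
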
diff mbox

Patch

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
--- linux-3.2.1-orig/include/net/ipip.h	2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/include/net/ipip.h	2012-01-16 11:17:01.000000000 +0100
@@ -27,6 +27,14 @@  struct ip_tunnel {
 	__u32			o_seqno;	/* The last output seqno */
 	int			hlen;		/* Precalculated GRE header length */
 	int			mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+	struct hlist_head	hash[GRETAP_BR_HASH_SIZE];
+	spinlock_t		hash_lock;
+	unsigned long		ageing_time;
+	struct timer_list	gc_timer;
+#endif
 
 	struct ip_tunnel_parm	parms;
 
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
--- linux-3.2.1-orig/net/ipv4/Kconfig	2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/Kconfig	2012-01-16 12:37:00.000000000 +0100
@@ -211,6 +211,15 @@  config NET_IPGRE_BROADCAST
 	  Network), but can be distributed all over the Internet. If you want
 	  to do that, say Y here and to "IP multicast routing" below.
 
+config NET_IPGRE_BRIDGE
+	bool "IP: Ethernet over multipoint GRE over IP"
+	depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+	help
+	  Allows you to use a multipoint GRE VPN as a virtual switch and to
+	  interconnect several L2 endpoints over an L3 routed infrastructure.
+	  It is useful for creating multipoint L2 VPNs, which can later be
+	  used inside bridge interfaces. If you want this feature, say Y.
+
 config IP_MROUTE
 	bool "IP: multicast routing"
 	depends on IP_MULTICAST
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
--- linux-3.2.1-orig/net/ipv4/ip_gre.c	2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/ip_gre.c	2012-01-18 00:33:55.000000000 +0100
@@ -52,6 +52,11 @@ 
 #include <net/ip6_route.h>
 #endif
 
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
 /*
    Problems & solutions
    --------------------
@@ -134,6 +139,172 @@  struct ipgre_net {
 	struct net_device *fb_tunnel_dev;
 };
 
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+/*
+ * This part of the code enables L2 Ethernet switch
+ * virtualization over an IP routed infrastructure, using
+ * a multicast-capable endpoint and Ethernet over GRE
+ * encapsulation.
+ *
+ * Author: Stefan Gula
+ * Signed-off-by: Stefan Gula <steweg@gmail.com>
+ */
+struct ipgre_tap_bridge_entry {
+	struct hlist_node	hlist;
+	__be32			raddr;
+	unsigned char		addr[ETH_ALEN];
+	unsigned long		updated;
+	struct rcu_head		rcu;
+};
+
+static u32 ipgre_salt __read_mostly;
+
+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+	u32 key = get_unaligned((u32 *)(mac + 2));
+
+	return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
+}
+
+static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
+				const struct ipgre_tap_bridge_entry *entry)
+{
+	return time_before_eq(entry->updated + tunnel->ageing_time,
+				jiffies);
+}
+
+static inline void ipgre_tap_bridge_delete(struct ipgre_tap_bridge_entry *entry)
+{
+	hlist_del_rcu(&entry->hlist);
+	kfree_rcu(entry, rcu);
+}
+
+static void ipgre_tap_bridge_cleanup(unsigned long _data)
+{
+	struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
+	unsigned long delay = tunnel->ageing_time;
+	unsigned long next_timer = jiffies + tunnel->ageing_time;
+	int i;
+
+	spin_lock(&tunnel->hash_lock);
+	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+		struct ipgre_tap_bridge_entry *entry;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(entry, h, n,
+			&tunnel->hash[i], hlist)
+		{
+			unsigned long this_timer;
+			this_timer = entry->updated + delay;
+			if (time_before_eq(this_timer, jiffies))
+				ipgre_tap_bridge_delete(entry);
+			else if (time_before(this_timer, next_timer))
+				next_timer = this_timer;
+		}
+	}
+	spin_unlock(&tunnel->hash_lock);
+	mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer));
+}
+
+static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
+{
+	int i;
+
+	spin_lock_bh(&tunnel->hash_lock);
+	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+		struct ipgre_tap_bridge_entry *entry;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(entry, h, n,
+			&tunnel->hash[i], hlist)
+		{
+			ipgre_tap_bridge_delete(entry);
+		}
+	}
+	spin_unlock_bh(&tunnel->hash_lock);
+}
+
+static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(
+	struct ip_tunnel *tunnel, const unsigned char *addr)
+{
+	struct hlist_node *h;
+	struct ipgre_tap_bridge_entry *entry;
+
+	hlist_for_each_entry_rcu(entry, h,
+			&tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) {
+		if (!compare_ether_addr(entry->addr, addr)) {
+			if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
+				entry)))
+				break;
+			return entry;
+		}
+	}
+
+	return NULL;
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
+	struct hlist_head *head,
+	const unsigned char *addr)
+{
+	struct hlist_node *h;
+	struct ipgre_tap_bridge_entry *entry;
+
+	hlist_for_each_entry(entry, h, head, hlist) {
+		if (!compare_ether_addr(entry->addr, addr))
+			return entry;
+	}
+	return NULL;
+}
+
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu(
+	struct hlist_head *head,
+	const unsigned char *addr)
+{
+	struct hlist_node *h;
+	struct ipgre_tap_bridge_entry *entry;
+
+	hlist_for_each_entry_rcu(entry, h, head, hlist) {
+		if (!compare_ether_addr(entry->addr, addr))
+			return entry;
+	}
+	return NULL;
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
+	struct hlist_head *head,
+	__be32 source,
+	const unsigned char *addr)
+{
+	struct ipgre_tap_bridge_entry *entry;
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry) {
+		memcpy(entry->addr, addr, ETH_ALEN);
+		entry->raddr = source;
+		entry->updated = jiffies;
+		hlist_add_head_rcu(&entry->hlist, head);
+	}
+	return entry;
+}
+
+static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+	const unsigned char *addr)
+{
+	__be32 raddr = 0;
+	struct ipgre_tap_bridge_entry *entry;
+
+	rcu_read_lock();
+	entry = __ipgre_tap_bridge_get(tunnel, addr);
+	if (entry)
+		raddr = entry->raddr;
+	rcu_read_unlock();
+
+	return raddr;
+}
+
+#endif
 /* Tunnel hash table */
 
 /*
@@ -562,6 +733,12 @@  static int ipgre_rcv(struct sk_buff *skb
 	struct ip_tunnel *tunnel;
 	int    offset = 4;
 	__be16 gre_proto;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	__be32 orig_source;
+	struct hlist_head *head;
+	struct ipgre_tap_bridge_entry *entry;
+	const struct ethhdr *tethhdr;
+#endif
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop_nolock;
@@ -654,6 +831,9 @@  static int ipgre_rcv(struct sk_buff *skb
 
 		/* Warning: All skb pointers will be invalidated! */
 		if (tunnel->dev->type == ARPHRD_ETHER) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+			orig_source = iph->saddr;
+#endif
 			if (!pskb_may_pull(skb, ETH_HLEN)) {
 				tunnel->dev->stats.rx_length_errors++;
 				tunnel->dev->stats.rx_errors++;
@@ -663,6 +843,32 @@  static int ipgre_rcv(struct sk_buff *skb
 			iph = ip_hdr(skb);
 			skb->protocol = eth_type_trans(skb, tunnel->dev);
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+			if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+				tethhdr = eth_hdr(skb);
+				if (!is_multicast_ether_addr(
+					tethhdr->h_source)) {
+					head = &tunnel->hash[
+						ipgre_tap_bridge_hash(
+							tethhdr->h_source)];
+					entry = ipgre_tap_bridge_find_rcu(head,
+						tethhdr->h_source);
+					if (likely(entry)) {
+						entry->raddr = orig_source;
+						entry->updated = jiffies;
+					} else {
+					  spin_lock(&tunnel->hash_lock);
+					  if (!ipgre_tap_bridge_find(head,
+						tethhdr->h_source))
+						ipgre_tap_bridge_create(
+							head,
+							orig_source,
+							tethhdr->h_source);
+					  spin_unlock(&tunnel->hash_lock);
+					}
+				}
+			}
+#endif
 		}
 
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -702,7 +908,7 @@  static netdev_tx_t ipgre_tunnel_xmit(str
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    gre_hlen;
-	__be32 dst;
+	__be32 dst = 0;
 	int    mtu;
 
 	if (dev->type == ARPHRD_ETHER)
@@ -716,7 +922,15 @@  static netdev_tx_t ipgre_tunnel_xmit(str
 		tiph = &tunnel->parms.iph;
 	}
 
-	if ((dst = tiph->daddr) == 0) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	if ((dev->type == ARPHRD_ETHER) &&
+		ipv4_is_multicast(tunnel->parms.iph.daddr))
+		dst = ipgre_tap_bridge_get_raddr(tunnel,
+			((struct ethhdr *)skb->data)->h_dest);
+#endif
+	if (dst == 0)
+		dst = tiph->daddr;
+	if (dst == 0) {
 		/* NBMA tunnel */
 
 		if (skb_dst(skb) == NULL) {
@@ -1209,6 +1423,16 @@  static int ipgre_open(struct net_device 
 			return -EADDRNOTAVAIL;
 		t->mlink = dev->ifindex;
 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+		if (t->dev->type == ARPHRD_ETHER) {
+			INIT_HLIST_HEAD(t->hash);
+			spin_lock_init(&t->hash_lock);
+			t->ageing_time = 300 * HZ;
+			setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+				(unsigned long) t);
+			mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+		}
+#endif
 	}
 	return 0;
 }
@@ -1219,6 +1443,12 @@  static int ipgre_close(struct net_device
 
 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 		struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+		if (t->dev->type == ARPHRD_ETHER) {
+			ipgre_tap_bridge_flush(t);
+			del_timer_sync(&t->gc_timer);
+		}
+#endif
 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
 		if (in_dev)
 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
@@ -1488,6 +1718,10 @@  static int ipgre_tap_init(struct net_dev
 static const struct net_device_ops ipgre_tap_netdev_ops = {
 	.ndo_init		= ipgre_tap_init,
 	.ndo_uninit		= ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	.ndo_open		= ipgre_open,
+	.ndo_stop		= ipgre_close,
+#endif
 	.ndo_start_xmit		= ipgre_tunnel_xmit,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -1705,6 +1939,9 @@  static int __init ipgre_init(void)
 
 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
 
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
 	err = register_pernet_device(&ipgre_net_ops);
 	if (err < 0)
 		return err;