diff mbox

[v3,kernel,version,3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP

Message ID 31531575.2671326838849345.JavaMail.root@5-MeO-DMT.ynet.sk
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Štefan Gula Jan. 17, 2012, 10:20 p.m. UTC
From: Stefan Gula <steweg@gmail.com>

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint
VPN) and forward traffic based on Ethernet MAC address information in
it. It simulates the Bridge behavior learning mechanism, but instead
of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-unicast traffic is sent over the network as multicast
encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <steweg@gmail.com>

---

Code was merged with Eric Dumazet's proposal (all except the reordering of orig_source, as that needs to hold the previous value), tested, and fixed with additional lines in the ipgre_tap_netdev_ops struct.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet Jan. 17, 2012, 11:17 p.m. UTC | #1
Le mardi 17 janvier 2012 à 23:20 +0100, Stefan Gula a écrit :
> From: Stefan Gula <steweg@gmail.com>
> 
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
> 
> Signed-off-by: Stefan Gula <steweg@gmail.com>
> 
> ---
> 
> code was merged with Eric Dumazet proposal (all except the reordering
> of orig_source as that needed to be previous value), tested and fixed
> with additional lines in ipgre_tap_netdev_ops struct
> 

Sorry, this is buggy (again...)

It's even clearly commented in the code:

/* Warning: All skb pointers will be invalidated! */

>  
>          if (!pskb_may_pull(skb, 16))
>                  goto drop_nolock;
> @@ -659,10 +836,38 @@ static int ipgre_rcv(struct sk_buff *skb
>                                  tunnel->dev->stats.rx_errors++;
>                                  goto drop;
>                          }

At this point, iph can point to freed memory and its dereference can
crash, since pskb_may_pull() can reallocate skb head.

> -
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +                        orig_source = iph->saddr;
> +#endif

Without any doubt, you know here there is a bug.

>                          iph = ip_hdr(skb);
>                          skb->protocol = eth_type_trans(skb, tunnel->dev);
>                          skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);


So if you need orig_source as the previous iph->saddr value, you must
fetch it _before_ the pskb_may_pull()


/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
#ifdef CONFIG_NET_IPGRE_BRIDGE
 	orig_source = iph->saddr; /* must be done before pskb_may_pull() */
#endif
	if (!pskb_may_pull(skb, ETH_HLEN)) {
		tunnel->dev->stats.rx_length_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	iph = ip_hdr(skb);
	...



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
--- linux-3.2.1-orig/include/net/ipip.h        2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/include/net/ipip.h        2012-01-16 11:17:01.000000000 +0100
@@ -27,6 +27,14 @@  struct ip_tunnel {
         __u32                        o_seqno;        /* The last output seqno */
         int                        hlen;                /* Precalculated GRE header length */
         int                        mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+        struct hlist_head        hash[GRETAP_BR_HASH_SIZE];
+        spinlock_t                hash_lock;
+        unsigned long                ageing_time;
+        struct timer_list        gc_timer;
+#endif
 
         struct ip_tunnel_parm        parms;
 
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
--- linux-3.2.1-orig/net/ipv4/Kconfig        2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/Kconfig        2012-01-16 12:37:00.000000000 +0100
@@ -211,6 +211,15 @@  config NET_IPGRE_BROADCAST
           Network), but can be distributed all over the Internet. If you want
           to do that, say Y here and to "IP multicast routing" below.
 
+config NET_IPGRE_BRIDGE
+        bool "IP: Ethernet over multipoint GRE over IP"
+        depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+        help
+          Allows you to use multipoint GRE VPN as virtual switch and interconnect
+          several L2 endpoints over L3 routed infrastructure. It is useful for
+          creating multipoint L2 VPNs which can be later used inside bridge
+          interfaces. If you want to use the GRE multipoint L2 VPN feature, say Y.
+
 config IP_MROUTE
         bool "IP: multicast routing"
         depends on IP_MULTICAST
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
--- linux-3.2.1-orig/net/ipv4/ip_gre.c        2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/ip_gre.c        2012-01-17 22:58:43.000000000 +0100
@@ -52,6 +52,11 @@ 
 #include <net/ip6_route.h>
 #endif
 
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
 /*
    Problems & solutions
    --------------------
@@ -134,6 +139,172 @@  struct ipgre_net {
         struct net_device *fb_tunnel_dev;
 };
 
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        /*
+         * This part of the code enables L2 Ethernet switch
+         * virtualization over an IP routed infrastructure, using
+         * multicast-capable endpoints and Ethernet over GRE
+         * encapsulation.
+         *
+         * Author: Stefan Gula
+         * Signed-off-by: Stefan Gula <steweg@gmail.com>
+         */
+struct ipgre_tap_bridge_entry {
+        struct hlist_node        hlist;
+        __be32                        raddr;
+        unsigned char                addr[ETH_ALEN];
+        unsigned long                updated;
+        struct rcu_head                rcu;
+};
+
+static u32 ipgre_salt __read_mostly;
+
+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+        u32 key = get_unaligned((u32 *)(mac + 2));
+
+        return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
+}
+
+static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
+                                const struct ipgre_tap_bridge_entry *entry)
+{
+        return time_before_eq(entry->updated + tunnel->ageing_time,
+                                jiffies);
+}
+
+static inline void ipgre_tap_bridge_delete(struct ipgre_tap_bridge_entry *entry)
+{
+        hlist_del_rcu(&entry->hlist);
+        kfree_rcu(entry, rcu);
+}
+
+static void ipgre_tap_bridge_cleanup(unsigned long _data)
+{
+        struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
+        unsigned long delay = tunnel->ageing_time;
+        unsigned long next_timer = jiffies + tunnel->ageing_time;
+        int i;
+
+        spin_lock(&tunnel->hash_lock);
+        for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+                struct ipgre_tap_bridge_entry *entry;
+                struct hlist_node *h, *n;
+
+                hlist_for_each_entry_safe(entry, h, n,
+                        &tunnel->hash[i], hlist)
+                {
+                        unsigned long this_timer;
+                        this_timer = entry->updated + delay;
+                        if (time_before_eq(this_timer, jiffies))
+                                ipgre_tap_bridge_delete(entry);
+                        else if (time_before(this_timer, next_timer))
+                                next_timer = this_timer;
+                }
+        }
+        spin_unlock(&tunnel->hash_lock);
+        mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer));
+}
+
+static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
+{
+        int i;
+
+        spin_lock_bh(&tunnel->hash_lock);
+        for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+                struct ipgre_tap_bridge_entry *entry;
+                struct hlist_node *h, *n;
+
+                hlist_for_each_entry_safe(entry, h, n,
+                        &tunnel->hash[i], hlist)
+                {
+                        ipgre_tap_bridge_delete(entry);
+                }
+        }
+        spin_unlock_bh(&tunnel->hash_lock);
+}
+
+static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(
+        struct ip_tunnel *tunnel, const unsigned char *addr)
+{
+        struct hlist_node *h;
+        struct ipgre_tap_bridge_entry *entry;
+
+        hlist_for_each_entry_rcu(entry, h,
+                        &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) {
+                if (!compare_ether_addr(entry->addr, addr)) {
+                        if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
+                                entry)))
+                                break;
+                        return entry;
+                }
+        }
+
+        return NULL;
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
+        struct hlist_head *head,
+        const unsigned char *addr)
+{
+        struct hlist_node *h;
+        struct ipgre_tap_bridge_entry *entry;
+
+        hlist_for_each_entry(entry, h, head, hlist) {
+                if (!compare_ether_addr(entry->addr, addr))
+                        return entry;
+        }
+        return NULL;
+}
+
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu(
+        struct hlist_head *head,
+        const unsigned char *addr)
+{
+        struct hlist_node *h;
+        struct ipgre_tap_bridge_entry *entry;
+
+        hlist_for_each_entry_rcu(entry, h, head, hlist) {
+                if (!compare_ether_addr(entry->addr, addr))
+                        return entry;
+        }
+        return NULL;
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
+        struct hlist_head *head,
+        __be32 source,
+        const unsigned char *addr)
+{
+        struct ipgre_tap_bridge_entry *entry;
+
+        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+        if (entry) {
+                memcpy(entry->addr, addr, ETH_ALEN);
+                entry->raddr = source;
+                entry->updated = jiffies;
+                hlist_add_head_rcu(&entry->hlist, head);
+        }
+        return entry;
+}
+
+static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+        const unsigned char *addr)
+{
+        __be32 raddr = 0;
+        struct ipgre_tap_bridge_entry *entry;
+
+        rcu_read_lock();
+        entry = __ipgre_tap_bridge_get(tunnel, addr);
+        if (entry)
+                raddr = entry->raddr;
+        rcu_read_unlock();
+
+        return raddr;
+}
+
+#endif
 /* Tunnel hash table */
 
 /*
@@ -562,6 +733,12 @@  static int ipgre_rcv(struct sk_buff *skb
         struct ip_tunnel *tunnel;
         int    offset = 4;
         __be16 gre_proto;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        __be32 orig_source;
+        struct hlist_head *head;
+        struct ipgre_tap_bridge_entry *entry;
+        const struct ethhdr *tethhdr;
+#endif
 
         if (!pskb_may_pull(skb, 16))
                 goto drop_nolock;
@@ -659,10 +836,38 @@  static int ipgre_rcv(struct sk_buff *skb
                                 tunnel->dev->stats.rx_errors++;
                                 goto drop;
                         }
-
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                        orig_source = iph->saddr;
+#endif
                         iph = ip_hdr(skb);
                         skb->protocol = eth_type_trans(skb, tunnel->dev);
                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                        if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+                                tethhdr = eth_hdr(skb);
+                                if (!is_multicast_ether_addr(
+                                        tethhdr->h_source)) {
+                                        head = &tunnel->hash[
+                                                ipgre_tap_bridge_hash(
+                                                        tethhdr->h_source)];
+                                        entry = ipgre_tap_bridge_find_rcu(head,
+                                                tethhdr->h_source);
+                                        if (likely(entry)) {
+                                                entry->raddr = orig_source;
+                                                entry->updated = jiffies;
+                                        } else {
+                                          spin_lock(&tunnel->hash_lock);
+                                          if (!ipgre_tap_bridge_find(head,
+                                                tethhdr->h_source))
+                                                ipgre_tap_bridge_create(
+                                                        head,
+                                                        orig_source,
+                                                        tethhdr->h_source);
+                                          spin_unlock(&tunnel->hash_lock);
+                                        }
+                                }
+                        }
+#endif
                 }
 
                 tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -702,7 +907,7 @@  static netdev_tx_t ipgre_tunnel_xmit(str
         struct iphdr  *iph;                        /* Our new IP header */
         unsigned int max_headroom;                /* The extra header space needed */
         int    gre_hlen;
-        __be32 dst;
+        __be32 dst = 0;
         int    mtu;
 
         if (dev->type == ARPHRD_ETHER)
@@ -716,7 +921,15 @@  static netdev_tx_t ipgre_tunnel_xmit(str
                 tiph = &tunnel->parms.iph;
         }
 
-        if ((dst = tiph->daddr) == 0) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        if ((dev->type == ARPHRD_ETHER) &&
+                ipv4_is_multicast(tunnel->parms.iph.daddr))
+                dst = ipgre_tap_bridge_get_raddr(tunnel,
+                        ((struct ethhdr *)skb->data)->h_dest);
+#endif
+        if (dst == 0)
+                dst = tiph->daddr;
+        if (dst == 0) {
                 /* NBMA tunnel */
 
                 if (skb_dst(skb) == NULL) {
@@ -1209,6 +1422,16 @@  static int ipgre_open(struct net_device 
                         return -EADDRNOTAVAIL;
                 t->mlink = dev->ifindex;
                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                if (t->dev->type == ARPHRD_ETHER) {
+                        INIT_HLIST_HEAD(t->hash);
+                        spin_lock_init(&t->hash_lock);
+                        t->ageing_time = 300 * HZ;
+                        setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+                                (unsigned long) t);
+                        mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+                }
+#endif
         }
         return 0;
 }
@@ -1219,6 +1442,12 @@  static int ipgre_close(struct net_device
 
         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                 struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                if (t->dev->type == ARPHRD_ETHER) {
+                        ipgre_tap_bridge_flush(t);
+                        del_timer_sync(&t->gc_timer);
+                }
+#endif
                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
                 if (in_dev)
                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
@@ -1488,6 +1717,10 @@  static int ipgre_tap_init(struct net_dev
 static const struct net_device_ops ipgre_tap_netdev_ops = {
         .ndo_init                = ipgre_tap_init,
         .ndo_uninit                = ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        .ndo_open                = ipgre_open,
+        .ndo_stop                = ipgre_close,
+#endif
         .ndo_start_xmit                = ipgre_tunnel_xmit,
         .ndo_set_mac_address         = eth_mac_addr,
         .ndo_validate_addr        = eth_validate_addr,
@@ -1705,6 +1938,9 @@  static int __init ipgre_init(void)
 
         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
 
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
         err = register_pernet_device(&ipgre_net_ops);
         if (err < 0)
                 return err;