Message ID | 20101129.134403.179947389.davem@davemloft.net |
---|---|
State | Superseded, archived |
Delegated to: | David Miller |
Headers | show |
On Mon, 29 Nov 2010 13:44:03 -0800 (PST)
David Miller <davem@davemloft.net> wrote:
> +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b)
Let the compiler decide about inline.
Should be const inet_peer_address_t *
On 11/29/2010 04:44 PM, David Miller wrote: > +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b) > +{ > + if (a->family == b->family) { > + switch (a->family) { > + case AF_INET: > + if (a->a4 == b->a4) > + return true; > + break; > + case AF_INET6: > + if (!ipv6_addr_cmp((struct in6_addr *)a, > + (struct in6_addr *)b)) > + return true; > + break; ipv6_addr_equal() ? -Brian -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Nov 30, 2010 at 5:44 AM, David Miller <davem@davemloft.net> wrote: > > Signed-off-by: David S. Miller <davem@davemloft.net> > --- > include/net/inetpeer.h | 21 ++++++++++++++- > net/ipv4/inetpeer.c | 66 ++++++++++++++++++++++++++++++++++++++++------- > net/ipv4/ip_fragment.c | 2 +- > net/ipv4/route.c | 2 +- > net/ipv4/tcp_ipv4.c | 6 ++-- > 5 files changed, 80 insertions(+), 17 deletions(-) > > diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h > index fe239bf..834f045 100644 > --- a/include/net/inetpeer.h > +++ b/include/net/inetpeer.h > @@ -13,10 +13,18 @@ > #include <linux/spinlock.h> > #include <asm/atomic.h> > > +typedef struct { > + union { > + __be32 a4; > + __be32 a6[4]; > + }; > + __u16 family; > +} inet_peer_address_t; > + This makes inet_peer large even though ipv6 isn't enabled. > struct inet_peer { > /* group together avl_left,avl_right,v4daddr to speedup lookups */ > struct inet_peer __rcu *avl_left, *avl_right; > - __be32 v4daddr; /* peer's address */ > + inet_peer_address_t daddr; I have thought about converting this AVL tree to rbtree. When I saw the comment above, I gave it up, because rbtree makes this structure bigger. If ipv6 support is added, I think it is time to turn to rbtree. 
:) > __u32 avl_height; > struct list_head unused; > __u32 dtime; /* the time of last use of not > @@ -42,7 +50,16 @@ struct inet_peer { > void inet_initpeers(void) __init; > > /* can be called with or without local BH being disabled */ > -struct inet_peer *inet_getpeer(__be32 daddr, int create); > +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create); > + > +static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create) > +{ > + inet_peer_address_t daddr; > + > + daddr.a4 = v4daddr; > + daddr.family = AF_INET; > + return inet_getpeer(&daddr, create); > +} > > /* can be called from BH context or outside */ > extern void inet_putpeer(struct inet_peer *p); > diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c > index 9e94d7c..39f18c5 100644 > --- a/net/ipv4/inetpeer.c > +++ b/net/ipv4/inetpeer.c > @@ -19,6 +19,7 @@ > #include <linux/net.h> > #include <net/ip.h> > #include <net/inetpeer.h> > +#include <net/ipv6.h> > > /* > * Theory of operations. > @@ -152,11 +153,53 @@ static void unlink_from_unused(struct inet_peer *p) > } > } > > +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b) > +{ > + if (a->family == b->family) { > + switch (a->family) { > + case AF_INET: > + if (a->a4 == b->a4) > + return true; > + break; > + case AF_INET6: > + if (!ipv6_addr_cmp((struct in6_addr *)a, > + (struct in6_addr *)b)) > + return true; > + break; > + default: > + break; > + } > + } > + return false; > +} > + > +static inline u32 inet_peer_key(inet_peer_address_t *a) > +{ > + u32 key; > + > + switch (a->family) { > + case AF_INET: > + key = (__force __u32) a->a4; > + break; > + case AF_INET6: > + key = ((__force __u32)a->a6[0] ^ > + (__force __u32)a->a6[1] ^ > + (__force __u32)a->a6[2] ^ > + (__force __u32)a->a6[3]); > + break; > + default: > + key = 0; > + break; > + } > + return key; > +} > + > /* > * Called with local BH disabled and the pool lock held. 
> */ > #define lookup(_daddr, _stack) \ > ({ \ > + u32 key = inet_peer_key(_daddr); \ > struct inet_peer *u; \ > struct inet_peer __rcu **v; \ > \ > @@ -165,9 +208,9 @@ static void unlink_from_unused(struct inet_peer *p) > for (u = rcu_dereference_protected(peers.root, \ > lockdep_is_held(&peers.lock)); \ > u != peer_avl_empty; ) { \ > - if (_daddr == u->v4daddr) \ > + if (inet_peer_addr_equal(_daddr, &u->daddr)) \ > break; \ > - if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ > + if (key < inet_peer_key(&u->daddr)) \ > v = &u->avl_left; \ > else \ > v = &u->avl_right; \ > @@ -185,13 +228,14 @@ static void unlink_from_unused(struct inet_peer *p) > * But every pointer we follow is guaranteed to be valid thanks to RCU. > * We exit from this function if number of links exceeds PEER_MAXDEPTH > */ > -static struct inet_peer *lookup_rcu_bh(__be32 daddr) > +static struct inet_peer *lookup_rcu_bh(inet_peer_address_t *daddr) > { > struct inet_peer *u = rcu_dereference_bh(peers.root); > + u32 key = inet_peer_key(daddr); > int count = 0; > > while (u != peer_avl_empty) { > - if (daddr == u->v4daddr) { > + if (inet_peer_addr_equal(daddr, &u->daddr)) { > /* Before taking a reference, check if this entry was > * deleted, unlink_from_pool() sets refcnt=-1 to make > * distinction between an unused entry (refcnt=0) and > @@ -201,7 +245,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) > u = NULL; > return u; > } > - if ((__force __u32)daddr < (__force __u32)u->v4daddr) > + if (key < inet_peer_key(&u->daddr)) > u = rcu_dereference_bh(u->avl_left); > else > u = rcu_dereference_bh(u->avl_right); > @@ -353,7 +397,7 @@ static void unlink_from_pool(struct inet_peer *p) > if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { > struct inet_peer __rcu **stack[PEER_MAXDEPTH]; > struct inet_peer __rcu ***stackptr, ***delp; > - if (lookup(p->v4daddr, stack) != p) > + if (lookup(&p->daddr, stack) != p) > BUG(); > delp = stackptr - 1; /* *delp[0] == p */ > if (p->avl_left == 
peer_avl_empty_rcu) { > @@ -366,7 +410,7 @@ static void unlink_from_pool(struct inet_peer *p) > BUG_ON(rcu_dereference_protected(*stackptr[-1], > lockdep_is_held(&peers.lock)) != t); > **--stackptr = t->avl_left; > - /* t is removed, t->v4daddr > x->v4daddr for any > + /* t is removed, t->daddr > x->daddr for any > * x in p->avl_left subtree. > * Put t in the old place of p. */ > RCU_INIT_POINTER(*delp[0], t); > @@ -433,7 +477,7 @@ static int cleanup_once(unsigned long ttl) > } > > /* Called with or without local BH being disabled. */ > -struct inet_peer *inet_getpeer(__be32 daddr, int create) > +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create) > { > struct inet_peer *p; > struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; > @@ -467,10 +511,12 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) > } > p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; > if (p) { > - p->v4daddr = daddr; > + p->daddr = *daddr; > atomic_set(&p->refcnt, 1); > atomic_set(&p->rid, 0); > - atomic_set(&p->ip_id_count, secure_ip_id(daddr)); > + atomic_set(&p->ip_id_count, > + (daddr->family == AF_INET) ? > + secure_ip_id(daddr->a4) : 0); > p->tcp_ts_stamp = 0; > INIT_LIST_HEAD(&p->unused); > > diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c > index 1684408..e6215bd 100644 > --- a/net/ipv4/ip_fragment.c > +++ b/net/ipv4/ip_fragment.c > @@ -141,7 +141,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) > qp->daddr = arg->iph->daddr; > qp->user = arg->user; > qp->peer = sysctl_ipfrag_max_dist ? 
> - inet_getpeer(arg->iph->saddr, 1) : NULL; > + inet_getpeer_v4(arg->iph->saddr, 1) : NULL; > } > > static __inline__ void ip4_frag_free(struct inet_frag_queue *q) > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index ec2333f..3843c2d 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -1289,7 +1289,7 @@ void rt_bind_peer(struct rtable *rt, int create) > { > struct inet_peer *peer; > > - peer = inet_getpeer(rt->rt_dst, create); > + peer = inet_getpeer_v4(rt->rt_dst, create); > > if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) > inet_putpeer(peer); > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index 69ccbc1..00285fc 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -1347,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) > tcp_death_row.sysctl_tw_recycle && > (dst = inet_csk_route_req(sk, req)) != NULL && > (peer = rt_get_peer((struct rtable *)dst)) != NULL && > - peer->v4daddr == saddr) { > + peer->daddr.a4 == saddr) { > inet_peer_refcheck(peer); > if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && > (s32)(peer->tcp_ts - req->ts_recent) > > @@ -1778,7 +1778,7 @@ int tcp_v4_remember_stamp(struct sock *sk) > int release_it = 0; > > if (!rt || rt->rt_dst != inet->inet_daddr) { > - peer = inet_getpeer(inet->inet_daddr, 1); > + peer = inet_getpeer_v4(inet->inet_daddr, 1); > release_it = 1; > } else { > if (!rt->peer) > @@ -1804,7 +1804,7 @@ EXPORT_SYMBOL(tcp_v4_remember_stamp); > > int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) > { > - struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); > + struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1); > > if (peer) { > const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); > -- > 1.7.3.2 > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >
From: Changli Gao <xiaosuo@gmail.com> Date: Tue, 30 Nov 2010 10:33:49 +0800 > I have thought about converting this AVL tree to rbtree. When I saw > the comment above, I gave it up, because rbtree makes this structure > bigger. If ipv6 support is added, I think it is time to turn to > rbtree. :) If it takes size over 128 bytes, it is probably still a bad idea. Right now it is just under 128. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Stephen Hemminger <shemminger@vyatta.com> Date: Mon, 29 Nov 2010 14:33:57 -0800 > On Mon, 29 Nov 2010 13:44:03 -0800 (PST) > David Miller <davem@davemloft.net> wrote: > >> +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b) > > Let the compiler decide about inline. > Should be const inet_peer_address_t * Yep, I made those changes thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Brian Haley <brian.haley@hp.com> Date: Mon, 29 Nov 2010 21:11:55 -0500 > On 11/29/2010 04:44 PM, David Miller wrote: >> + case AF_INET6: >> + if (!ipv6_addr_cmp((struct in6_addr *)a, >> + (struct in6_addr *)b)) >> + return true; >> + break; > > ipv6_addr_equal() ? Yep, makes sense to me. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Nov 30, 2010 at 11:14 AM, David Miller <davem@davemloft.net> wrote: > From: Changli Gao <xiaosuo@gmail.com> > Date: Tue, 30 Nov 2010 10:33:49 +0800 > >> I have thought about converting this AVL tree to rbtree. When I saw >> the comment above, I gave it up, because rbtree makes this structure >> bigger. If ipv6 support is added, I think it is time to turn to >> rbtree. :) > > If it takes size over 128 bytes, it is probably still a bad idea. > Right now it is just under 128. > Why 128? The current size of inet_peer is 64 bytes, and after your ipv6 patch, it is just 80. And, inet_peer is allocated from its own mem_cache, so the size of the memory for it should not be aligned to 2^n. If rbtree is used instead, extra 8 bytes will be needed, and the size of inet_peer will be 88.
From: Changli Gao <xiaosuo@gmail.com> Date: Tue, 30 Nov 2010 12:51:08 +0800 > On Tue, Nov 30, 2010 at 11:14 AM, David Miller <davem@davemloft.net> wrote: >> From: Changli Gao <xiaosuo@gmail.com> >> Date: Tue, 30 Nov 2010 10:33:49 +0800 >> >>> I have thought about converting this AVL tree to rbtree. When I saw >>> the comment above, I gave it up, because rbtree makes this structure >>> bigger. If ipv6 support is added, I think it is time to turn to >>> rbtree. :) >> >> If it takes size over 128 bytes, it is probably still a bad idea. >> Right now it is just under 128. >> > > Why 128? The current size of inet_peer is 64 bytes, and after your > ipv6 patch, it is just 80. And, inet_peer is allocated from its own > mem_cache, so the size of the memory for it should not be aligned to > 2^n. Sorry, I was taking into the consideration other work I am doing which will move all of the routing metrics into inet_peer as well. With ipv6 address support, it fits perfectly into 128 bytes on 64-bit. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 29 novembre 2010 à 21:22 -0800, David Miller a écrit : > From: Changli Gao <xiaosuo@gmail.com> > Date: Tue, 30 Nov 2010 12:51:08 +0800 > > > On Tue, Nov 30, 2010 at 11:14 AM, David Miller <davem@davemloft.net> wrote: > >> From: Changli Gao <xiaosuo@gmail.com> > >> Date: Tue, 30 Nov 2010 10:33:49 +0800 > >> > >>> I have thought about converting this AVL tree to rbtree. When I saw > >>> the comment above, I gave it up, because rbtree makes this structure > >>> bigger. If ipv6 support is added, I think it is time to turn to > >>> rbtree. :) > >> > >> If it takes size over 128 bytes, it is probably still a bad idea. > >> Right now it is just under 128. > >> > > > > Why 128? The current size of inet_peer is 64 bytes, and after your > > ipv6 patch, it is just 80. And, inet_peer is allocated from its own > > mem_cache, so the size of the memory for it should not be aligned to > > 2^n. > > Sorry, I was taking into the consideration other work I am doing > which will move all of the routing metrics into inet_peer as well. > > With ipv6 address support, it fits perfectly into 128 bytes on > 64-bit. > -- Its a bit early in the morning here, I must confess I dont yet understand your patch David :) As we use a tree, why not using two different trees for ipv4 / ipv6 ? I dont understand how computing a 32bit key (sort of hash key) is going to help when hash collision happens, with an avl tree. For Changli : AVL tree was OK for RCU conversion, I am not sure about RBtree yet. Either version of tree (AVL/rbtree) will be expensive to use if depth is big (With 2 millions entries, depth is going to be very big). I understand you want to get rid of route cache ? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Eric Dumazet <eric.dumazet@gmail.com> Date: Tue, 30 Nov 2010 06:42:16 +0100 > Its a bit early in the morning here, I must confess I dont yet > understand your patch David :) > > As we use a tree, why not using two different trees for ipv4 / ipv6 ? The "key" just creates a natural ordering in the tree, it's almost arbitrary except that it must distribute well amongst the entries. I currently don't see any reason to make two trees right now. > I dont understand how computing a 32bit key (sort of hash key) is going > to help when hash collision happens, with an avl tree. > Either version of tree (AVL/rbtree) will be expensive to use if depth is > big (With 2 millions entries, depth is going to be very big). I > understand you want to get rid of route cache ? Do we plan to talk to 2 million unique destinations and have active non-default metrics for each one of them very often? inet_peer entries will only get created when we need to make non-default metric settings for a specific destination address. See that's the thing, it's scope is so much smaller than the existing routing cache. It's only going to be used in limited if not controlled cases. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 29 novembre 2010 à 21:53 -0800, David Miller a écrit : > From: Eric Dumazet <eric.dumazet@gmail.com> > Date: Tue, 30 Nov 2010 06:42:16 +0100 > > > Its a bit early in the morning here, I must confess I dont yet > > understand your patch David :) > > > > As we use a tree, why not using two different trees for ipv4 / ipv6 ? > > The "key" just creates a natural ordering in the tree, it's > almost arbitrary except that it must distribute well amongst > the entries. Hmm. AVL search must take a decision, take the left or the right path. if current key is equal, which path do you take ? @@ -165,9 +208,9 @@ static void unlink_from_unused(struct inet_peer *p) > for (u = rcu_dereference_protected(peers.root, \ > lockdep_is_held(&peers.lock)); \ > u != peer_avl_empty; ) { \ > - if (_daddr == u->v4daddr) \ > + if (inet_peer_addr_equal(_daddr, &u->daddr)) \ > break; \ > - if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ > + if (key < inet_peer_key(&u->daddr)) \ > v = &u->avl_left; \ > else \ > v = &u->avl_right; \ Apparently you take the right one, you may miss the target if its on the left path ? > > I currently don't see any reason to make two trees right now. > Cost of a tree is one pointer, and ipv4 search would be faster if we use different search functions. > > I dont understand how computing a 32bit key (sort of hash key) is going > > to help when hash collision happens, with an avl tree. > > Either version of tree (AVL/rbtree) will be expensive to use if depth is > > big (With 2 millions entries, depth is going to be very big). I > > understand you want to get rid of route cache ? > > Do we plan to talk to 2 million unique destinations and have active > non-default metrics for each one of them very often? > > inet_peer entries will only get created when we need to make > non-default metric settings for a specific destination address. > > See that's the thing, it's scope is so much smaller than the existing > routing cache. 
It's only going to be used in limited if not > controlled cases. > OK good :) I have no idea how many addresses have non default metric settings. Do you know how to make an estimation on a server ? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Eric Dumazet <eric.dumazet@gmail.com> Date: Tue, 30 Nov 2010 07:11:19 +0100 > Le lundi 29 novembre 2010 à 21:53 -0800, David Miller a écrit : >> From: Eric Dumazet <eric.dumazet@gmail.com> >> Date: Tue, 30 Nov 2010 06:42:16 +0100 >> >> > Its a bit early in the morning here, I must confess I dont yet >> > understand your patch David :) >> > >> > As we use a tree, why not using two different trees for ipv4 / ipv6 ? >> >> The "key" just creates a natural ordering in the tree, it's >> almost arbitrary except that it must distribute well amongst >> the entries. > > Hmm. AVL search must take a decision, take the left or the right path. > > if current key is equal, which path do you take ? Right. :-) But yes there is some error that needs to be handled in that equal keys can lead to improper paths as you showed. I guess I do need to make a separate tree, how disappointing. Anyways I will work on fixing this. > Do you know how to make an estimation on a server ? No way to do it straightforward I guess because right now TCP metrics key on all sorts of ridiculous things like source address, TOS, and other crap as a consequence of how the routing cache works. So much replicated information and wasted memory. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index fe239bf..834f045 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -13,10 +13,18 @@ #include <linux/spinlock.h> #include <asm/atomic.h> +typedef struct { + union { + __be32 a4; + __be32 a6[4]; + }; + __u16 family; +} inet_peer_address_t; + struct inet_peer { /* group together avl_left,avl_right,v4daddr to speedup lookups */ struct inet_peer __rcu *avl_left, *avl_right; - __be32 v4daddr; /* peer's address */ + inet_peer_address_t daddr; __u32 avl_height; struct list_head unused; __u32 dtime; /* the time of last use of not @@ -42,7 +50,16 @@ struct inet_peer { void inet_initpeers(void) __init; /* can be called with or without local BH being disabled */ -struct inet_peer *inet_getpeer(__be32 daddr, int create); +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create); + +static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create) +{ + inet_peer_address_t daddr; + + daddr.a4 = v4daddr; + daddr.family = AF_INET; + return inet_getpeer(&daddr, create); +} /* can be called from BH context or outside */ extern void inet_putpeer(struct inet_peer *p); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 9e94d7c..39f18c5 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -19,6 +19,7 @@ #include <linux/net.h> #include <net/ip.h> #include <net/inetpeer.h> +#include <net/ipv6.h> /* * Theory of operations. 
@@ -152,11 +153,53 @@ static void unlink_from_unused(struct inet_peer *p) } } +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b) +{ + if (a->family == b->family) { + switch (a->family) { + case AF_INET: + if (a->a4 == b->a4) + return true; + break; + case AF_INET6: + if (!ipv6_addr_cmp((struct in6_addr *)a, + (struct in6_addr *)b)) + return true; + break; + default: + break; + } + } + return false; +} + +static inline u32 inet_peer_key(inet_peer_address_t *a) +{ + u32 key; + + switch (a->family) { + case AF_INET: + key = (__force __u32) a->a4; + break; + case AF_INET6: + key = ((__force __u32)a->a6[0] ^ + (__force __u32)a->a6[1] ^ + (__force __u32)a->a6[2] ^ + (__force __u32)a->a6[3]); + break; + default: + key = 0; + break; + } + return key; +} + /* * Called with local BH disabled and the pool lock held. */ #define lookup(_daddr, _stack) \ ({ \ + u32 key = inet_peer_key(_daddr); \ struct inet_peer *u; \ struct inet_peer __rcu **v; \ \ @@ -165,9 +208,9 @@ static void unlink_from_unused(struct inet_peer *p) for (u = rcu_dereference_protected(peers.root, \ lockdep_is_held(&peers.lock)); \ u != peer_avl_empty; ) { \ - if (_daddr == u->v4daddr) \ + if (inet_peer_addr_equal(_daddr, &u->daddr)) \ break; \ - if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ + if (key < inet_peer_key(&u->daddr)) \ v = &u->avl_left; \ else \ v = &u->avl_right; \ @@ -185,13 +228,14 @@ static void unlink_from_unused(struct inet_peer *p) * But every pointer we follow is guaranteed to be valid thanks to RCU. 
* We exit from this function if number of links exceeds PEER_MAXDEPTH */ -static struct inet_peer *lookup_rcu_bh(__be32 daddr) +static struct inet_peer *lookup_rcu_bh(inet_peer_address_t *daddr) { struct inet_peer *u = rcu_dereference_bh(peers.root); + u32 key = inet_peer_key(daddr); int count = 0; while (u != peer_avl_empty) { - if (daddr == u->v4daddr) { + if (inet_peer_addr_equal(daddr, &u->daddr)) { /* Before taking a reference, check if this entry was * deleted, unlink_from_pool() sets refcnt=-1 to make * distinction between an unused entry (refcnt=0) and @@ -201,7 +245,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) u = NULL; return u; } - if ((__force __u32)daddr < (__force __u32)u->v4daddr) + if (key < inet_peer_key(&u->daddr)) u = rcu_dereference_bh(u->avl_left); else u = rcu_dereference_bh(u->avl_right); @@ -353,7 +397,7 @@ static void unlink_from_pool(struct inet_peer *p) if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { struct inet_peer __rcu **stack[PEER_MAXDEPTH]; struct inet_peer __rcu ***stackptr, ***delp; - if (lookup(p->v4daddr, stack) != p) + if (lookup(&p->daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ if (p->avl_left == peer_avl_empty_rcu) { @@ -366,7 +410,7 @@ static void unlink_from_pool(struct inet_peer *p) BUG_ON(rcu_dereference_protected(*stackptr[-1], lockdep_is_held(&peers.lock)) != t); **--stackptr = t->avl_left; - /* t is removed, t->v4daddr > x->v4daddr for any + /* t is removed, t->daddr > x->daddr for any * x in p->avl_left subtree. * Put t in the old place of p. */ RCU_INIT_POINTER(*delp[0], t); @@ -433,7 +477,7 @@ static int cleanup_once(unsigned long ttl) } /* Called with or without local BH being disabled. 
*/ -struct inet_peer *inet_getpeer(__be32 daddr, int create) +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create) { struct inet_peer *p; struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; @@ -467,10 +511,12 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) } p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { - p->v4daddr = daddr; + p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr)); + atomic_set(&p->ip_id_count, + (daddr->family == AF_INET) ? + secure_ip_id(daddr->a4) : 0); p->tcp_ts_stamp = 0; INIT_LIST_HEAD(&p->unused); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1684408..e6215bd 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -141,7 +141,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ec2333f..3843c2d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1289,7 +1289,7 @@ void rt_bind_peer(struct rtable *rt, int create) { struct inet_peer *peer; - peer = inet_getpeer(rt->rt_dst, create); + peer = inet_getpeer_v4(rt->rt_dst, create); if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69ccbc1..00285fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1347,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->v4daddr == saddr) { + peer->daddr.a4 == saddr) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < 
TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > @@ -1778,7 +1778,7 @@ int tcp_v4_remember_stamp(struct sock *sk) int release_it = 0; if (!rt || rt->rt_dst != inet->inet_daddr) { - peer = inet_getpeer(inet->inet_daddr, 1); + peer = inet_getpeer_v4(inet->inet_daddr, 1); release_it = 1; } else { if (!rt->peer) @@ -1804,7 +1804,7 @@ EXPORT_SYMBOL(tcp_v4_remember_stamp); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { - struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); + struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1); if (peer) { const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/inetpeer.h | 21 ++++++++++++++- net/ipv4/inetpeer.c | 66 ++++++++++++++++++++++++++++++++++++++++------- net/ipv4/ip_fragment.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp_ipv4.c | 6 ++-- 5 files changed, 80 insertions(+), 17 deletions(-)