Patchwork [1/3] inetpeer: Support ipv6 addresses.

login
register
mail settings
Submitter David Miller
Date Nov. 29, 2010, 9:44 p.m.
Message ID <20101129.134403.179947389.davem@davemloft.net>
Download mbox | patch
Permalink /patch/73498/
State Superseded
Delegated to: David Miller
Headers show

Comments

David Miller - Nov. 29, 2010, 9:44 p.m.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h |   21 ++++++++++++++-
 net/ipv4/inetpeer.c    |   66 ++++++++++++++++++++++++++++++++++++++++-------
 net/ipv4/ip_fragment.c |    2 +-
 net/ipv4/route.c       |    2 +-
 net/ipv4/tcp_ipv4.c    |    6 ++--
 5 files changed, 80 insertions(+), 17 deletions(-)
stephen hemminger - Nov. 29, 2010, 10:33 p.m.
On Mon, 29 Nov 2010 13:44:03 -0800 (PST)
David Miller <davem@davemloft.net> wrote:

> +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b)

Let the compiler decide about inline.
Should be const inet_peer_address_t *
Brian Haley - Nov. 30, 2010, 2:11 a.m.
On 11/29/2010 04:44 PM, David Miller wrote: 
> +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b)
> +{
> +	if (a->family == b->family) {
> +		switch (a->family) {
> +		case AF_INET:
> +			if (a->a4 == b->a4)
> +				return true;
> +			break;
> +		case AF_INET6:
> +			if (!ipv6_addr_cmp((struct in6_addr *)a,
> +					   (struct in6_addr *)b))
> +				return true;
> +			break;

ipv6_addr_equal() ?

-Brian

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Nov. 30, 2010, 2:33 a.m.
On Tue, Nov 30, 2010 at 5:44 AM, David Miller <davem@davemloft.net> wrote:
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  include/net/inetpeer.h |   21 ++++++++++++++-
>  net/ipv4/inetpeer.c    |   66 ++++++++++++++++++++++++++++++++++++++++-------
>  net/ipv4/ip_fragment.c |    2 +-
>  net/ipv4/route.c       |    2 +-
>  net/ipv4/tcp_ipv4.c    |    6 ++--
>  5 files changed, 80 insertions(+), 17 deletions(-)
>
> diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
> index fe239bf..834f045 100644
> --- a/include/net/inetpeer.h
> +++ b/include/net/inetpeer.h
> @@ -13,10 +13,18 @@
>  #include <linux/spinlock.h>
>  #include <asm/atomic.h>
>
> +typedef struct {
> +       union {
> +               __be32          a4;
> +               __be32          a6[4];
> +       };
> +       __u16   family;
> +} inet_peer_address_t;
> +

This makes inet_peer large even though ipv6 isn't enabled.

>  struct inet_peer {
>        /* group together avl_left,avl_right,v4daddr to speedup lookups */
>        struct inet_peer __rcu  *avl_left, *avl_right;
> -       __be32                  v4daddr;        /* peer's address */
> +       inet_peer_address_t     daddr;

I have thought about converting this AVL tree to rbtree. When I saw
the comment above, I gave it up, because rbtree makes this structure
bigger. If ipv6 support is added, I think it is time to turn to
rbtree. :)

>        __u32                   avl_height;
>        struct list_head        unused;
>        __u32                   dtime;          /* the time of last use of not
> @@ -42,7 +50,16 @@ struct inet_peer {
>  void                   inet_initpeers(void) __init;
>
>  /* can be called with or without local BH being disabled */
> -struct inet_peer       *inet_getpeer(__be32 daddr, int create);
> +struct inet_peer       *inet_getpeer(inet_peer_address_t *daddr, int create);
> +
> +static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
> +{
> +       inet_peer_address_t daddr;
> +
> +       daddr.a4 = v4daddr;
> +       daddr.family = AF_INET;
> +       return inet_getpeer(&daddr, create);
> +}
>
>  /* can be called from BH context or outside */
>  extern void inet_putpeer(struct inet_peer *p);
> diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
> index 9e94d7c..39f18c5 100644
> --- a/net/ipv4/inetpeer.c
> +++ b/net/ipv4/inetpeer.c
> @@ -19,6 +19,7 @@
>  #include <linux/net.h>
>  #include <net/ip.h>
>  #include <net/inetpeer.h>
> +#include <net/ipv6.h>
>
>  /*
>  *  Theory of operations.
> @@ -152,11 +153,53 @@ static void unlink_from_unused(struct inet_peer *p)
>        }
>  }
>
> +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b)
> +{
> +       if (a->family == b->family) {
> +               switch (a->family) {
> +               case AF_INET:
> +                       if (a->a4 == b->a4)
> +                               return true;
> +                       break;
> +               case AF_INET6:
> +                       if (!ipv6_addr_cmp((struct in6_addr *)a,
> +                                          (struct in6_addr *)b))
> +                               return true;
> +                       break;
> +               default:
> +                       break;
> +               }
> +       }
> +       return false;
> +}
> +
> +static inline u32 inet_peer_key(inet_peer_address_t *a)
> +{
> +       u32 key;
> +
> +       switch (a->family) {
> +       case AF_INET:
> +               key = (__force __u32) a->a4;
> +               break;
> +       case AF_INET6:
> +               key = ((__force __u32)a->a6[0] ^
> +                      (__force __u32)a->a6[1] ^
> +                      (__force __u32)a->a6[2] ^
> +                      (__force __u32)a->a6[3]);
> +               break;
> +       default:
> +               key = 0;
> +               break;
> +       }
> +       return key;
> +}
> +
>  /*
>  * Called with local BH disabled and the pool lock held.
>  */
>  #define lookup(_daddr, _stack)                                         \
>  ({                                                             \
> +       u32 key = inet_peer_key(_daddr);                        \
>        struct inet_peer *u;                                    \
>        struct inet_peer __rcu **v;                             \
>                                                                \
> @@ -165,9 +208,9 @@ static void unlink_from_unused(struct inet_peer *p)
>        for (u = rcu_dereference_protected(peers.root,          \
>                        lockdep_is_held(&peers.lock));          \
>             u != peer_avl_empty; ) {                           \
> -               if (_daddr == u->v4daddr)                       \
> +               if (inet_peer_addr_equal(_daddr, &u->daddr))    \
>                        break;                                  \
> -               if ((__force __u32)_daddr < (__force __u32)u->v4daddr)  \
> +               if (key < inet_peer_key(&u->daddr))             \
>                        v = &u->avl_left;                       \
>                else                                            \
>                        v = &u->avl_right;                      \
> @@ -185,13 +228,14 @@ static void unlink_from_unused(struct inet_peer *p)
>  * But every pointer we follow is guaranteed to be valid thanks to RCU.
>  * We exit from this function if number of links exceeds PEER_MAXDEPTH
>  */
> -static struct inet_peer *lookup_rcu_bh(__be32 daddr)
> +static struct inet_peer *lookup_rcu_bh(inet_peer_address_t *daddr)
>  {
>        struct inet_peer *u = rcu_dereference_bh(peers.root);
> +       u32 key = inet_peer_key(daddr);
>        int count = 0;
>
>        while (u != peer_avl_empty) {
> -               if (daddr == u->v4daddr) {
> +               if (inet_peer_addr_equal(daddr, &u->daddr)) {
>                        /* Before taking a reference, check if this entry was
>                         * deleted, unlink_from_pool() sets refcnt=-1 to make
>                         * distinction between an unused entry (refcnt=0) and
> @@ -201,7 +245,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
>                                u = NULL;
>                        return u;
>                }
> -               if ((__force __u32)daddr < (__force __u32)u->v4daddr)
> +               if (key < inet_peer_key(&u->daddr))
>                        u = rcu_dereference_bh(u->avl_left);
>                else
>                        u = rcu_dereference_bh(u->avl_right);
> @@ -353,7 +397,7 @@ static void unlink_from_pool(struct inet_peer *p)
>        if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
>                struct inet_peer __rcu **stack[PEER_MAXDEPTH];
>                struct inet_peer __rcu ***stackptr, ***delp;
> -               if (lookup(p->v4daddr, stack) != p)
> +               if (lookup(&p->daddr, stack) != p)
>                        BUG();
>                delp = stackptr - 1; /* *delp[0] == p */
>                if (p->avl_left == peer_avl_empty_rcu) {
> @@ -366,7 +410,7 @@ static void unlink_from_pool(struct inet_peer *p)
>                        BUG_ON(rcu_dereference_protected(*stackptr[-1],
>                                        lockdep_is_held(&peers.lock)) != t);
>                        **--stackptr = t->avl_left;
> -                       /* t is removed, t->v4daddr > x->v4daddr for any
> +                       /* t is removed, t->daddr > x->daddr for any
>                         * x in p->avl_left subtree.
>                         * Put t in the old place of p. */
>                        RCU_INIT_POINTER(*delp[0], t);
> @@ -433,7 +477,7 @@ static int cleanup_once(unsigned long ttl)
>  }
>
>  /* Called with or without local BH being disabled. */
> -struct inet_peer *inet_getpeer(__be32 daddr, int create)
> +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create)
>  {
>        struct inet_peer *p;
>        struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
> @@ -467,10 +511,12 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
>        }
>        p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
>        if (p) {
> -               p->v4daddr = daddr;
> +               p->daddr = *daddr;
>                atomic_set(&p->refcnt, 1);
>                atomic_set(&p->rid, 0);
> -               atomic_set(&p->ip_id_count, secure_ip_id(daddr));
> +               atomic_set(&p->ip_id_count,
> +                          (daddr->family == AF_INET) ?
> +                          secure_ip_id(daddr->a4) : 0);
>                p->tcp_ts_stamp = 0;
>                INIT_LIST_HEAD(&p->unused);
>
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index 1684408..e6215bd 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -141,7 +141,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
>        qp->daddr = arg->iph->daddr;
>        qp->user = arg->user;
>        qp->peer = sysctl_ipfrag_max_dist ?
> -               inet_getpeer(arg->iph->saddr, 1) : NULL;
> +               inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
>  }
>
>  static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index ec2333f..3843c2d 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1289,7 +1289,7 @@ void rt_bind_peer(struct rtable *rt, int create)
>  {
>        struct inet_peer *peer;
>
> -       peer = inet_getpeer(rt->rt_dst, create);
> +       peer = inet_getpeer_v4(rt->rt_dst, create);
>
>        if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
>                inet_putpeer(peer);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 69ccbc1..00285fc 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1347,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
>                    tcp_death_row.sysctl_tw_recycle &&
>                    (dst = inet_csk_route_req(sk, req)) != NULL &&
>                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
> -                   peer->v4daddr == saddr) {
> +                   peer->daddr.a4 == saddr) {
>                        inet_peer_refcheck(peer);
>                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
>                            (s32)(peer->tcp_ts - req->ts_recent) >
> @@ -1778,7 +1778,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
>        int release_it = 0;
>
>        if (!rt || rt->rt_dst != inet->inet_daddr) {
> -               peer = inet_getpeer(inet->inet_daddr, 1);
> +               peer = inet_getpeer_v4(inet->inet_daddr, 1);
>                release_it = 1;
>        } else {
>                if (!rt->peer)
> @@ -1804,7 +1804,7 @@ EXPORT_SYMBOL(tcp_v4_remember_stamp);
>
>  int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
>  {
> -       struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
> +       struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1);
>
>        if (peer) {
>                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
> --
> 1.7.3.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
David Miller - Nov. 30, 2010, 3:14 a.m.
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 30 Nov 2010 10:33:49 +0800

> I have thought about converting this AVL tree to rbtree. When I saw
> the comment above, I gave it up, because rbtree makes this structure
> bigger. If ipv6 support is added, I think it is time to turn to
> rbtree. :)

If it takes size over 128 bytes, it is probably still a bad idea.
Right now it is just under 128.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - Nov. 30, 2010, 3:18 a.m.
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 29 Nov 2010 14:33:57 -0800

> On Mon, 29 Nov 2010 13:44:03 -0800 (PST)
> David Miller <davem@davemloft.net> wrote:
> 
>> +static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b)
> 
> Let the compiler decide about inline.
> Should be const inet_peer_address_t *

Yep, I made those changes thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - Nov. 30, 2010, 3:18 a.m.
From: Brian Haley <brian.haley@hp.com>
Date: Mon, 29 Nov 2010 21:11:55 -0500

> On 11/29/2010 04:44 PM, David Miller wrote: 
>> +		case AF_INET6:
>> +			if (!ipv6_addr_cmp((struct in6_addr *)a,
>> +					   (struct in6_addr *)b))
>> +				return true;
>> +			break;
> 
> ipv6_addr_equal() ?

Yep, makes sense to me.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - Nov. 30, 2010, 4:51 a.m.
On Tue, Nov 30, 2010 at 11:14 AM, David Miller <davem@davemloft.net> wrote:
> From: Changli Gao <xiaosuo@gmail.com>
> Date: Tue, 30 Nov 2010 10:33:49 +0800
>
>> I have thought about converting this AVL tree to rbtree. When I saw
>> the comment above, I gave it up, because rbtree makes this structure
>> bigger. If ipv6 support is added, I think it is time to turn to
>> rbtree. :)
>
> If it takes size over 128 bytes, it is probably still a bad idea.
> Right now it is just under 128.
>

Why 128? The current size of inet_peer is 64 bytes, and after your
ipv6 patch, it is just 80. And, inet_peer is allocated from its own
mem_cache, so the size of the memory for it should not be aligned to
2^n.

If rbtree is used instead, extra 8 bytes will be needed, and the size
of inet_peer will be 88.
David Miller - Nov. 30, 2010, 5:22 a.m.
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 30 Nov 2010 12:51:08 +0800

> On Tue, Nov 30, 2010 at 11:14 AM, David Miller <davem@davemloft.net> wrote:
>> From: Changli Gao <xiaosuo@gmail.com>
>> Date: Tue, 30 Nov 2010 10:33:49 +0800
>>
>>> I have thought about converting this AVL tree to rbtree. When I saw
>>> the comment above, I gave it up, because rbtree makes this structure
>>> bigger. If ipv6 support is added, I think it is time to turn to
>>> rbtree. :)
>>
>> If it takes size over 128 bytes, it is probably still a bad idea.
>> Right now it is just under 128.
>>
> 
> Why 128? The current size of inet_peer is 64 bytes, and after your
> ipv6 patch, it is just 80. And, inet_peer is allocated from its own
> mem_cache, so the size of the memory for it should not be aligned to
> 2^n.

Sorry, I was taking into the consideration other work I am doing
which will move all of the routing metrics into inet_peer as well.

With ipv6 address support, it fits perfectly into 128 bytes on
64-bit.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Nov. 30, 2010, 5:42 a.m.
Le lundi 29 novembre 2010 à 21:22 -0800, David Miller a écrit :
> From: Changli Gao <xiaosuo@gmail.com>
> Date: Tue, 30 Nov 2010 12:51:08 +0800
> 
> > On Tue, Nov 30, 2010 at 11:14 AM, David Miller <davem@davemloft.net> wrote:
> >> From: Changli Gao <xiaosuo@gmail.com>
> >> Date: Tue, 30 Nov 2010 10:33:49 +0800
> >>
> >>> I have thought about converting this AVL tree to rbtree. When I saw
> >>> the comment above, I gave it up, because rbtree makes this structure
> >>> bigger. If ipv6 support is added, I think it is time to turn to
> >>> rbtree. :)
> >>
> >> If it takes size over 128 bytes, it is probably still a bad idea.
> >> Right now it is just under 128.
> >>
> > 
> > Why 128? The current size of inet_peer is 64 bytes, and after your
> > ipv6 patch, it is just 80. And, inet_peer is allocated from its own
> > mem_cache, so the size of the memory for it should not be aligned to
> > 2^n.
> 
> Sorry, I was taking into the consideration other work I am doing
> which will move all of the routing metrics into inet_peer as well.
> 
> With ipv6 address support, it fits perfectly into 128 bytes on
> 64-bit.
> --

Its a bit early in the morning here, I must confess I dont yet
understand your patch David :)

As we use a tree, why not using two different trees for ipv4 / ipv6 ?

I dont understand how computing a 32bit key (sort of hash key) is going
to help when hash collision happens, with an avl tree.


For Changli : AVL tree was OK for RCU conversion, I am not sure about
RBtree yet.

Either version of tree (AVL/rbtree) will be expensive to use if depth is
big (With 2 millions entries, depth is going to be very big). I
understand you want to get rid of route cache ?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - Nov. 30, 2010, 5:53 a.m.
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 30 Nov 2010 06:42:16 +0100

> Its a bit early in the morning here, I must confess I dont yet
> understand your patch David :)
> 
> As we use a tree, why not using two different trees for ipv4 / ipv6 ?

The "key" just creates a natural ordering in the tree, it's
almost arbitrary except that it must distribute well amongst
the entries.

I currently don't see any reason to make two trees right now.

> I dont understand how computing a 32bit key (sort of hash key) is going
> to help when hash collision happens, with an avl tree.
> Either version of tree (AVL/rbtree) will be expensive to use if depth is
> big (With 2 millions entries, depth is going to be very big). I
> understand you want to get rid of route cache ?

Do we plan to talk to 2 million unique destinations and have active
non-default metrics for each one of them very often?

inet_peer entries will only get created when we need to make
non-default metric settings for a specific destination address.

See that's the thing, it's scope is so much smaller than the existing
routing cache.  It's only going to be used in limited if not
controlled cases.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Nov. 30, 2010, 6:11 a.m.
Le lundi 29 novembre 2010 à 21:53 -0800, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 30 Nov 2010 06:42:16 +0100
> 
> > Its a bit early in the morning here, I must confess I dont yet
> > understand your patch David :)
> > 
> > As we use a tree, why not using two different trees for ipv4 / ipv6 ?
> 
> The "key" just creates a natural ordering in the tree, it's
> almost arbitrary except that it must distribute well amongst
> the entries.

Hmm. AVL search must take a decision, take the left or the right path.

if current key is equal, which path do you take ?


@@ -165,9 +208,9 @@ static void unlink_from_unused(struct inet_peer *p)
>  	for (u = rcu_dereference_protected(peers.root,		\
>  			lockdep_is_held(&peers.lock));		\
>  	     u != peer_avl_empty; ) {				\
> -		if (_daddr == u->v4daddr)			\
> +		if (inet_peer_addr_equal(_daddr, &u->daddr))	\
>  			break;					\
> -		if ((__force __u32)_daddr < (__force __u32)u->v4daddr)	\
> +		if (key < inet_peer_key(&u->daddr))		\
>  			v = &u->avl_left;			\
>  		else						\
>  			v = &u->avl_right;			\


Apparently you take the right one, you may miss the target if its on the
left path ?

> 
> I currently don't see any reason to make two trees right now.
> 

Cost of a tree is one pointer, and ipv4 search would be faster if we use
different search functions.

> > I dont understand how computing a 32bit key (sort of hash key) is going
> > to help when hash collision happens, with an avl tree.
> > Either version of tree (AVL/rbtree) will be expensive to use if depth is
> > big (With 2 millions entries, depth is going to be very big). I
> > understand you want to get rid of route cache ?
> 
> Do we plan to talk to 2 million unique destinations and have active
> non-default metrics for each one of them very often?
> 
> inet_peer entries will only get created when we need to make
> non-default metric settings for a specific destination address.
> 
> See that's the thing, it's scope is so much smaller than the existing
> routing cache.  It's only going to be used in limited if not
> controlled cases.
> 

OK good :)

I have no idea how many addresses have non default metric settings.

Do you know how to make an estimation on a server ?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - Nov. 30, 2010, 6:31 a.m.
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 30 Nov 2010 07:11:19 +0100

> Le lundi 29 novembre 2010 à 21:53 -0800, David Miller a écrit :
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Tue, 30 Nov 2010 06:42:16 +0100
>> 
>> > Its a bit early in the morning here, I must confess I dont yet
>> > understand your patch David :)
>> > 
>> > As we use a tree, why not using two different trees for ipv4 / ipv6 ?
>> 
>> The "key" just creates a natural ordering in the tree, it's
>> almost arbitrary except that it must distribute well amongst
>> the entries.
> 
> Hmm. AVL search must take a decision, take the left or the right path.
> 
> if current key is equal, which path do you take ?

Right. :-)  But yes there is some error that needs to
be handled in that equal keys can lead to improper paths
as you showed.

I guess I do need to make a separate tree, how disappointing.

Anyways I will work on fixing this.

> Do you know how to make an estimation on a server ?

No way to do it straightforwardly, I guess, because right now TCP metrics
key on all sorts of ridiculous things like source address, TOS, and
other crap as a consequence of how the routing cache works.

So much replicated information and wasted memory.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index fe239bf..834f045 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -13,10 +13,18 @@ 
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
+typedef struct {
+	union {
+		__be32		a4;
+		__be32		a6[4];
+	};
+	__u16	family;
+} inet_peer_address_t;
+
 struct inet_peer {
 	/* group together avl_left,avl_right,v4daddr to speedup lookups */
 	struct inet_peer __rcu	*avl_left, *avl_right;
-	__be32			v4daddr;	/* peer's address */
+	inet_peer_address_t	daddr;
 	__u32			avl_height;
 	struct list_head	unused;
 	__u32			dtime;		/* the time of last use of not
@@ -42,7 +50,16 @@  struct inet_peer {
 void			inet_initpeers(void) __init;
 
 /* can be called with or without local BH being disabled */
-struct inet_peer	*inet_getpeer(__be32 daddr, int create);
+struct inet_peer	*inet_getpeer(inet_peer_address_t *daddr, int create);
+
+static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
+{
+	inet_peer_address_t daddr;
+
+	daddr.a4 = v4daddr;
+	daddr.family = AF_INET;
+	return inet_getpeer(&daddr, create);
+}
 
 /* can be called from BH context or outside */
 extern void inet_putpeer(struct inet_peer *p);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9e94d7c..39f18c5 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -19,6 +19,7 @@ 
 #include <linux/net.h>
 #include <net/ip.h>
 #include <net/inetpeer.h>
+#include <net/ipv6.h>
 
 /*
  *  Theory of operations.
@@ -152,11 +153,53 @@  static void unlink_from_unused(struct inet_peer *p)
 	}
 }
 
+static inline bool inet_peer_addr_equal(inet_peer_address_t *a, inet_peer_address_t *b)
+{
+	if (a->family == b->family) {
+		switch (a->family) {
+		case AF_INET:
+			if (a->a4 == b->a4)
+				return true;
+			break;
+		case AF_INET6:
+			if (!ipv6_addr_cmp((struct in6_addr *)a,
+					   (struct in6_addr *)b))
+				return true;
+			break;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
+static inline u32 inet_peer_key(inet_peer_address_t *a)
+{
+	u32 key;
+
+	switch (a->family) {
+	case AF_INET:
+		key = (__force __u32) a->a4;
+		break;
+	case AF_INET6:
+		key = ((__force __u32)a->a6[0] ^
+		       (__force __u32)a->a6[1] ^
+		       (__force __u32)a->a6[2] ^
+		       (__force __u32)a->a6[3]);
+		break;
+	default:
+		key = 0;
+		break;
+	}
+	return key;
+}
+
 /*
  * Called with local BH disabled and the pool lock held.
  */
 #define lookup(_daddr, _stack) 					\
 ({								\
+	u32 key = inet_peer_key(_daddr);			\
 	struct inet_peer *u;					\
 	struct inet_peer __rcu **v;				\
 								\
@@ -165,9 +208,9 @@  static void unlink_from_unused(struct inet_peer *p)
 	for (u = rcu_dereference_protected(peers.root,		\
 			lockdep_is_held(&peers.lock));		\
 	     u != peer_avl_empty; ) {				\
-		if (_daddr == u->v4daddr)			\
+		if (inet_peer_addr_equal(_daddr, &u->daddr))	\
 			break;					\
-		if ((__force __u32)_daddr < (__force __u32)u->v4daddr)	\
+		if (key < inet_peer_key(&u->daddr))		\
 			v = &u->avl_left;			\
 		else						\
 			v = &u->avl_right;			\
@@ -185,13 +228,14 @@  static void unlink_from_unused(struct inet_peer *p)
  * But every pointer we follow is guaranteed to be valid thanks to RCU.
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
-static struct inet_peer *lookup_rcu_bh(__be32 daddr)
+static struct inet_peer *lookup_rcu_bh(inet_peer_address_t *daddr)
 {
 	struct inet_peer *u = rcu_dereference_bh(peers.root);
+	u32 key = inet_peer_key(daddr);
 	int count = 0;
 
 	while (u != peer_avl_empty) {
-		if (daddr == u->v4daddr) {
+		if (inet_peer_addr_equal(daddr, &u->daddr)) {
 			/* Before taking a reference, check if this entry was
 			 * deleted, unlink_from_pool() sets refcnt=-1 to make
 			 * distinction between an unused entry (refcnt=0) and
@@ -201,7 +245,7 @@  static struct inet_peer *lookup_rcu_bh(__be32 daddr)
 				u = NULL;
 			return u;
 		}
-		if ((__force __u32)daddr < (__force __u32)u->v4daddr)
+		if (key < inet_peer_key(&u->daddr))
 			u = rcu_dereference_bh(u->avl_left);
 		else
 			u = rcu_dereference_bh(u->avl_right);
@@ -353,7 +397,7 @@  static void unlink_from_pool(struct inet_peer *p)
 	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
 		struct inet_peer __rcu **stack[PEER_MAXDEPTH];
 		struct inet_peer __rcu ***stackptr, ***delp;
-		if (lookup(p->v4daddr, stack) != p)
+		if (lookup(&p->daddr, stack) != p)
 			BUG();
 		delp = stackptr - 1; /* *delp[0] == p */
 		if (p->avl_left == peer_avl_empty_rcu) {
@@ -366,7 +410,7 @@  static void unlink_from_pool(struct inet_peer *p)
 			BUG_ON(rcu_dereference_protected(*stackptr[-1],
 					lockdep_is_held(&peers.lock)) != t);
 			**--stackptr = t->avl_left;
-			/* t is removed, t->v4daddr > x->v4daddr for any
+			/* t is removed, t->daddr > x->daddr for any
 			 * x in p->avl_left subtree.
 			 * Put t in the old place of p. */
 			RCU_INIT_POINTER(*delp[0], t);
@@ -433,7 +477,7 @@  static int cleanup_once(unsigned long ttl)
 }
 
 /* Called with or without local BH being disabled. */
-struct inet_peer *inet_getpeer(__be32 daddr, int create)
+struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create)
 {
 	struct inet_peer *p;
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
@@ -467,10 +511,12 @@  struct inet_peer *inet_getpeer(__be32 daddr, int create)
 	}
 	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
 	if (p) {
-		p->v4daddr = daddr;
+		p->daddr = *daddr;
 		atomic_set(&p->refcnt, 1);
 		atomic_set(&p->rid, 0);
-		atomic_set(&p->ip_id_count, secure_ip_id(daddr));
+		atomic_set(&p->ip_id_count,
+			   (daddr->family == AF_INET) ?
+			   secure_ip_id(daddr->a4) : 0);
 		p->tcp_ts_stamp = 0;
 		INIT_LIST_HEAD(&p->unused);
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1684408..e6215bd 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -141,7 +141,7 @@  static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
 	qp->peer = sysctl_ipfrag_max_dist ?
-		inet_getpeer(arg->iph->saddr, 1) : NULL;
+		inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
 }
 
 static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ec2333f..3843c2d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1289,7 +1289,7 @@  void rt_bind_peer(struct rtable *rt, int create)
 {
 	struct inet_peer *peer;
 
-	peer = inet_getpeer(rt->rt_dst, create);
+	peer = inet_getpeer_v4(rt->rt_dst, create);
 
 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 69ccbc1..00285fc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1347,7 +1347,7 @@  int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		    tcp_death_row.sysctl_tw_recycle &&
 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
-		    peer->v4daddr == saddr) {
+		    peer->daddr.a4 == saddr) {
 			inet_peer_refcheck(peer);
 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
 			    (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1778,7 +1778,7 @@  int tcp_v4_remember_stamp(struct sock *sk)
 	int release_it = 0;
 
 	if (!rt || rt->rt_dst != inet->inet_daddr) {
-		peer = inet_getpeer(inet->inet_daddr, 1);
+		peer = inet_getpeer_v4(inet->inet_daddr, 1);
 		release_it = 1;
 	} else {
 		if (!rt->peer)
@@ -1804,7 +1804,7 @@  EXPORT_SYMBOL(tcp_v4_remember_stamp);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
-	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
+	struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1);
 
 	if (peer) {
 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);