diff mbox

[RFC,net-next] ipv6: Use destination address determined by IPVS

Message ID 1381881751-6719-1-git-send-email-horms@verge.net.au
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Simon Horman Oct. 16, 2013, 12:02 a.m. UTC
In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
such that it creates and uses a neigh entry if none is found.
Subsequently the 'n' field was removed from struct rt6_info.

Unfortunately my analysis is that in the case of IPVS direct routing this
change leads to incorrect behaviour as in this case packets may be output
to a destination other than where they would be output according to the
route table. In particular, the destination address may actually be a local
address and empirically a neighbour lookup seems to result in it becoming
unreachable.

This patch resolves the problem by providing the destination address
determined by IPVS to ip6_finish_output2() in the skb callback.  Although
this seems to work I can see several problems with this approach:

* It is rather ugly, stuffing an IPVS exception right in
  the middle of IPv6 code. The overhead could be eliminated for many users
  by using a staic key. But none the less it is not attractive.

* The use of the skb callback is may not be valid
  as it crosses from IPVS to IPv6 code. A possible, though unpleasant,
  alternative is to add a new field to struct sk_buff.

* This covers all IPv6 packets output by IPVS but actually
  only those output using IPVS Direct-Routing need this.  One way to
  resolve this would be to add a more fine-grained ipvs_property to
  struct sk_buff.

Reported-by: Mark Brooks <mark@loadbalancer.org>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 6 ++++++
 net/ipv6/ip6_output.c           | 9 +++++++--
 net/netfilter/ipvs/ip_vs_xmit.c | 2 ++
 3 files changed, 15 insertions(+), 2 deletions(-)

Comments

Eric Dumazet Oct. 16, 2013, 12:13 a.m. UTC | #1
On Wed, 2013-10-16 at 09:02 +0900, Simon Horman wrote:
> In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
> ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
> such that it creates and uses a neigh entry if none is found.
> Subsequently the 'n' field was removed from struct rt6_info.
> 
> Unfortunately my analysis is that in the case of IPVS direct routing this
> change leads to incorrect behaviour as in this case packets may be output
> to a destination other than where they would be output according to the
> route table. In particular, the destination address may actually be a local
> address and empirically a neighbour lookup seems to result in it becoming
> unreachable.
> 
> This patch resolves the problem by providing the destination address
> determined by IPVS to ip6_finish_output2() in the skb callback.  Although
> this seems to work I can see several problems with this approach:
> 
> * It is rather ugly, stuffing an IPVS exception right in
>   the middle of IPv6 code. The overhead could be eliminated for many users
>   by using a staic key. But none the less it is not attractive.
> 
> * The use of the skb callback is may not be valid
>   as it crosses from IPVS to IPv6 code. A possible, though unpleasant,
>   alternative is to add a new field to struct sk_buff.

Please no ;)

> 
> * This covers all IPv6 packets output by IPVS but actually
>   only those output using IPVS Direct-Routing need this.  One way to
>   resolve this would be to add a more fine-grained ipvs_property to
>   struct sk_buff.
> 
> Reported-by: Mark Brooks <mark@loadbalancer.org>
> Signed-off-by: Simon Horman <horms@verge.net.au>
> ---
>  include/net/ip_vs.h             | 6 ++++++
>  net/ipv6/ip6_output.c           | 9 +++++++--
>  net/netfilter/ipvs/ip_vs_xmit.c | 2 ++
>  3 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index 1c2e1b9..11d90a6 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -1649,4 +1649,10 @@ ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
>  		atomic_read(&dest->inactconns);
>  }
>  
> +struct ipvs_skb_cb {
> +	struct in6_addr *daddr;
> +};

So we pass a reference.

> +
> +#define IP_VS_SKB_CB(skb) ((struct ipvs_skb_cb *)&(skb)->cb)
> +
>  #endif	/* _NET_IP_VS_H */
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index a54c45c..a340180 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -52,6 +52,7 @@
>  #include <net/addrconf.h>
>  #include <net/rawv6.h>
>  #include <net/icmp.h>
> +#include <net/ip_vs.h>
>  #include <net/xfrm.h>
>  #include <net/checksum.h>
>  #include <linux/mroute6.h>
> @@ -61,7 +62,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
>  	struct dst_entry *dst = skb_dst(skb);
>  	struct net_device *dev = dst->dev;
>  	struct neighbour *neigh;
> -	struct in6_addr *nexthop;
> +	struct in6_addr *nexthop, *daddr;
>  	int ret;
>  
>  	skb->protocol = htons(ETH_P_IPV6);
> @@ -105,7 +106,11 @@ static int ip6_finish_output2(struct sk_buff *skb)
>  	}
>  
>  	rcu_read_lock_bh();
> -	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
> +	if (unlikely(IS_ENABLED(CONFIG_IP_VS) && skb->ipvs_property))
> +		daddr = IP_VS_SKB_CB(skb)->daddr;

What guarantee do we have daddr points to valid memory (not already
freed/reused) ?

I guess things like NFQUEUE could happen ?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Simon Horman Oct. 16, 2013, 12:28 a.m. UTC | #2
On Tue, Oct 15, 2013 at 05:13:46PM -0700, Eric Dumazet wrote:
> On Wed, 2013-10-16 at 09:02 +0900, Simon Horman wrote:
> > In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
> > ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
> > such that it creates and uses a neigh entry if none is found.
> > Subsequently the 'n' field was removed from struct rt6_info.
> > 
> > Unfortunately my analysis is that in the case of IPVS direct routing this
> > change leads to incorrect behaviour as in this case packets may be output
> > to a destination other than where they would be output according to the
> > route table. In particular, the destination address may actually be a local
> > address and empirically a neighbour lookup seems to result in it becoming
> > unreachable.
> > 
> > This patch resolves the problem by providing the destination address
> > determined by IPVS to ip6_finish_output2() in the skb callback.  Although
> > this seems to work I can see several problems with this approach:
> > 
> > * It is rather ugly, stuffing an IPVS exception right in
> >   the middle of IPv6 code. The overhead could be eliminated for many users
> >   by using a staic key. But none the less it is not attractive.
> > 
> > * The use of the skb callback is may not be valid
> >   as it crosses from IPVS to IPv6 code. A possible, though unpleasant,
> >   alternative is to add a new field to struct sk_buff.
> 
> Please no ;)

I thought of you when I wrote that comment :)

> > * This covers all IPv6 packets output by IPVS but actually
> >   only those output using IPVS Direct-Routing need this.  One way to
> >   resolve this would be to add a more fine-grained ipvs_property to
> >   struct sk_buff.
> > 
> > Reported-by: Mark Brooks <mark@loadbalancer.org>
> > Signed-off-by: Simon Horman <horms@verge.net.au>
> > ---
> >  include/net/ip_vs.h             | 6 ++++++
> >  net/ipv6/ip6_output.c           | 9 +++++++--
> >  net/netfilter/ipvs/ip_vs_xmit.c | 2 ++
> >  3 files changed, 15 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> > index 1c2e1b9..11d90a6 100644
> > --- a/include/net/ip_vs.h
> > +++ b/include/net/ip_vs.h
> > @@ -1649,4 +1649,10 @@ ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
> >  		atomic_read(&dest->inactconns);
> >  }
> >  
> > +struct ipvs_skb_cb {
> > +	struct in6_addr *daddr;
> > +};
> 
> So we pass a reference.
>
> > +
> > +#define IP_VS_SKB_CB(skb) ((struct ipvs_skb_cb *)&(skb)->cb)
> > +
> >  #endif	/* _NET_IP_VS_H */
> > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> > index a54c45c..a340180 100644
> > --- a/net/ipv6/ip6_output.c
> > +++ b/net/ipv6/ip6_output.c
> > @@ -52,6 +52,7 @@
> >  #include <net/addrconf.h>
> >  #include <net/rawv6.h>
> >  #include <net/icmp.h>
> > +#include <net/ip_vs.h>
> >  #include <net/xfrm.h>
> >  #include <net/checksum.h>
> >  #include <linux/mroute6.h>
> > @@ -61,7 +62,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
> >  	struct dst_entry *dst = skb_dst(skb);
> >  	struct net_device *dev = dst->dev;
> >  	struct neighbour *neigh;
> > -	struct in6_addr *nexthop;
> > +	struct in6_addr *nexthop, *daddr;
> >  	int ret;
> >  
> >  	skb->protocol = htons(ETH_P_IPV6);
> > @@ -105,7 +106,11 @@ static int ip6_finish_output2(struct sk_buff *skb)
> >  	}
> >  
> >  	rcu_read_lock_bh();
> > -	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
> > +	if (unlikely(IS_ENABLED(CONFIG_IP_VS) && skb->ipvs_property))
> > +		daddr = IP_VS_SKB_CB(skb)->daddr;
> 
> What guarantee do we have daddr points to valid memory (not already
> freed/reused) ?

I can change that to passing a value if there is a risk
the reference could become invalid. To be honest I am more
worried that skb->cb might be clobbered entirely.

> I guess things like NFQUEUE could happen ?

Could you expand a little?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 16, 2013, 12:39 a.m. UTC | #3
On Wed, 2013-10-16 at 09:28 +0900, Simon Horman wrote:

> > I guess things like NFQUEUE could happen ?
> 
> Could you expand a little?

This was to point that between IPVS and ipv6 stack we might have a
delay, and daddr was maybe pointed to a freed memory.

IP6CB only uses 24 bytes, so I think you would be safe adding 16 bytes.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Oct. 16, 2013, 12:53 a.m. UTC | #4
On Wed, Oct 16, 2013 at 09:02:31AM +0900, Simon Horman wrote:
> In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
> ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
> such that it creates and uses a neigh entry if none is found.
> Subsequently the 'n' field was removed from struct rt6_info.
> 
> Unfortunately my analysis is that in the case of IPVS direct routing this
> change leads to incorrect behaviour as in this case packets may be output
> to a destination other than where they would be output according to the
> route table. In particular, the destination address may actually be a local
> address and empirically a neighbour lookup seems to result in it becoming
> unreachable.
> 
> This patch resolves the problem by providing the destination address
> determined by IPVS to ip6_finish_output2() in the skb callback.  Although
> this seems to work I can see several problems with this approach:
> 
> * It is rather ugly, stuffing an IPVS exception right in
>   the middle of IPv6 code. The overhead could be eliminated for many users
>   by using a staic key. But none the less it is not attractive.
> 
> * The use of the skb callback is may not be valid
>   as it crosses from IPVS to IPv6 code. A possible, though unpleasant,
>   alternative is to add a new field to struct sk_buff.
> 
> * This covers all IPv6 packets output by IPVS but actually
>   only those output using IPVS Direct-Routing need this.  One way to
>   resolve this would be to add a more fine-grained ipvs_property to
>   struct sk_buff.

Hmm, that reminds me on the following bug report which would be nice we could
solve in one go, too: http://www.spinics.net/lists/netdev/msg250785.html

Greetings,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Simon Horman Oct. 16, 2013, 2:13 a.m. UTC | #5
On Tue, Oct 15, 2013 at 05:39:09PM -0700, Eric Dumazet wrote:
> On Wed, 2013-10-16 at 09:28 +0900, Simon Horman wrote:
> 
> > > I guess things like NFQUEUE could happen ?
> > 
> > Could you expand a little?
> 
> This was to point that between IPVS and ipv6 stack we might have a
> delay, and daddr was maybe pointed to a freed memory.
> 
> IP6CB only uses 24 bytes, so I think you would be safe adding 16 bytes.

That does seem very promising but while implementing it
I hit a problem.

struct tcp_skb_cb includes a field of type struct inet6_skb_parm.  And
expanding struct inet6_skb_parm by 16 bytes means that struct tcp_skb_cb is
now larger than 48 bytes and no longer fits in skb->cb.

Is it appropriate to grow skb->cb as the comment above struct tcp_skb_cb
suggests?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Simon Horman Oct. 16, 2013, 2:14 a.m. UTC | #6
On Wed, Oct 16, 2013 at 02:53:22AM +0200, Hannes Frederic Sowa wrote:
> On Wed, Oct 16, 2013 at 09:02:31AM +0900, Simon Horman wrote:
> > In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
> > ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
> > such that it creates and uses a neigh entry if none is found.
> > Subsequently the 'n' field was removed from struct rt6_info.
> > 
> > Unfortunately my analysis is that in the case of IPVS direct routing this
> > change leads to incorrect behaviour as in this case packets may be output
> > to a destination other than where they would be output according to the
> > route table. In particular, the destination address may actually be a local
> > address and empirically a neighbour lookup seems to result in it becoming
> > unreachable.
> > 
> > This patch resolves the problem by providing the destination address
> > determined by IPVS to ip6_finish_output2() in the skb callback.  Although
> > this seems to work I can see several problems with this approach:
> > 
> > * It is rather ugly, stuffing an IPVS exception right in
> >   the middle of IPv6 code. The overhead could be eliminated for many users
> >   by using a staic key. But none the less it is not attractive.
> > 
> > * The use of the skb callback is may not be valid
> >   as it crosses from IPVS to IPv6 code. A possible, though unpleasant,
> >   alternative is to add a new field to struct sk_buff.
> > 
> > * This covers all IPv6 packets output by IPVS but actually
> >   only those output using IPVS Direct-Routing need this.  One way to
> >   resolve this would be to add a more fine-grained ipvs_property to
> >   struct sk_buff.
> 
> Hmm, that reminds me on the following bug report which would be nice we could
> solve in one go, too: http://www.spinics.net/lists/netdev/msg250785.html

I think it should be possible to solve that using the
IP6CB() approach that Eric suggested. Hopefully we can make
that approach fly.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 16, 2013, 2:40 a.m. UTC | #7
On Wed, 2013-10-16 at 11:13 +0900, Simon Horman wrote:

> That does seem very promising but while implementing it
> I hit a problem.
> 
> struct tcp_skb_cb includes a field of type struct inet6_skb_parm.  And
> expanding struct inet6_skb_parm by 16 bytes means that struct tcp_skb_cb is
> now larger than 48 bytes and no longer fits in skb->cb.
> 
> Is it appropriate to grow skb->cb as the comment above struct tcp_skb_cb
> suggests?

Ah well...

No, its not appropriate to grow sk_buff by 16 bytes, sorry.

You could not change struct inet6_skb_parm, but define IP6CB as
a compound 

struct ip6cb {
   struct inet6_skb_parm foo;
   struct in6_addr       bar;
};



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Simon Horman Oct. 16, 2013, 4:26 a.m. UTC | #8
On Tue, Oct 15, 2013 at 07:40:59PM -0700, Eric Dumazet wrote:
> On Wed, 2013-10-16 at 11:13 +0900, Simon Horman wrote:
> 
> > That does seem very promising but while implementing it
> > I hit a problem.
> > 
> > struct tcp_skb_cb includes a field of type struct inet6_skb_parm.  And
> > expanding struct inet6_skb_parm by 16 bytes means that struct tcp_skb_cb is
> > now larger than 48 bytes and no longer fits in skb->cb.
> > 
> > Is it appropriate to grow skb->cb as the comment above struct tcp_skb_cb
> > suggests?
> 
> Ah well...
> 
> No, its not appropriate to grow sk_buff by 16 bytes, sorry.
> 
> You could not change struct inet6_skb_parm, but define IP6CB as
> a compound 
> 
> struct ip6cb {
>    struct inet6_skb_parm foo;
>    struct in6_addr       bar;
> };

Thanks for your guidance, that possibility had occured to me too.
I'll rework the patch accordingly.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Julian Anastasov Oct. 16, 2013, 7:27 a.m. UTC | #9
Hello,

On Wed, 16 Oct 2013, Simon Horman wrote:

> In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
> ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
> such that it creates and uses a neigh entry if none is found.
> Subsequently the 'n' field was removed from struct rt6_info.

	Similar change in IPv4 opened the Pandora box:
IPVS, xt_TEE, raw.c (IP_HDRINCL). May be the corrsponding
places in IPv6 have the same problem.

	I don't know the IPv6 routing but if we find a way
to keep the desired nexthop in rt6i_gateway and to add
RTF_GATEWAY checks here and there such solution would be more
general. FLOWI_FLAG_KNOWN_NH flag can help, if needed.

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Oct. 16, 2013, 2:32 p.m. UTC | #10
On Wed, Oct 16, 2013 at 10:27:47AM +0300, Julian Anastasov wrote:
> 
> 	Hello,
> 
> On Wed, 16 Oct 2013, Simon Horman wrote:
> 
> > In v3.9 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in
> > ip6_finish_output2()") changed the behaviour of ip6_finish_output2()
> > such that it creates and uses a neigh entry if none is found.
> > Subsequently the 'n' field was removed from struct rt6_info.
> 
> 	Similar change in IPv4 opened the Pandora box:
> IPVS, xt_TEE, raw.c (IP_HDRINCL). May be the corrsponding
> places in IPv6 have the same problem.
> 
> 	I don't know the IPv6 routing but if we find a way
> to keep the desired nexthop in rt6i_gateway and to add
> RTF_GATEWAY checks here and there such solution would be more
> general. FLOWI_FLAG_KNOWN_NH flag can help, if needed.

I thought about this yesterday but did not see an easy way. How does the IPv4
implementation accomplish this?

ipvs caches the dst in its own infrastructure, so we need to be sure we don't
disconnect this dst from the ipv6 routing table, otherwise ip6_dst_check won't
recognize when relookups should be done. Playing games with RTF_GATEWAY seems
dangerous then.

I am currently thinking about using a new flag to replace the nexthop
information with rt6i_dst in these circumstances. The flag would have
to be included in the skb somewhere.

Greetings,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Julian Anastasov Oct. 16, 2013, 8:22 p.m. UTC | #11
Hello,

On Wed, 16 Oct 2013, Hannes Frederic Sowa wrote:

> On Wed, Oct 16, 2013 at 10:27:47AM +0300, Julian Anastasov wrote:
> > 
> > 	I don't know the IPv6 routing but if we find a way
> > to keep the desired nexthop in rt6i_gateway and to add
> > RTF_GATEWAY checks here and there such solution would be more
> > general. FLOWI_FLAG_KNOWN_NH flag can help, if needed.
> 
> I thought about this yesterday but did not see an easy way. How does the IPv4
> implementation accomplish this?

	In IPv4 rt->rt_flags has no bit to indicate if the route
is via gateway (like RTF_GATEWAY in IPv6). We added rt_uses_gateway
for this purpose.

	In the default case, rt_gateway may contain 0 if we return
cached result, eg. when target is part of a local subnet.
Then IPVS/TEE/RAW can request valid rt_gateway, even with the price
of a cloned result, so that rt_gateway can remember the requested
nexthop which may differ from daddr.

> ipvs caches the dst in its own infrastructure, so we need to be sure we don't
> disconnect this dst from the ipv6 routing table, otherwise ip6_dst_check won't
> recognize when relookups should be done. Playing games with RTF_GATEWAY seems
> dangerous then.

	dst_check works for IPVS. There is a problem only
with the recent changes that moved the indication for PMTU
change from dst_check to dst_mtu() calls. But this is safe 
for IPVS, it handles FRAG_NEEDED for the tunneling mode itself.

	Initially, I thought IPv6 stores zeroes in rt6i_gateway.
But now I see rt6_alloc_cow() to be called for the case I assumed
to fail - when no gateway is used.

	So, I'll try to test the IPVS case in the following 1-2 days
and will report after adding some printks. If xt_TEE has
the same problem then it should not be IPVS-specific. RAW not
tested yet.

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Oct. 16, 2013, 9:59 p.m. UTC | #12
On Wed, Oct 16, 2013 at 11:22:40PM +0300, Julian Anastasov wrote:
> 
> 	Hello,
> 
> On Wed, 16 Oct 2013, Hannes Frederic Sowa wrote:
> 
> > On Wed, Oct 16, 2013 at 10:27:47AM +0300, Julian Anastasov wrote:
> > > 
> > > 	I don't know the IPv6 routing but if we find a way
> > > to keep the desired nexthop in rt6i_gateway and to add
> > > RTF_GATEWAY checks here and there such solution would be more
> > > general. FLOWI_FLAG_KNOWN_NH flag can help, if needed.
> > 
> > I thought about this yesterday but did not see an easy way. How does the IPv4
> > implementation accomplish this?
> 
> 	In IPv4 rt->rt_flags has no bit to indicate if the route
> is via gateway (like RTF_GATEWAY in IPv6). We added rt_uses_gateway
> for this purpose.
> 
> 	In the default case, rt_gateway may contain 0 if we return
> cached result, eg. when target is part of a local subnet.
> Then IPVS/TEE/RAW can request valid rt_gateway, even with the price
> of a cloned result, so that rt_gateway can remember the requested
> nexthop which may differ from daddr.

To have ip6_dst_check working, there must to be a valid link from
the rt6_info to the fib6_node. Otherwise we cannot check the serial
number. As I currently see we also need a link from the fib6_node down
to the dst entry for resource management. Thus we would have to insert
the special dst-entry with RTF_GATEWAY and non-null rt6i_gateway back
into the fib and have it globally visible. This could have unforseen
side effects. We still cache all dst entries in the fib. One think I
foresee as a possible problem is the automatic aggregation of ECMP routes,
too.

IPv4 does not seem to need this link at all.

> > ipvs caches the dst in its own infrastructure, so we need to be sure we don't
> > disconnect this dst from the ipv6 routing table, otherwise ip6_dst_check won't
> > recognize when relookups should be done. Playing games with RTF_GATEWAY seems
> > dangerous then.
> 
> 	dst_check works for IPVS. There is a problem only
> with the recent changes that moved the indication for PMTU
> change from dst_check to dst_mtu() calls. But this is safe 
> for IPVS, it handles FRAG_NEEDED for the tunneling mode itself.

Ok, I see.

> 	Initially, I thought IPv6 stores zeroes in rt6i_gateway.
> But now I see rt6_alloc_cow() to be called for the case I assumed
> to fail - when no gateway is used.
> 
> 	So, I'll try to test the IPVS case in the following 1-2 days
> and will report after adding some printks. If xt_TEE has
> the same problem then it should not be IPVS-specific. RAW not
> tested yet.

We should provide something similar to what IPv4 does with the
KNOWN_NH flag. I guess my idea with exchanging rt6i_dst as nexthop would
solve this without too much hassle but this would have to be checked by
implementing it.

I don't think that storing a changed nexthop in the ipv6 cb is that nice and
maintainable.

Greetings,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Julian Anastasov Oct. 16, 2013, 10:50 p.m. UTC | #13
Hello,

On Wed, 16 Oct 2013, Hannes Frederic Sowa wrote:

> To have ip6_dst_check working, there must to be a valid link from
> the rt6_info to the fib6_node. Otherwise we cannot check the serial
> number. As I currently see we also need a link from the fib6_node down
> to the dst entry for resource management. Thus we would have to insert
> the special dst-entry with RTF_GATEWAY and non-null rt6i_gateway back
> into the fib and have it globally visible. This could have unforseen
> side effects. We still cache all dst entries in the fib. One think I
> foresee as a possible problem is the automatic aggregation of ECMP routes,
> too.

	I'm still not sure what is needed. Looking at
ip6_pol_route(), I see that everything should just work: for
routes without RTF_GATEWAY flag we return cloned route
from rt6_alloc_cow() with valid rt6i_gateway. I still
didn't tested the problem myself, so I'll stop with the
comments before that.

> We should provide something similar to what IPv4 does with the
> KNOWN_NH flag. I guess my idea with exchanging rt6i_dst as nexthop would
> solve this without too much hassle but this would have to be checked by
> implementing it.
> 
> I don't think that storing a changed nexthop in the ipv6 cb is that nice and
> maintainable.

	Agreed. I'm not going to test that change. I'll test
latest net-next without any changes to see where exactly is
the failure because the IPv6 routing seems capable for what
we need. I have problems with my setup, so I'll need some
days. As Simon is also testing the problem, he can find the
reason before me.

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 1c2e1b9..11d90a6 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1649,4 +1649,10 @@  ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
 		atomic_read(&dest->inactconns);
 }
 
+struct ipvs_skb_cb {
+	struct in6_addr *daddr;
+};
+
+#define IP_VS_SKB_CB(skb) ((struct ipvs_skb_cb *)&(skb)->cb)
+
 #endif	/* _NET_IP_VS_H */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a54c45c..a340180 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -52,6 +52,7 @@ 
 #include <net/addrconf.h>
 #include <net/rawv6.h>
 #include <net/icmp.h>
+#include <net/ip_vs.h>
 #include <net/xfrm.h>
 #include <net/checksum.h>
 #include <linux/mroute6.h>
@@ -61,7 +62,7 @@  static int ip6_finish_output2(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
 	struct neighbour *neigh;
-	struct in6_addr *nexthop;
+	struct in6_addr *nexthop, *daddr;
 	int ret;
 
 	skb->protocol = htons(ETH_P_IPV6);
@@ -105,7 +106,11 @@  static int ip6_finish_output2(struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
+	if (unlikely(IS_ENABLED(CONFIG_IP_VS) && skb->ipvs_property))
+		daddr = IP_VS_SKB_CB(skb)->daddr;
+	else
+		daddr = &ipv6_hdr(skb)->daddr;
+	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index c47444e..054b679 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -391,6 +391,8 @@  __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 		rt = (struct rt6_info *) dst;
 	}
 
+	IP_VS_SKB_CB(skb)->daddr = daddr;
+
 	local = __ip_vs_is_local_route6(rt);
 	if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
 	      rt_mode)) {