diff mbox series

[v2] netfilter: nf_conntrack: resolve clash for matching conntracks

Message ID 20180702145214.14925-1-martynas@weave.works
State Accepted
Delegated to: Pablo Neira
Headers show
Series [v2] netfilter: nf_conntrack: resolve clash for matching conntracks | expand

Commit Message

Martynas Pumputis July 2, 2018, 2:52 p.m. UTC
This patch enables the clash resolution for NAT (disabled in
"590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
and a protocol allows it.

The clash might happen for a connections-less protocol (e.g. UDP) when
two threads in parallel writes to the same socket and consequent calls
to "get_unique_tuple" return the same tuples (incl. reply tuples).

In this case it is safe to perform the resolution, as the losing CT
describes the same mangling as the winning CT, so no modifications to
the packet are needed, and the result of rules traversal for the loser's
packet stays valid.

Signed-off-by: Martynas Pumputis <martynas@weave.works>
---
 net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

Comments

Pablo Neira Ayuso July 4, 2018, 5:14 p.m. UTC | #1
On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
> This patch enables the clash resolution for NAT (disabled in
> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
> and a protocol allows it.
> 
> The clash might happen for a connections-less protocol (e.g. UDP) when
> two threads in parallel writes to the same socket and consequent calls
> to "get_unique_tuple" return the same tuples (incl. reply tuples).
> 
> In this case it is safe to perform the resolution, as the losing CT
> describes the same mangling as the winning CT, so no modifications to
> the packet are needed, and the result of rules traversal for the loser's
> packet stays valid.
> 
> Signed-off-by: Martynas Pumputis <martynas@weave.works>
> ---
>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
>  1 file changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 3465da2a98bd..cc5ef8c9d0da 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
>  	       net_eq(net, nf_ct_net(ct));
>  }
>  
> +static inline bool
> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
> +{
> +	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
> +				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
> +	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
> +				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
> +	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
> +	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
> +	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
> +}
> +
>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
>  static void nf_ct_gc_expired(struct nf_conn *ct)
>  {
> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
>  	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>  	const struct nf_conntrack_l4proto *l4proto;
>  
> +	enum ip_conntrack_info oldinfo;
> +	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
> +
>  	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
>  	if (l4proto->allow_clash &&
> -	    ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
>  	    !nf_ct_is_dying(ct) &&
>  	    atomic_inc_not_zero(&ct->ct_general.use)) {
> -		enum ip_conntrack_info oldinfo;
> -		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
> -
> -		nf_ct_acct_merge(ct, ctinfo, loser_ct);
> -		nf_conntrack_put(&loser_ct->ct_general);
> -		nf_ct_set(skb, ct, oldinfo);
> -		return NF_ACCEPT;
> +		if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
> +		    nf_ct_match(ct, loser_ct)) {
> +			nf_ct_acct_merge(ct, ctinfo, loser_ct);
> +			nf_conntrack_put(&loser_ct->ct_general);
> +			nf_ct_set(skb, ct, oldinfo);
> +			return NF_ACCEPT;

This will not work because the packet losing race will get mangled by
when NAT is performed.

Did you test this? Or this is some sort of RFC?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martynas Pumputis July 7, 2018, 10:44 a.m. UTC | #2
On 4 July 2018 at 19:14, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
>> This patch enables the clash resolution for NAT (disabled in
>> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
>> and a protocol allows it.
>>
>> The clash might happen for a connections-less protocol (e.g. UDP) when
>> two threads in parallel writes to the same socket and consequent calls
>> to "get_unique_tuple" return the same tuples (incl. reply tuples).
>>
>> In this case it is safe to perform the resolution, as the losing CT
>> describes the same mangling as the winning CT, so no modifications to
>> the packet are needed, and the result of rules traversal for the loser's
>> packet stays valid.
>>
>> Signed-off-by: Martynas Pumputis <martynas@weave.works>
>> ---
>>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
>>  1 file changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
>> index 3465da2a98bd..cc5ef8c9d0da 100644
>> --- a/net/netfilter/nf_conntrack_core.c
>> +++ b/net/netfilter/nf_conntrack_core.c
>> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
>>              net_eq(net, nf_ct_net(ct));
>>  }
>>
>> +static inline bool
>> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
>> +{
>> +     return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
>> +                              &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
>> +            nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
>> +                              &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
>> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
>> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
>> +            net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
>> +}
>> +
>>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
>>  static void nf_ct_gc_expired(struct nf_conn *ct)
>>  {
>> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
>>       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>>       const struct nf_conntrack_l4proto *l4proto;
>>
>> +     enum ip_conntrack_info oldinfo;
>> +     struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>> +
>>       l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
>>       if (l4proto->allow_clash &&
>> -         ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
>>           !nf_ct_is_dying(ct) &&
>>           atomic_inc_not_zero(&ct->ct_general.use)) {
>> -             enum ip_conntrack_info oldinfo;
>> -             struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>> -
>> -             nf_ct_acct_merge(ct, ctinfo, loser_ct);
>> -             nf_conntrack_put(&loser_ct->ct_general);
>> -             nf_ct_set(skb, ct, oldinfo);
>> -             return NF_ACCEPT;
>> +             if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
>> +                 nf_ct_match(ct, loser_ct)) {
>> +                     nf_ct_acct_merge(ct, ctinfo, loser_ct);
>> +                     nf_conntrack_put(&loser_ct->ct_general);
>> +                     nf_ct_set(skb, ct, oldinfo);
>> +                     return NF_ACCEPT;
>
> This will not work because the packet losing race will get mangled by
> when NAT is performed.

The idea of this patch is to resolve the conflict only among packets
with the same mangling (= with matching tuples). The mangling happens
before the confirmation. Please correct me if I'm missing something.

>
> Did you test this? Or this is some sort of RFC?

I've tested it and put the steps for reproducing the issue and logs
for different configurations at https://github.com/brb/conntrack-race.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso July 9, 2018, 6:12 p.m. UTC | #3
On Sat, Jul 07, 2018 at 12:44:10PM +0200, Martynas Pumputis wrote:
> On 4 July 2018 at 19:14, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
> >> This patch enables the clash resolution for NAT (disabled in
> >> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
> >> and a protocol allows it.
> >>
> >> The clash might happen for a connections-less protocol (e.g. UDP) when
> >> two threads in parallel writes to the same socket and consequent calls
> >> to "get_unique_tuple" return the same tuples (incl. reply tuples).
> >>
> >> In this case it is safe to perform the resolution, as the losing CT
> >> describes the same mangling as the winning CT, so no modifications to
> >> the packet are needed, and the result of rules traversal for the loser's
> >> packet stays valid.
> >>
> >> Signed-off-by: Martynas Pumputis <martynas@weave.works>
> >> ---
> >>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
> >>  1 file changed, 23 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> >> index 3465da2a98bd..cc5ef8c9d0da 100644
> >> --- a/net/netfilter/nf_conntrack_core.c
> >> +++ b/net/netfilter/nf_conntrack_core.c
> >> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
> >>              net_eq(net, nf_ct_net(ct));
> >>  }
> >>
> >> +static inline bool
> >> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
> >> +{
> >> +     return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
> >> +                              &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
> >> +            nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
> >> +                              &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
> >> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
> >> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
> >> +            net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
> >> +}
> >> +
> >>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
> >>  static void nf_ct_gc_expired(struct nf_conn *ct)
> >>  {
> >> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
> >>       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> >>       const struct nf_conntrack_l4proto *l4proto;
> >>
> >> +     enum ip_conntrack_info oldinfo;
> >> +     struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
> >> +
> >>       l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
> >>       if (l4proto->allow_clash &&
> >> -         ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
> >>           !nf_ct_is_dying(ct) &&
> >>           atomic_inc_not_zero(&ct->ct_general.use)) {
> >> -             enum ip_conntrack_info oldinfo;
> >> -             struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
> >> -
> >> -             nf_ct_acct_merge(ct, ctinfo, loser_ct);
> >> -             nf_conntrack_put(&loser_ct->ct_general);
> >> -             nf_ct_set(skb, ct, oldinfo);
> >> -             return NF_ACCEPT;
> >> +             if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
> >> +                 nf_ct_match(ct, loser_ct)) {
> >> +                     nf_ct_acct_merge(ct, ctinfo, loser_ct);
> >> +                     nf_conntrack_put(&loser_ct->ct_general);
> >> +                     nf_ct_set(skb, ct, oldinfo);
> >> +                     return NF_ACCEPT;
> >
> > This will not work because the packet losing race will get mangled by
> > when NAT is performed.
> 
> The idea of this patch is to resolve the conflict only among packets
> with the same mangling (= with matching tuples). The mangling happens
> before the confirmation. Please correct me if I'm missing something.

Yes, the mangling happens before confirmation, but will mangle the
source port to make sure two makes don't get the same tuple. Hence,
the two packets will get different source port numbers, that's why we
need the code to undo the NAT mangling in
368982cd7d1bd41cd39049c794990aca3770db44.

Have a look at tcpdump to make sure packets are mangled consistently,
not just going through / not dropped.

> > Did you test this? Or this is some sort of RFC?
> 
> I've tested it and put the steps for reproducing the issue and logs
> for different configurations at https://github.com/brb/conntrack-race.

OK, are you watching for packets drops to make sure your fix work, I
mean, how are you doing the validation?

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martynas Pumputis July 9, 2018, 6:42 p.m. UTC | #4
On 9 July 2018 at 20:12, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Sat, Jul 07, 2018 at 12:44:10PM +0200, Martynas Pumputis wrote:
>> On 4 July 2018 at 19:14, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> > On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
>> >> This patch enables the clash resolution for NAT (disabled in
>> >> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
>> >> and a protocol allows it.
>> >>
>> >> The clash might happen for a connections-less protocol (e.g. UDP) when
>> >> two threads in parallel writes to the same socket and consequent calls
>> >> to "get_unique_tuple" return the same tuples (incl. reply tuples).
>> >>
>> >> In this case it is safe to perform the resolution, as the losing CT
>> >> describes the same mangling as the winning CT, so no modifications to
>> >> the packet are needed, and the result of rules traversal for the loser's
>> >> packet stays valid.
>> >>
>> >> Signed-off-by: Martynas Pumputis <martynas@weave.works>
>> >> ---
>> >>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
>> >>  1 file changed, 23 insertions(+), 8 deletions(-)
>> >>
>> >> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
>> >> index 3465da2a98bd..cc5ef8c9d0da 100644
>> >> --- a/net/netfilter/nf_conntrack_core.c
>> >> +++ b/net/netfilter/nf_conntrack_core.c
>> >> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
>> >>              net_eq(net, nf_ct_net(ct));
>> >>  }
>> >>
>> >> +static inline bool
>> >> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
>> >> +{
>> >> +     return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
>> >> +                              &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
>> >> +            nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
>> >> +                              &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
>> >> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
>> >> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
>> >> +            net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
>> >> +}
>> >> +
>> >>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
>> >>  static void nf_ct_gc_expired(struct nf_conn *ct)
>> >>  {
>> >> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
>> >>       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>> >>       const struct nf_conntrack_l4proto *l4proto;
>> >>
>> >> +     enum ip_conntrack_info oldinfo;
>> >> +     struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>> >> +
>> >>       l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
>> >>       if (l4proto->allow_clash &&
>> >> -         ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
>> >>           !nf_ct_is_dying(ct) &&
>> >>           atomic_inc_not_zero(&ct->ct_general.use)) {
>> >> -             enum ip_conntrack_info oldinfo;
>> >> -             struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>> >> -
>> >> -             nf_ct_acct_merge(ct, ctinfo, loser_ct);
>> >> -             nf_conntrack_put(&loser_ct->ct_general);
>> >> -             nf_ct_set(skb, ct, oldinfo);
>> >> -             return NF_ACCEPT;
>> >> +             if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
>> >> +                 nf_ct_match(ct, loser_ct)) {
>> >> +                     nf_ct_acct_merge(ct, ctinfo, loser_ct);
>> >> +                     nf_conntrack_put(&loser_ct->ct_general);
>> >> +                     nf_ct_set(skb, ct, oldinfo);
>> >> +                     return NF_ACCEPT;
>> >
>> > This will not work because the packet losing race will get mangled by
>> > when NAT is performed.
>>
>> The idea of this patch is to resolve the conflict only among packets
>> with the same mangling (= with matching tuples). The mangling happens
>> before the confirmation. Please correct me if I'm missing something.
>
> Yes, the mangling happens before confirmation, but will mangle the
> source port to make sure two makes don't get the same tuple. Hence,
> the two packets will get different source port numbers, that's why we
> need the code to undo the NAT mangling in
> 368982cd7d1bd41cd39049c794990aca3770db44.

If two packets get different source port numbers (by
"get_unique_tuple"), then their CT tuples won't match (= "nf_ct_match"
will return false) => the conflict won't be resolved.

>
> Have a look at tcpdump to make sure packets are mangled consistently,
> not just going through / not dropped.

Yes, I've checked traces from tcpdump to see whether manglings are OK.
The logs I provided include the traces.

>
>> > Did you test this? Or this is some sort of RFC?
>>
>> I've tested it and put the steps for reproducing the issue and logs
>> for different configurations at https://github.com/brb/conntrack-race.
>
> OK, are you watching for packets drops to make sure your fix work, I
> mean, how are you doing the validation?

I'm validating the fix by logging packets for which the conflict
cannot be resolved and checking with tcpdump whether they have been
dropped.

>
> Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso July 10, 2018, 11:13 a.m. UTC | #5
On Mon, Jul 09, 2018 at 08:42:17PM +0200, Martynas Pumputis wrote:
> On 9 July 2018 at 20:12, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
[...]
> >> The idea of this patch is to resolve the conflict only among packets
> >> with the same mangling (= with matching tuples). The mangling happens
> >> before the confirmation. Please correct me if I'm missing something.
> >
> > Yes, the mangling happens before confirmation, but will mangle the
> > source port to make sure two makes don't get the same tuple. Hence,
> > the two packets will get different source port numbers, that's why we
> > need the code to undo the NAT mangling in
> > 368982cd7d1bd41cd39049c794990aca3770db44.
>
> If two packets get different source port numbers (by
> "get_unique_tuple"), then their CT tuples won't match (= "nf_ct_match"
> will return false) => the conflict won't be resolved.

I see, so this is just solving the conflict for a specific usecase
with NAT in place, ie. get_unique_tuple() returns same tuple...

But how so? With NAT in place, the packet losing race will eventually
get a different tuple, given the tuple that the first packet is using
would be already in use.

Do you have any other out-of-tree patch on top of this?

Or maybe I am missing anything? :-)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal July 10, 2018, 11:23 a.m. UTC | #6
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> I see, so this is just solving the conflict for a specific usecase
> with NAT in place, ie. get_unique_tuple() returns same tuple...
> 
> But how so? With NAT in place, the packet losing race will eventually
> get a different tuple, given the tuple that the first packet is using
> would be already in use.
> 
> Do you have any other out-of-tree patch on top of this?
> 
> Or maybe I am missing anything? :-)

AFAIU its locally running (threaded) process, so both skbs have
non-confirmed conntracks.

Thus, get_unique_tuple() can have high chance of not finding
reverse tuple in the conntrack table because the clashing one is
also not in global hash yet.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martynas Pumputis July 10, 2018, 11:27 a.m. UTC | #7
On 10 July 2018 at 13:23, Florian Westphal <fw@strlen.de> wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> I see, so this is just solving the conflict for a specific usecase
>> with NAT in place, ie. get_unique_tuple() returns same tuple...
>>
>> But how so? With NAT in place, the packet losing race will eventually
>> get a different tuple, given the tuple that the first packet is using
>> would be already in use.
>>
>> Do you have any other out-of-tree patch on top of this?
>>
>> Or maybe I am missing anything? :-)
>
> AFAIU its locally running (threaded) process, so both skbs have
> non-confirmed conntracks.
>
> Thus, get_unique_tuple() can have high chance of not finding
> reverse tuple in the conntrack table because the clashing one is
> also not in global hash yet.

Exactly!
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso July 10, 2018, 11:31 a.m. UTC | #8
On Tue, Jul 10, 2018 at 01:23:01PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > I see, so this is just solving the conflict for a specific usecase
> > with NAT in place, ie. get_unique_tuple() returns same tuple...
> > 
> > But how so? With NAT in place, the packet losing race will eventually
> > get a different tuple, given the tuple that the first packet is using
> > would be already in use.
> > 
> > Do you have any other out-of-tree patch on top of this?
> > 
> > Or maybe I am missing anything? :-)
> 
> AFAIU its locally running (threaded) process, so both skbs have
> non-confirmed conntracks.
> 
> Thus, get_unique_tuple() can have high chance of not finding
> reverse tuple in the conntrack table because the clashing one is
> also not in global hash yet.

Ah I see, it's indeed different scenario than the one I fixed for
nfqueue.

OK, I'm going to request some minor changes to this patch and we move
on.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso July 10, 2018, 11:37 a.m. UTC | #9
On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
> This patch enables the clash resolution for NAT (disabled in
> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
> and a protocol allows it.
> 
> The clash might happen for a connections-less protocol (e.g. UDP) when
> two threads in parallel writes to the same socket and consequent calls
> to "get_unique_tuple" return the same tuples (incl. reply tuples).
> 
> In this case it is safe to perform the resolution, as the losing CT
> describes the same mangling as the winning CT, so no modifications to
> the packet are needed, and the result of rules traversal for the loser's
> packet stays valid.

Applied, thanks.

Will make a minor comestic change before applying, see below.

> Signed-off-by: Martynas Pumputis <martynas@weave.works>
> ---
>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
>  1 file changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 3465da2a98bd..cc5ef8c9d0da 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
>  	       net_eq(net, nf_ct_net(ct));
>  }
>  
> +static inline bool
> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
> +{
> +	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
> +				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
> +	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
> +				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
> +	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
> +	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
> +	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
> +}
> +
>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
>  static void nf_ct_gc_expired(struct nf_conn *ct)
>  {
> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
>  	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>  	const struct nf_conntrack_l4proto *l4proto;
>  
> +	enum ip_conntrack_info oldinfo;
> +	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
> +

Will remove this empty line break.

>  	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
>  	if (l4proto->allow_clash &&
> -	    ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
>  	    !nf_ct_is_dying(ct) &&
>  	    atomic_inc_not_zero(&ct->ct_general.use)) {
> -		enum ip_conntrack_info oldinfo;
> -		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
> -
> -		nf_ct_acct_merge(ct, ctinfo, loser_ct);
> -		nf_conntrack_put(&loser_ct->ct_general);
> -		nf_ct_set(skb, ct, oldinfo);
> -		return NF_ACCEPT;
> +		if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
> +		    nf_ct_match(ct, loser_ct)) {
> +			nf_ct_acct_merge(ct, ctinfo, loser_ct);
> +			nf_conntrack_put(&loser_ct->ct_general);
> +			nf_ct_set(skb, ct, oldinfo);
> +			return NF_ACCEPT;
> +		}
> +		nf_ct_put(ct);
>  	}
>  	NF_CT_STAT_INC(net, drop);
>  	return NF_DROP;
> -- 
> 2.18.0
> 
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martynas Pumputis July 10, 2018, 11:56 a.m. UTC | #10
On 10 July 2018 at 13:37, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
>> This patch enables the clash resolution for NAT (disabled in
>> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
>> and a protocol allows it.
>>
>> The clash might happen for a connections-less protocol (e.g. UDP) when
>> two threads in parallel writes to the same socket and consequent calls
>> to "get_unique_tuple" return the same tuples (incl. reply tuples).
>>
>> In this case it is safe to perform the resolution, as the losing CT
>> describes the same mangling as the winning CT, so no modifications to
>> the packet are needed, and the result of rules traversal for the loser's
>> packet stays valid.
>
> Applied, thanks.
>
> Will make a minor comestic change before applying, see below.

Thanks!

>
>> Signed-off-by: Martynas Pumputis <martynas@weave.works>
>> ---
>>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
>>  1 file changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
>> index 3465da2a98bd..cc5ef8c9d0da 100644
>> --- a/net/netfilter/nf_conntrack_core.c
>> +++ b/net/netfilter/nf_conntrack_core.c
>> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
>>              net_eq(net, nf_ct_net(ct));
>>  }
>>
>> +static inline bool
>> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
>> +{
>> +     return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
>> +                              &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
>> +            nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
>> +                              &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
>> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
>> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
>> +            net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
>> +}
>> +
>>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
>>  static void nf_ct_gc_expired(struct nf_conn *ct)
>>  {
>> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
>>       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>>       const struct nf_conntrack_l4proto *l4proto;
>>
>> +     enum ip_conntrack_info oldinfo;
>> +     struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>> +
>
> Will remove this empty line break.
>
>>       l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
>>       if (l4proto->allow_clash &&
>> -         ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
>>           !nf_ct_is_dying(ct) &&
>>           atomic_inc_not_zero(&ct->ct_general.use)) {
>> -             enum ip_conntrack_info oldinfo;
>> -             struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>> -
>> -             nf_ct_acct_merge(ct, ctinfo, loser_ct);
>> -             nf_conntrack_put(&loser_ct->ct_general);
>> -             nf_ct_set(skb, ct, oldinfo);
>> -             return NF_ACCEPT;
>> +             if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
>> +                 nf_ct_match(ct, loser_ct)) {
>> +                     nf_ct_acct_merge(ct, ctinfo, loser_ct);
>> +                     nf_conntrack_put(&loser_ct->ct_general);
>> +                     nf_ct_set(skb, ct, oldinfo);
>> +                     return NF_ACCEPT;
>> +             }
>> +             nf_ct_put(ct);
>>       }
>>       NF_CT_STAT_INC(net, drop);
>>       return NF_DROP;
>> --
>> 2.18.0
>>
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martynas Pumputis July 15, 2018, 8:43 a.m. UTC | #11
As the patch is in the "Changes Requested" state and to avoid any
misunderstanding, do you want me to re-submit the patch with the minor
changes applied or are you going to do it yourself? Thanks.

On 10 July 2018 at 13:56, Martynas Pumputis <martynas@weave.works> wrote:
> On 10 July 2018 at 13:37, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> On Mon, Jul 02, 2018 at 04:52:14PM +0200, Martynas Pumputis wrote:
>>> This patch enables the clash resolution for NAT (disabled in
>>> "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
>>> and a protocol allows it.
>>>
>>> The clash might happen for a connections-less protocol (e.g. UDP) when
>>> two threads in parallel writes to the same socket and consequent calls
>>> to "get_unique_tuple" return the same tuples (incl. reply tuples).
>>>
>>> In this case it is safe to perform the resolution, as the losing CT
>>> describes the same mangling as the winning CT, so no modifications to
>>> the packet are needed, and the result of rules traversal for the loser's
>>> packet stays valid.
>>
>> Applied, thanks.
>>
>> Will make a minor comestic change before applying, see below.
>
> Thanks!
>
>>
>>> Signed-off-by: Martynas Pumputis <martynas@weave.works>
>>> ---
>>>  net/netfilter/nf_conntrack_core.c | 31 +++++++++++++++++++++++--------
>>>  1 file changed, 23 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
>>> index 3465da2a98bd..cc5ef8c9d0da 100644
>>> --- a/net/netfilter/nf_conntrack_core.c
>>> +++ b/net/netfilter/nf_conntrack_core.c
>>> @@ -502,6 +502,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
>>>              net_eq(net, nf_ct_net(ct));
>>>  }
>>>
>>> +static inline bool
>>> +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
>>> +{
>>> +     return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
>>> +                              &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
>>> +            nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
>>> +                              &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
>>> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
>>> +            nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
>>> +            net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
>>> +}
>>> +
>>>  /* caller must hold rcu readlock and none of the nf_conntrack_locks */
>>>  static void nf_ct_gc_expired(struct nf_conn *ct)
>>>  {
>>> @@ -696,18 +708,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
>>>       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>>>       const struct nf_conntrack_l4proto *l4proto;
>>>
>>> +     enum ip_conntrack_info oldinfo;
>>> +     struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>>> +
>>
>> Will remove this empty line break.
>>
>>>       l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
>>>       if (l4proto->allow_clash &&
>>> -         ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
>>>           !nf_ct_is_dying(ct) &&
>>>           atomic_inc_not_zero(&ct->ct_general.use)) {
>>> -             enum ip_conntrack_info oldinfo;
>>> -             struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
>>> -
>>> -             nf_ct_acct_merge(ct, ctinfo, loser_ct);
>>> -             nf_conntrack_put(&loser_ct->ct_general);
>>> -             nf_ct_set(skb, ct, oldinfo);
>>> -             return NF_ACCEPT;
>>> +             if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
>>> +                 nf_ct_match(ct, loser_ct)) {
>>> +                     nf_ct_acct_merge(ct, ctinfo, loser_ct);
>>> +                     nf_conntrack_put(&loser_ct->ct_general);
>>> +                     nf_ct_set(skb, ct, oldinfo);
>>> +                     return NF_ACCEPT;
>>> +             }
>>> +             nf_ct_put(ct);
>>>       }
>>>       NF_CT_STAT_INC(net, drop);
>>>       return NF_DROP;
>>> --
>>> 2.18.0
>>>
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso July 16, 2018, 3:04 p.m. UTC | #12
On Sun, Jul 15, 2018 at 10:43:46AM +0200, Martynas Pumputis wrote:
> As the patch is in the "Changes Requested" state and to avoid any
> misunderstanding, do you want me to re-submit the patch with the minor
> changes applied or are you going to do it yourself? Thanks.

Just changed it back to "Under review".

Still traveling back home, will be handling this asap.

Thanks for your patience.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso July 16, 2018, 3:18 p.m. UTC | #13
On Mon, Jul 16, 2018 at 05:04:14PM +0200, Pablo Neira Ayuso wrote:
> On Sun, Jul 15, 2018 at 10:43:46AM +0200, Martynas Pumputis wrote:
> > As the patch is in the "Changes Requested" state and to avoid any
> > misunderstanding, do you want me to re-submit the patch with the minor
> > changes applied or are you going to do it yourself? Thanks.
> 
> Just changed it back to "Under review".
> 
> Still traveling back home, will be handling this asap.

To clarify, no need to resubmit, will just do the little nitpick
mangling I propose and apply.

Thanks Martynas.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martynas Pumputis July 17, 2018, 8:20 a.m. UTC | #14
Nice, thanks!

On 16 July 2018 at 17:18, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Mon, Jul 16, 2018 at 05:04:14PM +0200, Pablo Neira Ayuso wrote:
>> On Sun, Jul 15, 2018 at 10:43:46AM +0200, Martynas Pumputis wrote:
>> > As the patch is in the "Changes Requested" state and to avoid any
>> > misunderstanding, do you want me to re-submit the patch with the minor
>> > changes applied or are you going to do it yourself? Thanks.
>>
>> Just changed it back to "Under review".
>>
>> Still traveling back home, will be handling this asap.
>
> To clarify, no need to resubmit, will just do the little nitpick
> mangling I propose and apply.
>
> Thanks Martynas.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3465da2a98bd..cc5ef8c9d0da 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -502,6 +502,18 @@  nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 	       net_eq(net, nf_ct_net(ct));
 }
 
+static inline bool
+nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
+	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
+	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
+}
+
 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
 static void nf_ct_gc_expired(struct nf_conn *ct)
 {
@@ -696,18 +708,21 @@  static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 	const struct nf_conntrack_l4proto *l4proto;
 
+	enum ip_conntrack_info oldinfo;
+	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	if (l4proto->allow_clash &&
-	    ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
-		enum ip_conntrack_info oldinfo;
-		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
-
-		nf_ct_acct_merge(ct, ctinfo, loser_ct);
-		nf_conntrack_put(&loser_ct->ct_general);
-		nf_ct_set(skb, ct, oldinfo);
-		return NF_ACCEPT;
+		if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+		    nf_ct_match(ct, loser_ct)) {
+			nf_ct_acct_merge(ct, ctinfo, loser_ct);
+			nf_conntrack_put(&loser_ct->ct_general);
+			nf_ct_set(skb, ct, oldinfo);
+			return NF_ACCEPT;
+		}
+		nf_ct_put(ct);
 	}
 	NF_CT_STAT_INC(net, drop);
 	return NF_DROP;