netfilter: nf_nat: return the same reply tuple for matching CTs

Message ID 20180802180557.14217-1-martynas@weave.works
State Under Review
Delegated to: Pablo Neira
Headers show
Series
  • netfilter: nf_nat: return the same reply tuple for matching CTs
Related show

Commit Message

Martynas Pumputis Aug. 2, 2018, 6:05 p.m.
It is possible that two concurrent packets originating from the same
socket of a connection-less protocol (e.g. UDP) can end up having
different IP_CT_DIR_REPLY tuples which results in one of the packets
being dropped.

To illustrate this, consider the following simplified scenario:

1. No DNAT/SNAT/MASQUEARADE rules are installed, but the nf_nat module
   is loaded.
2. Packet A and B are sent at the same time from two different threads
   via the same UDP socket which hasn't been used before (=no CT has
   been created before). Both packets have the same IP_CT_DIR_ORIGINAL
   tuple.
3. CT of A has been created and confirmed, afterwards get_unique_tuple
   is called for B. Because IP_CT_DIR_REPLY tuple (the inverse of
   the IP_CT_DIR_ORIGINAL tuple) is already taken by the A's confirmed
   CT (nf_nat_used_tuple finds it), get_unique_tuple calls UDP's
   unique_tuple which returns a different IP_CT_DIR_REPLY tuple (usually
   with src port = 1024)
4. B's CT cannot get confirmed in __nf_conntrack_confirm due to
   the found IP_CT_DIR_ORIGINAL tuple of A and the different
   IP_CT_DIR_REPLY tuples, thus the packet B gets dropped.

This patch modifies get_unique_tuple in a way that the function might
return the already used by a confirmed CT reply tuple if a L4 protocol
allows the clash resolution and IP_CT_DIR_ORIGINAL tuples are equal.

Signed-off-by: Martynas Pumputis <martynas@weave.works>

---

I've tested the patch with https://github.com/brb/conntrack-race in three
different scenarios (no NAT, SNAT, DNAT) by checking "insert_failed" and
"drop" counters, PCAP traces and dynamic debug logs.
---
 include/net/netfilter/nf_conntrack.h     |  5 ++--
 include/net/netfilter/nf_nat.h           |  3 ++-
 net/ipv4/netfilter/nf_nat_proto_gre.c    |  2 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_nat_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_core.c        | 12 ++++++---
 net/netfilter/nf_nat_core.c              | 34 +++++++++++++++++++-----
 net/netfilter/nf_nat_proto_common.c      |  2 +-
 8 files changed, 46 insertions(+), 16 deletions(-)

Patch

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 062dc19b5840..498d5d8159f5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -135,8 +135,9 @@  void nf_conntrack_alter_reply(struct nf_conn *ct,
 
 /* Is this tuple taken? (ignoring any belonging to the given
    conntrack). */
-int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
-			     const struct nf_conn *ignored_conntrack);
+int nf_conntrack_reply_tuple_taken(const struct nf_conntrack_tuple *tuple,
+			     const struct nf_conn *ignored_conntrack,
+			     bool ignore_same_orig);
 
 #define NFCT_INFOMASK	7UL
 #define NFCT_PTRMASK	~(NFCT_INFOMASK)
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index a17eb2f8d40e..fee9737a65a7 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -49,7 +49,8 @@  struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct);
 
 /* Is this tuple already taken? (not by us)*/
 int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
-		      const struct nf_conn *ignored_conntrack);
+		      const struct nf_conn *ignored_conntrack,
+		      bool ignore_same_orig);
 
 static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct)
 {
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 00fda6331ce5..c3083b68d3c2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -72,7 +72,7 @@  gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
 
 	for (i = 0; ; ++key) {
 		*keyptr = htons(min + key % range_size);
-		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false))
 			return;
 	}
 
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 6d7cf1d79baf..589e9a9b5509 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -47,7 +47,7 @@  icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 	for (i = 0; ; ++id) {
 		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
 					     (id % range_size));
-		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false))
 			return;
 	}
 	return;
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index d9bf42ba44fa..cf47f5f549ee 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -49,7 +49,7 @@  icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
 	for (i = 0; ; ++id) {
 		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
 					     (id % range_size));
-		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false))
 			return;
 	}
 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index cc5ef8c9d0da..43a53eaff82c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -848,8 +848,9 @@  EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 /* Returns true if a connection correspondings to the tuple (required
    for NAT). */
 int
-nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
-			 const struct nf_conn *ignored_conntrack)
+nf_conntrack_reply_tuple_taken(const struct nf_conntrack_tuple *tuple,
+			       const struct nf_conn *ignored_conntrack,
+			       bool ignore_same_orig)
 {
 	struct net *net = nf_ct_net(ignored_conntrack);
 	const struct nf_conntrack_zone *zone;
@@ -878,6 +879,11 @@  nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 		}
 
 		if (nf_ct_key_equal(h, tuple, zone, net)) {
+			if (ignore_same_orig &&
+			    nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+				continue;
+			}
 			NF_CT_STAT_INC_ATOMIC(net, found);
 			rcu_read_unlock();
 			return 1;
@@ -893,7 +899,7 @@  nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
+EXPORT_SYMBOL_GPL(nf_conntrack_reply_tuple_taken);
 
 #define NF_CT_EVICTION_RANGE	8
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 46f9df99d276..12fb39d953e0 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -154,7 +154,8 @@  hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
 /* Is this tuple already taken? (not by us) */
 int
 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
-		  const struct nf_conn *ignored_conntrack)
+		  const struct nf_conn *ignored_conntrack,
+		  bool ignore_same_orig)
 {
 	/* Conntrack tracking doesn't keep track of outgoing tuples; only
 	 * incoming ones.  NAT means they don't have a fixed mapping,
@@ -165,7 +166,15 @@  nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
 	struct nf_conntrack_tuple reply;
 
 	nf_ct_invert_tuplepr(&reply, tuple);
-	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+	/* If ignore_same_orig is enabled, the following function will ignore
+	 * any matching CT with the same IP_CT_DIR_ORIGINAL tuple.
+	 *
+	 * Used when calling the function for a CT of a connection-less protocol
+	 * such as UDP to ignore a clashing CT which originated from the same
+	 * socket.
+	 */
+	return nf_conntrack_reply_tuple_taken(&reply, ignored_conntrack,
+					      ignore_same_orig);
 }
 EXPORT_SYMBOL(nf_nat_used_tuple);
 
@@ -323,7 +332,9 @@  get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	const struct nf_conntrack_zone *zone;
 	const struct nf_nat_l3proto *l3proto;
 	const struct nf_nat_l4proto *l4proto;
+	const struct nf_conntrack_l4proto *ct_l4proto;
 	struct net *net = nf_ct_net(ct);
+	bool ignore_same_orig = false;
 
 	zone = nf_ct_zone(ct);
 
@@ -331,6 +342,16 @@  get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
 	l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
 					orig_tuple->dst.protonum);
+	ct_l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+
+	/* If the protocol allows the clash resolution, then when searching
+	 * for clashing CTs ignore the ones with the same IP_CT_DIR_ORIGINAL
+	 * tuple as they originate from the same socket. This prevents from
+	 * generating different reply tuples for two racing packets from
+	 * the same connection-less (e.g. UDP) socket which results in dropping
+	 * one of the packets in __nf_conntrack_confirm.
+	 */
+	ignore_same_orig = ct_l4proto->allow_clash;
 
 	/* 1) If this srcip/proto/src-proto-part is currently mapped,
 	 * and that same mapping gives a unique tuple within the given
@@ -344,14 +365,15 @@  get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
 		/* try the original tuple first */
 		if (in_range(l3proto, l4proto, orig_tuple, range)) {
-			if (!nf_nat_used_tuple(orig_tuple, ct)) {
+			if (!nf_nat_used_tuple(orig_tuple, ct,
+					       ignore_same_orig)) {
 				*tuple = *orig_tuple;
 				goto out;
 			}
 		} else if (find_appropriate_src(net, zone, l3proto, l4proto,
 						orig_tuple, tuple, range)) {
 			pr_debug("get_unique_tuple: Found current src map\n");
-			if (!nf_nat_used_tuple(tuple, ct))
+			if (!nf_nat_used_tuple(tuple, ct, ignore_same_orig))
 				goto out;
 		}
 	}
@@ -372,9 +394,9 @@  get_unique_tuple(struct nf_conntrack_tuple *tuple,
 			          &range->min_proto,
 			          &range->max_proto) &&
 			    (range->min_proto.all == range->max_proto.all ||
-			     !nf_nat_used_tuple(tuple, ct)))
+			     !nf_nat_used_tuple(tuple, ct, ignore_same_orig)))
 				goto out;
-		} else if (!nf_nat_used_tuple(tuple, ct)) {
+		} else if (!nf_nat_used_tuple(tuple, ct, ignore_same_orig)) {
 			goto out;
 		}
 	}
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 5d849d835561..851517cdfbd7 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -91,7 +91,7 @@  void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 
 	for (i = 0; ; ++off) {
 		*portptr = htons(min + off % range_size);
-		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
+		if (++i != range_size && nf_nat_used_tuple(tuple, ct, false))
 			continue;
 		if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
 					NF_NAT_RANGE_PROTO_OFFSET)))