diff mbox series

[RFC,v2,net-next,07/12] net: ipv4: listified version of ip_rcv

Message ID f4b330bf-a1b3-2849-4265-faa0aaa1e941@solarflare.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Handle multiple received packets at each stage | expand

Commit Message

Edward Cree June 26, 2018, 6:20 p.m. UTC
Also involved adding a way to run a netfilter hook over a list of packets.
 Rather than attempting to make netfilter know about lists (which would be
 a major project in itself) we just let it call the regular okfn (in this
 case ip_rcv_finish()) for any packets it steals, and have it give us back
 a list of packets it's synchronously accepted (which normally NF_HOOK
 would automatically call okfn() on, but we want to be able to potentially
 pass the list to a listified version of okfn().)
The netfilter hooks themselves are indirect calls that still happen per-
 packet (see nf_hook_entry_hookfn()), but again, changing that can be left
 for future work.

There is potential for out-of-order receives if the netfilter hook ends up
 synchronously stealing packets, as they will be processed before any
 accepts earlier in the list.  However, it was already possible for an
 asynchronous accept to cause out-of-order receives, so presumably this is
 considered OK.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  3 ++
 include/linux/netfilter.h | 27 +++++++++++++++++
 include/net/ip.h          |  2 ++
 net/core/dev.c            | 11 +++++--
 net/ipv4/af_inet.c        |  1 +
 net/ipv4/ip_input.c       | 75 ++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 110 insertions(+), 9 deletions(-)

Comments

Florian Westphal June 27, 2018, 12:32 p.m. UTC | #1
Edward Cree <ecree@solarflare.com> wrote:
> Also involved adding a way to run a netfilter hook over a list of packets.
>  Rather than attempting to make netfilter know about lists (which would be
>  a major project in itself) we just let it call the regular okfn (in this
>  case ip_rcv_finish()) for any packets it steals, and have it give us back
>  a list of packets it's synchronously accepted (which normally NF_HOOK
>  would automatically call okfn() on, but we want to be able to potentially
>  pass the list to a listified version of okfn().)

okfn() is only used during async reinject in NFQUEUE case,
skb is queued in kernel and we'll wait for a verdict from a userspace
process.  If thats ACCEPT, then okfn() gets called to reinject the skb
into the network stack.

A normal -j ACCEPT doesn't call okfn in the netfilter core, which is why
this occurs on '1' retval in NF_HOOK().

Only other user of okfn() is bridge netfilter, so listified version of
okfn() doesn't make too much sense to me, its not used normally
(unless such listified version makes the code simpler of course).

AFAICS its fine to unlink/free skbs from the list to handle
drops/queueing etc. so a future version of nf_hook() could propagate the
list into nf_hook_slow and mangle the list there to deal with hooks
that steal/drop/queue skbs.

Later on we can pass the list to the hook functions themselves.

We'll have to handle non-accept verdicts in-place in the hook functions
for this, but fortunately most hookfns only return NF_ACCEPT so I think
it is manageable.

I'll look into this once the series makes it to net-next.
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 105087369779..5296354fa621 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2290,6 +2290,9 @@  struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
+	void			(*list_func) (struct sk_buff_head *,
+					      struct packet_type *,
+					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
 					    struct sock *sk);
 	void			*af_packet_priv;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index dd2052f0efb7..42395a8a6e70 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -288,6 +288,22 @@  NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
 	return ret;
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct sk_buff_head *list, struct sk_buff_head *sublist,
+	     struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(sublist); /* list of synchronously ACCEPTed skbs */
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+		if (ret == 1)
+			__skb_queue_tail(sublist, skb);
+	}
+}
+
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
 		  unsigned int len);
@@ -369,6 +385,17 @@  NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	return okfn(net, sk, skb);
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct sk_buff_head *list, struct sk_buff_head *sublist,
+	     struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	__skb_queue_head_init(sublist);
+	/* Move everything to the sublist */
+	skb_queue_splice_init(list, sublist);
+}
+
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 			  struct sock *sk, struct sk_buff *skb,
 			  struct net_device *indev, struct net_device *outdev,
diff --git a/include/net/ip.h b/include/net/ip.h
index 0d2281b4b27a..fb3dfed537c0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -138,6 +138,8 @@  int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	   struct net_device *orig_dev);
+void ip_list_rcv(struct sk_buff_head *list, struct packet_type *pt,
+		 struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2f46ed07c8d8..f0eb00e9fb57 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4684,8 +4684,15 @@  static inline void __netif_receive_skb_list_ptype(struct sk_buff_head *list,
 {
 	struct sk_buff *skb;
 
-	while ((skb = __skb_dequeue(list)) != NULL)
-		pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	if (!pt_prev)
+		return;
+	if (skb_queue_empty(list))
+		return;
+	if (pt_prev->list_func != NULL)
+		pt_prev->list_func(list, pt_prev, orig_dev);
+	else
+		while ((skb = __skb_dequeue(list)) != NULL)
+			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
 static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..e54381fe4b00 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1882,6 +1882,7 @@  fs_initcall(ipv4_offload_init);
 static struct packet_type ip_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
 	.func = ip_rcv,
+	.list_func = ip_list_rcv,
 };
 
 static int __init inet_init(void)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..7a8af8ff3f07 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -408,10 +408,9 @@  static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 /*
  * 	Main IP Receive routine.
  */
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 {
 	const struct iphdr *iph;
-	struct net *net;
 	u32 len;
 
 	/* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +420,6 @@  int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto drop;
 
 
-	net = dev_net(dev);
 	__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
 
 	skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +487,7 @@  int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);
 
-	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
-		       net, NULL, skb, dev, NULL,
-		       ip_rcv_finish);
+	return skb;
 
 csum_error:
 	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +496,70 @@  int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 drop:
 	kfree_skb(skb);
 out:
-	return NET_RX_DROP;
+	return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+	   struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+
+	skb = ip_rcv_core(skb, net);
+	if (skb == NULL)
+		return NET_RX_DROP;
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+		       net, NULL, skb, dev, NULL,
+		       ip_rcv_finish);
+}
+
+static void ip_sublist_rcv(struct sk_buff_head *list, struct net_device *dev,
+			   struct net *net)
+{
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+		     list, &sublist, dev, NULL, ip_rcv_finish);
+	while ((skb = __skb_dequeue(&sublist)) != NULL)
+		ip_rcv_finish(net, NULL, skb);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct sk_buff_head *list, struct packet_type *pt,
+		 struct net_device *orig_dev)
+{
+	struct net_device *curr_dev = NULL;
+	struct net *curr_net = NULL;
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&sublist);
+
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		struct net_device *dev = skb->dev;
+		struct net *net = dev_net(dev);
+
+		skb = ip_rcv_core(skb, net);
+		if (skb == NULL)
+			continue;
+
+		if (skb_queue_empty(&sublist)) {
+			curr_dev = dev;
+			curr_net = net;
+		} else if (curr_dev != dev || curr_net != net) {
+			/* dispatch old sublist */
+			ip_sublist_rcv(&sublist, dev, net);
+			/* start new sublist */
+			__skb_queue_head_init(&sublist);
+			curr_dev = dev;
+			curr_net = net;
+		}
+		/* add to current sublist */
+		__skb_queue_tail(&sublist, skb);
+	}
+	/* dispatch final sublist */
+	ip_sublist_rcv(&sublist, curr_dev, curr_net);
 }