[v3,net-next,3/3] net/sched: sch_frag: add generic packet fragment support.

Message ID 1605829116-10056-4-git-send-email-wenxu@ucloud.cn
State Superseded
Series net/sched: fix over mtu packet of defrag in

Commit Message

wenxu Nov. 19, 2020, 11:38 p.m. UTC
From: wenxu <wenxu@ucloud.cn>

Currently the kernel tc subsystem can do conntrack in act_ct. But when
several fragments of a packet go through act_ct, tcf_ct_handle_fragments
will reassemble them into one big packet. If the last action is a mirred
redirect, the reassembled packet may exceed the MTU of the target
device.

This patch adds support for an xmit hook to mirred that gets executed
before transmitting the packet. When act_ct gets loaded, it configures
that hook. The frag xmit hook may be reused by other modules.
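
As an illustration (a hypothetical setup, not part of this patch), a
rule chain like the following can hit the problem: act_ct reassembles
the fragments on ingress, then mirred transmits the oversized packet:

	# assumption: eth1's MTU is smaller than the reassembled packet
	tc qdisc add dev eth0 clsact
	tc filter add dev eth0 ingress protocol ip flower \
		action ct pipe action mirred egress redirect dev eth1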

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
v2: make act_frag built-in for the tc core rather than a module
    return an error code from tcf_fragment
    depends on INET for ip_do_fragment
v3: put the whole sch_frag.c under CONFIG_INET

 include/net/act_api.h     |   8 +++
 include/net/sch_generic.h |   2 +
 net/sched/Makefile        |   1 +
 net/sched/act_api.c       |  44 ++++++++++++++
 net/sched/act_ct.c        |   7 +++
 net/sched/act_mirred.c    |   2 +-
 net/sched/sch_frag.c      | 150 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 net/sched/sch_frag.c

Comments

Jakub Kicinski Nov. 24, 2020, 7:24 p.m. UTC | #1
On Fri, 20 Nov 2020 07:38:36 +0800 wenxu@ucloud.cn wrote:
> +int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
> +{
> +	xmit_hook_func *xmit_hook;
> +
> +	xmit_hook = rcu_dereference(tcf_xmit_hook);
> +	if (xmit_hook)
> +		return xmit_hook(skb, xmit);
> +	else
> +		return xmit(skb);
> +}
> +EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit);

I'm concerned about the performance impact of these indirect calls.

Did you check what code the compiler will generate? What is the impact
going to be with retpolines enabled?

Now that sch_frag is no longer a module this could be simplified.

First of all - xmit_hook can only be sch_frag_xmit_hook, so please use
that directly. 

	if (READ_ONCE(tcf_xmit_hook_count)) 
		sch_frag_xmit_hook(...
	else
		dev_queue_xmit(...

The abstraction is costly and not necessary right now IMO.

Then probably the counter should be:

	u32 __read_mostly tcf_xmit_hook_count;

To avoid byte loads and having it be placed in an unlucky cache line.

You could also make the helper a static inline in a header.


Unless I'm not giving the compiler enough credit and the performance
impact of this patch with retpolines on is indiscernible, but that'd
need to be proven by testing...
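
Spelled out, that simplification amounts to something like this (an
expansion of the pseudocode above, reusing the names from the patch;
not code from this series):

	u32 tcf_xmit_hook_count __read_mostly;

	int tcf_dev_queue_xmit(struct sk_buff *skb,
			       int (*xmit)(struct sk_buff *skb))
	{
		/* test a plain counter and call the only possible hook
		 * directly, instead of an indirect call through an RCU
		 * pointer (which retpolines make expensive)
		 */
		if (READ_ONCE(tcf_xmit_hook_count))
			return sch_frag_xmit_hook(skb, xmit);
		return xmit(skb);
	}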
wenxu Nov. 24, 2020, 11:10 p.m. UTC | #2
On 2020/11/25 3:24, Jakub Kicinski wrote:
> On Fri, 20 Nov 2020 07:38:36 +0800 wenxu@ucloud.cn wrote:
>> +int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
>> +{
>> +	xmit_hook_func *xmit_hook;
>> +
>> +	xmit_hook = rcu_dereference(tcf_xmit_hook);
>> +	if (xmit_hook)
>> +		return xmit_hook(skb, xmit);
>> +	else
>> +		return xmit(skb);
>> +}
>> +EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit);
> I'm concerned about the performance impact of these indirect calls.
>
> Did you check what code the compiler will generate? What is the impact
> going to be with retpolines enabled?
>
> Now that sch_frag is no longer a module this could be simplified.
>
> First of all - xmit_hook can only be sch_frag_xmit_hook, so please use
> that directly. 
>
> 	if (READ_ONCE(tcf_xmit_hook_count)) 
> 		sch_frag_xmit_hook(...
> 	else
> 		dev_queue_xmit(...
>
> The abstraction is costly and not necessary right now IMO.
>
> Then probably the counter should be:
>
> 	u32 __read_mostly tcf_xmit_hook_count;
>
> To avoid byte loads and having it be placed in an unlucky cache line.
Maybe replacing tcf_xmit_hook_count with a static key would be simpler?

DEFINE_STATIC_KEY_FALSE(tcf_xmit_hook_in_use);
Jakub Kicinski Nov. 25, 2020, 12:02 a.m. UTC | #3
On Wed, 25 Nov 2020 07:10:43 +0800 wenxu wrote:
> On 2020/11/25 3:24, Jakub Kicinski wrote:
> > On Fri, 20 Nov 2020 07:38:36 +0800 wenxu@ucloud.cn wrote:  
> >> +int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
> >> +{
> >> +	xmit_hook_func *xmit_hook;
> >> +
> >> +	xmit_hook = rcu_dereference(tcf_xmit_hook);
> >> +	if (xmit_hook)
> >> +		return xmit_hook(skb, xmit);
> >> +	else
> >> +		return xmit(skb);
> >> +}
> >> +EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit);  
> > I'm concerned about the performance impact of these indirect calls.
> >
> > Did you check what code the compiler will generate? What is the impact
> > going to be with retpolines enabled?
> >
> > Now that sch_frag is no longer a module this could be simplified.
> >
> > First of all - xmit_hook can only be sch_frag_xmit_hook, so please use
> > that directly. 
> >
> > 	if (READ_ONCE(tcf_xmit_hook_count)) 
> > 		sch_frag_xmit_hook(...
> > 	else
> > 		dev_queue_xmit(...
> >
> > The abstraction is costly and not necessary right now IMO.
> >
> > Then probably the counter should be:
> >
> > 	u32 __read_mostly tcf_xmit_hook_count;
> >
> > To avoid byte loads and having it be placed in an unlucky cache line.
> Maybe replacing tcf_xmit_hook_count with a static key would be simpler?
> 
> DEFINE_STATIC_KEY_FALSE(tcf_xmit_hook_in_use);

I wasn't sure if a static key would work with a module (mirred being a
module), but thinking about it again, if tcf_dev_queue_xmit() is not a
static inline but a normal function, it should work. Sounds good!
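
For reference, a minimal sketch of the static-key variant settled on
above (the key name comes from this thread; whether the final respin
looks exactly like this is an assumption):

	DEFINE_STATIC_KEY_FALSE(tcf_xmit_hook_in_use);

	int tcf_dev_queue_xmit(struct sk_buff *skb,
			       int (*xmit)(struct sk_buff *skb))
	{
		/* the branch is patched in only while act_ct holds the
		 * hook, so the common path costs one jump/nop and has
		 * no indirect call
		 */
		if (static_branch_unlikely(&tcf_xmit_hook_in_use))
			return sch_frag_xmit_hook(skb, xmit);
		return xmit(skb);
	}

	/* tcf_set_xmit_hook()/tcf_clear_xmit_hook() would then flip the
	 * key with static_branch_inc()/static_branch_dec() instead of
	 * maintaining a counter by hand
	 */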

Patch

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8721492..decb6de 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -239,6 +239,14 @@  int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
 			     struct netlink_ext_ack *newchain);
 struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
 					 struct tcf_chain *newchain);
+
+typedef int xmit_hook_func(struct sk_buff *skb,
+			   int (*xmit)(struct sk_buff *skb));
+
+int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));
+int tcf_set_xmit_hook(xmit_hook_func *xmit_hook);
+void tcf_clear_xmit_hook(void);
+
 #endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index dd74f06..162ed62 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1281,4 +1281,6 @@  void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
 void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
 				struct tcf_block *block);
 
+int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));
+
 #endif
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 66bbf9a..dd14ef4 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -5,6 +5,7 @@ 
 
 obj-y	:= sch_generic.o sch_mq.o
 
+obj-$(CONFIG_INET)		+= sch_frag.o
 obj-$(CONFIG_NET_SCHED)		+= sch_api.o sch_blackhole.o
 obj-$(CONFIG_NET_CLS)		+= cls_api.o
 obj-$(CONFIG_NET_CLS_ACT)	+= act_api.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 60e1572..fbb35a8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -22,6 +22,50 @@ 
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static xmit_hook_func __rcu *tcf_xmit_hook;
+static DEFINE_SPINLOCK(tcf_xmit_hook_lock);
+static u16 tcf_xmit_hook_count;
+
+int tcf_set_xmit_hook(xmit_hook_func *xmit_hook)
+{
+	spin_lock(&tcf_xmit_hook_lock);
+	if (!tcf_xmit_hook_count) {
+		rcu_assign_pointer(tcf_xmit_hook, xmit_hook);
+	} else if (xmit_hook != rcu_access_pointer(tcf_xmit_hook)) {
+		spin_unlock(&tcf_xmit_hook_lock);
+		return -EBUSY;
+	}
+
+	tcf_xmit_hook_count++;
+	spin_unlock(&tcf_xmit_hook_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcf_set_xmit_hook);
+
+void tcf_clear_xmit_hook(void)
+{
+	spin_lock(&tcf_xmit_hook_lock);
+	if (--tcf_xmit_hook_count == 0)
+		rcu_assign_pointer(tcf_xmit_hook, NULL);
+	spin_unlock(&tcf_xmit_hook_lock);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcf_clear_xmit_hook);
+
+int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
+{
+	xmit_hook_func *xmit_hook;
+
+	xmit_hook = rcu_dereference(tcf_xmit_hook);
+	if (xmit_hook)
+		return xmit_hook(skb, xmit);
+	else
+		return xmit(skb);
+}
+EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit);
+
 static void tcf_action_goto_chain_exec(const struct tc_action *a,
 				       struct tcf_result *res)
 {
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index aba3cd8..f82dc65 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1541,8 +1541,14 @@  static int __init ct_init_module(void)
 	if (err)
 		goto err_register;
 
+	err = tcf_set_xmit_hook(sch_frag_xmit_hook);
+	if (err)
+		goto err_action;
+
 	return 0;
 
+err_action:
+	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
 err_register:
 	tcf_ct_flow_tables_uninit();
 err_tbl_init:
@@ -1552,6 +1558,7 @@  static int __init ct_init_module(void)
 
 static void __exit ct_cleanup_module(void)
 {
+	tcf_clear_xmit_hook();
 	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
 	tcf_ct_flow_tables_uninit();
 	destroy_workqueue(act_ct_wq);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 17d0095..7153c67 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -210,7 +210,7 @@  static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
 	int err;
 
 	if (!want_ingress)
-		err = dev_queue_xmit(skb);
+		err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
 	else
 		err = netif_receive_skb(skb);
 
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
new file mode 100644
index 0000000..e1e77d3
--- /dev/null
+++ b/net/sched/sch_frag.c
@@ -0,0 +1,150 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+
+struct sch_frag_data {
+	unsigned long dst;
+	struct qdisc_skb_cb cb;
+	__be16 inner_protocol;
+	u16 vlan_tci;
+	__be16 vlan_proto;
+	unsigned int l2_len;
+	u8 l2_data[VLAN_ETH_HLEN];
+	int (*xmit)(struct sk_buff *skb);
+};
+
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage);
+
+static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);
+
+	if (skb_cow_head(skb, data->l2_len) < 0) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+
+	__skb_dst_copy(skb, data->dst);
+	*qdisc_skb_cb(skb) = data->cb;
+	skb->inner_protocol = data->inner_protocol;
+	if (data->vlan_tci & VLAN_CFI_MASK)
+		__vlan_hwaccel_put_tag(skb, data->vlan_proto,
+				       data->vlan_tci & ~VLAN_CFI_MASK);
+	else
+		__vlan_hwaccel_clear_tag(skb);
+
+	/* Reconstruct the MAC header.  */
+	skb_push(skb, data->l2_len);
+	memcpy(skb->data, &data->l2_data, data->l2_len);
+	skb_postpush_rcsum(skb, skb->data, data->l2_len);
+	skb_reset_mac_header(skb);
+
+	return data->xmit(skb);
+}
+
+static void sch_frag_prepare_frag(struct sk_buff *skb,
+				  int (*xmit)(struct sk_buff *skb))
+{
+	unsigned int hlen = skb_network_offset(skb);
+	struct sch_frag_data *data;
+
+	data = this_cpu_ptr(&sch_frag_data_storage);
+	data->dst = skb->_skb_refdst;
+	data->cb = *qdisc_skb_cb(skb);
+	data->xmit = xmit;
+	data->inner_protocol = skb->inner_protocol;
+	if (skb_vlan_tag_present(skb))
+		data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK;
+	else
+		data->vlan_tci = 0;
+	data->vlan_proto = skb->vlan_proto;
+	data->l2_len = hlen;
+	memcpy(&data->l2_data, skb->data, hlen);
+
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	skb_pull(skb, hlen);
+}
+
+static unsigned int
+sch_frag_dst_get_mtu(const struct dst_entry *dst)
+{
+	return dst->dev->mtu;
+}
+
+static struct dst_ops sch_frag_dst_ops = {
+	.family = AF_UNSPEC,
+	.mtu = sch_frag_dst_get_mtu,
+};
+
+static int sch_fragment(struct net *net, struct sk_buff *skb,
+			u16 mru, int (*xmit)(struct sk_buff *skb))
+{
+	int ret = -1;
+
+	if (skb_network_offset(skb) > VLAN_ETH_HLEN) {
+		net_warn_ratelimited("L2 header too long to fragment\n");
+		goto err;
+	}
+
+	if (skb_protocol(skb, true) == htons(ETH_P_IP)) {
+		struct dst_entry sch_frag_dst;
+		unsigned long orig_dst;
+
+		sch_frag_prepare_frag(skb, xmit);
+		dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1,
+			 DST_OBSOLETE_NONE, DST_NOCOUNT);
+		sch_frag_dst.dev = skb->dev;
+
+		orig_dst = skb->_skb_refdst;
+		skb_dst_set_noref(skb, &sch_frag_dst);
+		IPCB(skb)->frag_max_size = mru;
+
+		ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+		refdst_drop(orig_dst);
+	} else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
+		unsigned long orig_dst;
+		struct rt6_info sch_frag_rt;
+
+		sch_frag_prepare_frag(skb, xmit);
+		memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
+		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
+			 DST_OBSOLETE_NONE, DST_NOCOUNT);
+		sch_frag_rt.dst.dev = skb->dev;
+
+		orig_dst = skb->_skb_refdst;
+		skb_dst_set_noref(skb, &sch_frag_rt.dst);
+		IP6CB(skb)->frag_max_size = mru;
+
+		ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
+					       sch_frag_xmit);
+		refdst_drop(orig_dst);
+	} else {
+		net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",
+				     netdev_name(skb->dev),
+				     ntohs(skb_protocol(skb, true)), mru,
+				     skb->dev->mtu);
+		goto err;
+	}
+
+	return ret;
+err:
+	kfree_skb(skb);
+	return ret;
+}
+
+int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
+{
+	u16 mru = qdisc_skb_cb(skb)->mru;
+	int err;
+
+	if (mru && skb->len > mru + skb->dev->hard_header_len)
+		err = sch_fragment(dev_net(skb->dev), skb, mru, xmit);
+	else
+		err = xmit(skb);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(sch_frag_xmit_hook);