
[RFC,3/6] net/sched: add CT action

Message ID 1ec3d8c3ec1256ae6cca2b498caac642c1ce09f0.1548285996.git.mleitner@redhat.com
State RFC
Delegated to: David Miller
Series Initial, PoC implementation of sw datapath of tc+CT

Commit Message

Marcelo Leitner Jan. 25, 2019, 2:32 a.m. UTC
This is where most of the code is, and where the main pain points are.

The implementation uses a spinlock on the datapath for now, just for
simplicity. Let's get the basics done and then move forward.

Open points:
- nf_ct_netns_get() accepts IPv4, IPv6 or both. It would be interesting to
  match what was specified on the filter, but it's not clear whether that is
  really wanted, nor how to do it.
- The iptables CT target can set a different zone for each direction and can
  also infer the zone from the mark. Neither is used by OvS. We can focus on
  this later, but we probably want to consider the need for it now.
- datapath fork
  As described in the planning RFC patch, the OvS ct action creates a fork in
  the datapath: consider a packet being sent through conntrack. The original
  packet, without conntrack information, first finishes executing the current
  list of actions. Once that is done, a clone created by the ct action is
  inserted into the specified chain and resumes its processing there. Somehow
  we need to be able to inject this packet, and we can't use the interface
  backlog for it, as that would cause massive reordering (see the sketch after
  this list).
- Handling multiple calls to the CT action is needed because the first call
  may see the packet still with its tunnel headers, while a later call sees it
  without them. This is handled in a subsequent patch by dropping any
  conntrack entry already attached to the skb.
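
As a rough illustration of the fork mentioned above (only a sketch, not part
of this patch; the conntrack and re-injection steps are placeholders):

	/* The original skb keeps running the remaining actions, while a
	 * clone goes through conntrack and is later re-injected into the
	 * chain given to the ct action.
	 */
	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

	if (!clone)
		return TC_ACT_SHOT;
	/* ... run 'clone' through nf_conntrack_in() ... */
	/* Open question: how to re-inject 'clone' into the target chain
	 * without using the interface backlog, which would reorder packets.
	 */
	return TC_ACT_PIPE;	/* original skb continues unchanged */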

Regarding the protocol type on the datapath, note that tc can match on both
IPv4 and IPv6 at once, and so far we can't easily tell which one the filter is
using. We could tell conntrack to handle both (NFPROTO_INET), but that would
be kind of a lazy solution here. Instead, let's trust the packet header: if
the packet got here, it's because tc matched it, so we can process it as
either IPv4 or IPv6 accordingly.
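
In the patch below this becomes a simple check on skb->protocol in
tcf_ct_act():

	/* Trust the header: the filter only got us here because it matched,
	 * so anything that is not IPv6 is handled as IPv4.
	 */
	if (skb->protocol == htons(ETH_P_IPV6))
		state.pf = NFPROTO_IPV6;
	else
		state.pf = NFPROTO_IPV4;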

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
---
 include/net/tc_act/tc_ct.h        |  29 +++
 include/uapi/linux/tc_act/tc_ct.h |  36 +++
 net/sched/Kconfig                 |   6 +
 net/sched/Makefile                |   1 +
 net/sched/act_ct.c                | 356 ++++++++++++++++++++++++++++++
 5 files changed, 428 insertions(+)
 create mode 100644 include/net/tc_act/tc_ct.h
 create mode 100644 include/uapi/linux/tc_act/tc_ct.h
 create mode 100644 net/sched/act_ct.c

Patch

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
new file mode 100644
index 0000000000000000000000000000000000000000..65682460f501b5886d9266f811c8ed30a4510304
--- /dev/null
+++ b/include/net/tc_act/tc_ct.h
@@ -0,0 +1,29 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CT_H
+#define __NET_TC_CT_H
+
+#include <linux/types.h>
+#include <net/act_api.h>
+#include <uapi/linux/netfilter/xt_connlabel.h>
+
+struct tcf_ct {
+	struct tc_action common;
+	struct net *net;
+
+	u16 zone;
+	u32 mark;
+	u32 mark_mask;
+	u32 chain;
+
+	/* FIXME: Use nf_conn_labels instead? But it pulls all netfilter */
+#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
+	u32 label[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
+	u32 label_mask[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
+
+	u32 flags;
+	struct nf_conn *ct;
+};
+
+#define to_tcf_ct(a) ((struct tcf_ct *)a)
+
+#endif /* __NET_TC_CT_H */
diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h
new file mode 100644
index 0000000000000000000000000000000000000000..37b95cda1dedd283b0244a03a20860ba22966dfa
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ct.h
@@ -0,0 +1,36 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_TC_CT_H
+#define __LINUX_TC_CT_H
+
+#include <linux/pkt_cls.h>
+#include <linux/types.h>
+
+#define TCA_ACT_CT 27
+
+enum {
+	TCA_CT_UNSPEC,
+	TCA_CT_TM,
+	TCA_CT_PARMS,
+	TCA_CT_PAD,
+	TCA_CT_ZONE,
+	TCA_CT_MARK,
+	TCA_CT_MARK_MASK,
+	TCA_CT_LABEL,
+	TCA_CT_LABEL_MASK,
+	TCA_CT_CHAIN,
+	TCA_CT_FLAGS,
+	__TCA_CT_MAX
+};
+#define TCA_CT_MAX (__TCA_CT_MAX - 1)
+
+enum {
+	TC_CT_COMMIT,
+	__TC_CT_MAX
+};
+#define TC_CT_MAX (__TC_CT_MAX - 1)
+
+struct tc_ct {
+	tc_gen;
+};
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 1b9afdee5ba976ba64200d8f85050cf053b7d65c..2c7f963b78f7511bbee8814b1c5bfdb488386c5d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -912,6 +912,12 @@  config NET_ACT_TUNNEL_KEY
 	  To compile this code as a module, choose M here: the
 	  module will be called act_tunnel_key.
 
+config NET_ACT_CT
+        tristate "Conntrack manipulation"
+        depends on NET_CLS_ACT
+        ---help---
+	  FIXME
+
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c420d86427933a9af383e093812b7..f2f6db5b8352a9594b72bc6197caf2228b45c079 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -23,6 +23,7 @@  obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_ACT_SKBMOD)	+= act_skbmod.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
+obj-$(CONFIG_NET_ACT_CT)	+= act_ct.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
 obj-$(CONFIG_NET_IFE_SKBTCINDEX)	+= act_meta_skbtcindex.o
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 0000000000000000000000000000000000000000..f69509954149a0c8be710916a5289a4448049b5d
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,356 @@ 
+/*
+ * Conntrack manipulation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tc_act/tc_ct.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/tc_act/tc_ct.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+static unsigned int ct_net_id;
+static struct tc_action_ops act_ct_ops;
+
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+	[TCA_CT_PARMS]		= { .len = sizeof(struct tc_ct) },
+	[TCA_CT_ZONE]		= { .type = NLA_U16 },
+	[TCA_CT_MARK]		= { .type = NLA_U32 },
+	[TCA_CT_MARK_MASK]	= { .type = NLA_U32 },
+	[TCA_CT_LABEL]		= { .type = NLA_BINARY,
+				    .len = 128/BITS_PER_BYTE },
+	[TCA_CT_LABEL_MASK]	= { .type = NLA_BINARY,
+				    .len = 128/BITS_PER_BYTE },
+	[TCA_CT_CHAIN]		= { .type = NLA_U32 },
+	[TCA_CT_FLAGS]		= { .type = NLA_U32 },
+};
+
+static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+		       struct tc_action **a, int ovr, int bind,
+		       bool rtnl_held, struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+	struct nlattr *tb[TCA_CT_MAX + 1];
+	struct nf_conntrack_zone zone;
+	struct nf_conn *ct, *_ct;
+	struct tc_ct *parm;
+	int ret = 0, err;
+	struct tcf_ct *p;
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CT_PARMS])
+		return -EINVAL;
+	parm = nla_data(tb[TCA_CT_PARMS]);
+
+	if (tb[TCA_CT_ZONE])
+		zone_id = nla_get_u16(tb[TCA_CT_ZONE]);
+
+	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	if (!err) {
+		ret = tcf_idr_create(tn, parm->index, est, a,
+				     &act_ct_ops, bind, false);
+		if (ret) {
+			tcf_idr_cleanup(tn, parm->index);
+			return ret;
+		}
+		ret = ACT_P_CREATED;
+	} else if (err > 0) {
+		if (bind)
+			return 0;
+		if (!ovr) {
+			ret = -EEXIST;
+			goto err1;
+		}
+	} else {
+		return err;
+	}
+
+	/* XXX Need translation from AF_INET to NFPROTO_ */
+	err = nf_ct_netns_get(net, NFPROTO_IPV4 /* XXX par->family */);
+	if (err < 0) {
+		ret = err;
+		goto err1;
+	}
+
+	/* XXX: CT target supports setting a different zone on each direction */
+	/* XXX: CT supports inferring zone id from the mark, but we probably
+	 * don't need that here.
+	if (info->flags & XT_CT_ZONE_MARK)
+		zone.flags |= NF_CT_FLAG_MARK;
+	 */
+	nf_ct_zone_init(&zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);
+
+	ct = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
+	if (!ct) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	nf_conntrack_get(&ct->ct_general);
+
+	p = to_tcf_ct(*a);
+	spin_lock_bh(&p->tcf_lock);
+	p->zone = zone_id;
+	if (tb[TCA_CT_MARK] && tb[TCA_CT_MARK_MASK]) {
+		p->mark = nla_get_u32(tb[TCA_CT_MARK]);
+		p->mark_mask = nla_get_u32(tb[TCA_CT_MARK_MASK]);
+	}
+	if (tb[TCA_CT_LABEL] && tb[TCA_CT_LABEL_MASK]) {
+		nla_memcpy(p->label, tb[TCA_CT_LABEL], sizeof(p->label));
+		nla_memcpy(p->label_mask, tb[TCA_CT_LABEL_MASK],
+			   sizeof(p->label_mask));
+		nf_connlabels_replace(ct, p->label, p->label_mask,
+				      sizeof(p->label)/sizeof(u32));
+	}
+	if (tb[TCA_CT_CHAIN])
+		p->chain = nla_get_u32(tb[TCA_CT_CHAIN]);
+	if (tb[TCA_CT_FLAGS])
+		p->flags = nla_get_u32(tb[TCA_CT_FLAGS]);
+	p->net = net;
+
+	p->tcf_action = parm->action;
+
+	_ct = p->ct;
+	p->ct = ct;
+
+	spin_unlock_bh(&p->tcf_lock);
+
+	if (_ct) {
+		nf_conntrack_put(&_ct->ct_general);
+	}
+
+	if (ret == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
+
+	return ret;
+
+err1:
+	tcf_idr_release(*a, bind);
+	return ret;
+}
+
+static void tcf_ct_cleanup(struct tc_action *a)
+{
+	struct tcf_ct *p = to_tcf_ct(a);
+
+	if (p->ct) {
+		nf_conntrack_put(&p->ct->ct_general);
+	}
+}
+
+static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+		      struct tcf_result *res)
+{
+	struct tcf_ct *p = to_tcf_ct(a);
+	struct nf_hook_state state = {
+		.hook = NF_INET_PRE_ROUTING,
+	};
+	struct nf_conn *ct, *new_ct;
+	u32 mark, mark_mask, flags;
+	int action, err;
+	int nh_ofs;
+
+	spin_lock(&p->tcf_lock);
+
+	tcf_lastuse_update(&p->tcf_tm);
+	mark = p->mark;
+	mark_mask = p->mark_mask;
+	flags = p->flags;
+	state.net = p->net;
+	action = p->tcf_action;
+	ct = p->ct;
+	if (ct)
+		/* This gets transferred to conntrack */
+		nf_conntrack_get(&ct->ct_general);
+
+	bstats_update(&p->tcf_bstats, skb);
+
+	spin_unlock(&p->tcf_lock);
+
+	if (unlikely(action == TC_ACT_SHOT))
+		goto drop;
+
+	/* FIXME: For when we support cloning the packet
+	orig_skb = skb;
+	skb = skb_clone(orig_skb, GFP_ATOMIC);
+	 */
+
+	/* The conntrack module expects to be working at L3. */
+	nh_ofs = skb_network_offset(skb);
+	skb_pull_rcsum(skb, nh_ofs);
+	/* FIXME: OvS trims the packet here. Should we? */
+
+	/* FIXME: Need to handle multiple calls to CT action here. */
+	if (ct)
+		nf_ct_set(skb, ct, IP_CT_NEW);
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		state.pf = NFPROTO_IPV6;
+	} else {
+		/* FIXME: should we restrict this even further? */
+		state.pf = NFPROTO_IPV4;
+	}
+
+	err = nf_conntrack_in(skb, &state);
+	if (err != NF_ACCEPT)
+		goto drop;
+
+	new_ct = (struct nf_conn *)skb_nfct(skb);
+	if (new_ct) {
+		if (mark_mask) {
+			new_ct->mark = (new_ct->mark &~ mark_mask) | (mark & mark_mask);
+			if (nf_ct_is_confirmed(new_ct))
+				nf_conntrack_event_cache(IPCT_MARK, new_ct);
+		}
+
+		nf_ct_deliver_cached_events(new_ct);
+	}
+
+	if (flags & BIT(TC_CT_COMMIT)) {
+		err = nf_conntrack_confirm(skb);
+		if (err != NF_ACCEPT) {
+			pr_warn_ratelimited("tc ct: failed to confirm conntrack entry: %d\n", err);
+			goto drop;
+		}
+	}
+
+	/* FIXME: inject the packet into another chain (as it would happen if
+	 * it had a miss in hw too)
+	 */
+
+	skb_push(skb, nh_ofs);
+	skb_postpush_rcsum(skb, skb->data, nh_ofs);
+	return TC_ACT_PIPE;
+
+drop:
+	spin_lock(&p->tcf_lock);
+	p->tcf_qstats.drops++;
+	spin_unlock(&p->tcf_lock);
+	return TC_ACT_SHOT;
+}
+
+static int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+		       int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ct *p = to_tcf_ct(a);
+	struct tc_ct opt = {
+		.index    = p->tcf_index,
+		.refcnt   = refcount_read(&p->tcf_refcnt) - ref,
+		.bindcnt  = atomic_read(&p->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	spin_lock_bh(&p->tcf_lock);
+	nla_put_u16(skb, TCA_CT_ZONE, p->zone);
+	nla_put_u32(skb, TCA_CT_MARK, p->mark);
+	nla_put_u32(skb, TCA_CT_MARK_MASK, p->mark_mask);
+	nla_put_u32(skb, TCA_CT_CHAIN, p->chain);
+	nla_put(skb, TCA_CT_LABEL, sizeof(p->label), p->label);
+	nla_put(skb, TCA_CT_LABEL_MASK, sizeof(p->label_mask), p->label_mask);
+	nla_put_u32(skb, TCA_CT_FLAGS, p->flags);
+	opt.action = p->tcf_action;
+
+	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &p->tcf_tm);
+	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+		goto nla_put_failure;
+	spin_unlock_bh(&p->tcf_lock);
+
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&p->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
+			 struct netlink_callback *cb, int type,
+			 const struct tc_action_ops *ops,
+			 struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ct_ops = {
+	.kind		=	"ct",
+	.type		=	TCA_ACT_CT,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_ct_act,
+	.dump		=	tcf_ct_dump,
+	.init		=	tcf_ct_init,
+	.cleanup	=	tcf_ct_cleanup,
+	.walk		=	tcf_ct_walker,
+	.lookup		=	tcf_ct_search,
+	.size		=	sizeof(struct tcf_ct),
+};
+
+static __net_init int ct_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tc_action_net_init(tn, &act_ct_ops);
+}
+
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, ct_net_id);
+}
+
+static struct pernet_operations ct_net_ops = {
+	.init = ct_init_net,
+	.exit_batch = ct_exit_net,
+	.id   = &ct_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+MODULE_DESCRIPTION("Connection Tracking actions");
+MODULE_LICENSE("GPL");
+
+static int __init ct_init_module(void)
+{
+	return tcf_register_action(&act_ct_ops, &ct_net_ops);
+}
+
+static void __exit ct_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);