diff mbox

Add tcindex to conntrack and add netfilter target/matches

Message ID 1449179951-26327-2-git-send-email-luuk.paulussen@alliedtelesis.co.nz
State RFC
Delegated to: Pablo Neira
Headers show

Commit Message

Luuk Paulussen Dec. 3, 2015, 9:59 p.m. UTC
This patch implements support for setting/matching the skb->tc_index
field from IPtables, as well as allowing it to be saved/restored using
connection tracking.

This provides an alternative means to set the tc_index field (on top of
DSCP mapping), and also means that marking for traffic control purposes
doesn't need to go into the generic packet mark, which has space
limitations.

Once the tc_index field has been set, it can be matched with the existing
tcindex filter in the scheduling code.

Reviewed-by: Matt Bennett <matt.bennett@alliedtelesis.co.nz>
Reviewed-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
---
 include/net/netfilter/nf_conntrack.h               |   6 +-
 include/uapi/linux/netfilter/Kbuild                |   4 +
 include/uapi/linux/netfilter/nf_conntrack_common.h |   1 +
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |   1 +
 include/uapi/linux/netfilter/xt_CONNTCINDEX.h      |   6 +
 include/uapi/linux/netfilter/xt_TCINDEX.h          |   6 +
 include/uapi/linux/netfilter/xt_conntcindex.h      |  31 ++++
 include/uapi/linux/netfilter/xt_tcindex.h          |  15 ++
 net/netfilter/Kconfig                              |  29 ++++
 net/netfilter/Makefile                             |   2 +
 net/netfilter/nf_conntrack_netlink.c               |  39 ++++-
 net/netfilter/xt_conntcindex.c                     | 165 +++++++++++++++++++++
 net/netfilter/xt_tcindex.c                         |  84 +++++++++++
 13 files changed, 385 insertions(+), 4 deletions(-)
 create mode 100644 include/uapi/linux/netfilter/xt_CONNTCINDEX.h
 create mode 100644 include/uapi/linux/netfilter/xt_TCINDEX.h
 create mode 100644 include/uapi/linux/netfilter/xt_conntcindex.h
 create mode 100644 include/uapi/linux/netfilter/xt_tcindex.h
 create mode 100644 net/netfilter/xt_conntcindex.c
 create mode 100644 net/netfilter/xt_tcindex.c

Comments

Luuk Paulussen Dec. 6, 2015, 10:28 p.m. UTC | #1
Hi All,

I'm still hoping for some feedback on this.  I have some userspace 
patches around this as well, (to set/show the tc_index in the 
connection, and to add the marking/matching rules in iptables), but I am 
holding off on sending them until I know what people think of this 
idea/implementation first.

Basically it allows 16 bits of marking in skb and connmark for traffic 
control purposes using an existing field in the skb.

I'm hoping for feedback on the following:
* The idea in itself - is this an acceptable solution
* Naming of match/target for marking tc_index
* Feedback on the patch itself - Have I missed something, should the 
compilation conditions be different, etc.

Kind regards,
Luuk

On 12/04/2015 10:59 AM, Luuk Paulussen wrote:
> This patch implements support for setting/matching the skb->tc_index
> field from IPtables, as well as allowing it to be saved/restored using
> connection tracking.
>
> This provides an alternative means to set the tc_index field (on top of
> DSCP mapping), and also means that marking for traffic control purposes
> doesn't need to go into the generic packet mark, which has space
> limitations.
>
> Once the tc_index field has been set, it can be matched with the existing
> tcindex filter in the scheduling code.
>
> Reviewed-by: Matt Bennett <matt.bennett@alliedtelesis.co.nz>
> Reviewed-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
> ---
>   include/net/netfilter/nf_conntrack.h               |   6 +-
>   include/uapi/linux/netfilter/Kbuild                |   4 +
>   include/uapi/linux/netfilter/nf_conntrack_common.h |   1 +
>   include/uapi/linux/netfilter/nfnetlink_conntrack.h |   1 +
>   include/uapi/linux/netfilter/xt_CONNTCINDEX.h      |   6 +
>   include/uapi/linux/netfilter/xt_TCINDEX.h          |   6 +
>   include/uapi/linux/netfilter/xt_conntcindex.h      |  31 ++++
>   include/uapi/linux/netfilter/xt_tcindex.h          |  15 ++
>   net/netfilter/Kconfig                              |  29 ++++
>   net/netfilter/Makefile                             |   2 +
>   net/netfilter/nf_conntrack_netlink.c               |  39 ++++-
>   net/netfilter/xt_conntcindex.c                     | 165 +++++++++++++++++++++
>   net/netfilter/xt_tcindex.c                         |  84 +++++++++++
>   13 files changed, 385 insertions(+), 4 deletions(-)
>   create mode 100644 include/uapi/linux/netfilter/xt_CONNTCINDEX.h
>   create mode 100644 include/uapi/linux/netfilter/xt_TCINDEX.h
>   create mode 100644 include/uapi/linux/netfilter/xt_conntcindex.h
>   create mode 100644 include/uapi/linux/netfilter/xt_tcindex.h
>   create mode 100644 net/netfilter/xt_conntcindex.c
>   create mode 100644 net/netfilter/xt_tcindex.c
>
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index fde4068..9b0ab48 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -105,7 +105,11 @@ struct nf_conn {
>   
>   #if defined(CONFIG_NF_CONNTRACK_MARK)
>   	u_int32_t mark;
> -#endif
> +
> +#ifdef CONFIG_NET_SCHED
> +	u_int16_t tc_index;
> +#endif /* CONFIG_NET_SCHED */
> +#endif /* CONFIG_NF_CONNTRACK_MARK */
>   
>   #ifdef CONFIG_NF_CONNTRACK_SECMARK
>   	u_int32_t secmark;
> diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
> index 1d973d2..fedaaab 100644
> --- a/include/uapi/linux/netfilter/Kbuild
> +++ b/include/uapi/linux/netfilter/Kbuild
> @@ -22,6 +22,7 @@ header-y += xt_CHECKSUM.h
>   header-y += xt_CLASSIFY.h
>   header-y += xt_CONNMARK.h
>   header-y += xt_CONNSECMARK.h
> +header-y += xt_CONNTCINDEX.h
>   header-y += xt_CT.h
>   header-y += xt_DSCP.h
>   header-y += xt_HMARK.h
> @@ -33,6 +34,7 @@ header-y += xt_NFLOG.h
>   header-y += xt_NFQUEUE.h
>   header-y += xt_RATEEST.h
>   header-y += xt_SECMARK.h
> +header-y += xt_TCINDEX.h
>   header-y += xt_TCPMSS.h
>   header-y += xt_TCPOPTSTRIP.h
>   header-y += xt_TEE.h
> @@ -46,6 +48,7 @@ header-y += xt_connbytes.h
>   header-y += xt_connlabel.h
>   header-y += xt_connlimit.h
>   header-y += xt_connmark.h
> +header-y += xt_conntcindex.h
>   header-y += xt_conntrack.h
>   header-y += xt_cpu.h
>   header-y += xt_dccp.h
> @@ -81,6 +84,7 @@ header-y += xt_socket.h
>   header-y += xt_state.h
>   header-y += xt_statistic.h
>   header-y += xt_string.h
> +header-y += xt_tcindex.h
>   header-y += xt_tcpmss.h
>   header-y += xt_tcpudp.h
>   header-y += xt_time.h
> diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
> index 319f471..b211bb8 100644
> --- a/include/uapi/linux/netfilter/nf_conntrack_common.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
> @@ -107,6 +107,7 @@ enum ip_conntrack_events {
>   	IPCT_NATSEQADJ = IPCT_SEQADJ,
>   	IPCT_SECMARK,		/* new security mark has been set */
>   	IPCT_LABEL,		/* new connlabel has been set */
> +	IPCT_TCINDEX,		/* new tc_index has been set */
>   };
>   
>   enum ip_conntrack_expect_events {
> diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
> index c1a4e144..cfdd15f 100644
> --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
> +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
> @@ -53,6 +53,7 @@ enum ctattr_type {
>   	CTA_MARK_MASK,
>   	CTA_LABELS,
>   	CTA_LABELS_MASK,
> +	CTA_TC_INDEX,
>   	__CTA_MAX
>   };
>   #define CTA_MAX (__CTA_MAX - 1)
> diff --git a/include/uapi/linux/netfilter/xt_CONNTCINDEX.h b/include/uapi/linux/netfilter/xt_CONNTCINDEX.h
> new file mode 100644
> index 0000000..96eccf3
> --- /dev/null
> +++ b/include/uapi/linux/netfilter/xt_CONNTCINDEX.h
> @@ -0,0 +1,6 @@
> +#ifndef _XT_CONNTCINDEX_H_target
> +#define _XT_CONNTCINDEX_H_target
> +
> +#include <linux/netfilter/xt_conntcindex.h>
> +
> +#endif /*_XT_CONNTCINDEX_H_target*/
> diff --git a/include/uapi/linux/netfilter/xt_TCINDEX.h b/include/uapi/linux/netfilter/xt_TCINDEX.h
> new file mode 100644
> index 0000000..a35af8b
> --- /dev/null
> +++ b/include/uapi/linux/netfilter/xt_TCINDEX.h
> @@ -0,0 +1,6 @@
> +#ifndef _XT_TCINDEX_H_target
> +#define _XT_TCINDEX_H_target
> +
> +#include <linux/netfilter/xt_tcindex.h>
> +
> +#endif /*_XT_TCINDEX_H_target */
> diff --git a/include/uapi/linux/netfilter/xt_conntcindex.h b/include/uapi/linux/netfilter/xt_conntcindex.h
> new file mode 100644
> index 0000000..d82329a
> --- /dev/null
> +++ b/include/uapi/linux/netfilter/xt_conntcindex.h
> @@ -0,0 +1,31 @@
> +#ifndef _XT_CONNTCINDEX_H
> +#define _XT_CONNTCINDEX_H
> +
> +#include <linux/types.h>
> +
> +/* Copyright (C) 2015 Allied Telesis Labs NZ
> + * by Luuk Paulussen <luuk.paulussen@alliedtelesis.co.nz>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +enum {
> +	XT_CONNTCINDEX_SET = 0,
> +	XT_CONNTCINDEX_SAVE,
> +	XT_CONNTCINDEX_RESTORE
> +};
> +
> +struct xt_conntcindex_tginfo1 {
> +	__u16 ctmark, ctmask, nfmask;
> +	__u8 mode;
> +};
> +
> +struct xt_conntcindex_mtinfo1 {
> +	__u16 mark, mask;
> +	__u8 invert;
> +};
> +
> +#endif /*_XT_CONNTCINDEX_H*/
> diff --git a/include/uapi/linux/netfilter/xt_tcindex.h b/include/uapi/linux/netfilter/xt_tcindex.h
> new file mode 100644
> index 0000000..cb012fa
> --- /dev/null
> +++ b/include/uapi/linux/netfilter/xt_tcindex.h
> @@ -0,0 +1,15 @@
> +#ifndef _XT_TCINDEX_H
> +#define _XT_TCINDEX_H
> +
> +#include <linux/types.h>
> +
> +struct xt_tcindex_tginfo1 {
> +	__u16 mark, mask;
> +};
> +
> +struct xt_tcindex_mtinfo1 {
> +	__u16 mark, mask;
> +	__u8 invert;
> +};
> +
> +#endif /*_XT_TCINDEX_H*/
> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
> index 4692782..3bff4dd 100644
> --- a/net/netfilter/Kconfig
> +++ b/net/netfilter/Kconfig
> @@ -603,6 +603,35 @@ config NETFILTER_XT_CONNMARK
>   	ctmark), similarly to the packet mark (nfmark). Using this
>   	target and match, you can set and match on this mark.
>   
> +config NETFILTER_XT_TCINDEX
> +	tristate 'tc_index mark target and match support'
> +	depends on NETFILTER_ADVANCED
> +	depends on NET_SCHED
> +	default n
> +	---help---
> +	This option adds the "TCINDEX" target and "tcindex" match.
> +
> +	tcindex matching allows you to match packets based on the "tc_index" value
> +	in the packet.
> +	The target allows you to create rules in the "mangle" table which alter
> +	the tc_index field associated with the packet.
> +
> +	This is an alternative to setting the tc_index field based on the priority
> +	fields of the incoming traffic.  In traffic control, this mark can be matched
> +	using the tcindex filter.
> +
> +config NETFILTER_XT_CONNTCINDEX
> +	tristate 'ct tc_index mark target and match support'
> +	depends on NF_CONNTRACK_MARK
> +	depends on NETFILTER_ADVANCED
> +	depends on NET_SCHED
> +	default n
> +	---help---
> +	This option adds the "CONNTCINDEX" target and "conntcindex" match.
> +
> +	Using this target and match, you can set and match on the tc_index mark in the
> +	connection.  This can also be saved/restored to the packet tc_index mark.
> +
>   config NETFILTER_XT_SET
>   	tristate 'set target and match support'
>   	depends on IP_SET
> diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
> index 7638c36..7492ef6 100644
> --- a/net/netfilter/Makefile
> +++ b/net/netfilter/Makefile
> @@ -96,6 +96,8 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
>   # combos
>   obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
>   obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
> +obj-$(CONFIG_NETFILTER_XT_TCINDEX) += xt_tcindex.o
> +obj-$(CONFIG_NETFILTER_XT_CONNTCINDEX) += xt_conntcindex.o
>   obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
>   obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o
>   
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 9f52729..a1db545 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -314,6 +314,21 @@ nla_put_failure:
>   #define ctnetlink_dump_mark(a, b) (0)
>   #endif
>   
> +#if defined CONFIG_NET_SCHED && defined CONFIG_NF_CONNTRACK_MARK
> +/* Dump ct->tc_index as a 16-bit big-endian CTA_TC_INDEX attribute. */
> +static inline int
> +ctnetlink_dump_tcindex(struct sk_buff *skb, const struct nf_conn *ct)
> +{
> +	/* tc_index is 16 bits wide, so convert with htons(); htonl() would
> +	 * be truncated to zero by the __be16 parameter on little-endian. */
> +	if (nla_put_be16(skb, CTA_TC_INDEX, htons(ct->tc_index)))
> +		return -1;
> +	return 0;
> +}
> +#else
> +#define ctnetlink_dump_tcindex(a, b) (0)
> +#endif
> +
>   #ifdef CONFIG_NF_CONNTRACK_SECMARK
>   static inline int
>   ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
> @@ -521,6 +536,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
>   	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
>   	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
>   	    ctnetlink_dump_mark(skb, ct) < 0 ||
> +	    ctnetlink_dump_tcindex(skb, ct) < 0 ||
>   	    ctnetlink_dump_secctx(skb, ct) < 0 ||
>   	    ctnetlink_dump_labels(skb, ct) < 0 ||
>   	    ctnetlink_dump_id(skb, ct) < 0 ||
> @@ -749,7 +765,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
>   	if ((events & (1 << IPCT_MARK) || ct->mark)
>   	    && ctnetlink_dump_mark(skb, ct) < 0)
>   		goto nla_put_failure;
> -#endif
> +
> +#ifdef CONFIG_NET_SCHED
> +	if ((events & (1 << IPCT_TCINDEX) || ct->tc_index)
> +	    && ctnetlink_dump_tcindex(skb, ct) < 0)
> +		goto nla_put_failure;
> +#endif /* CONFIG_NET_SCHED */
> +#endif /* CONFIG_NF_CONNTRACK_MARK */
>   	rcu_read_unlock();
>   
>   	nlmsg_end(skb, nlh);
> @@ -1092,6 +1114,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
>   				    .len = NF_CT_LABELS_MAX_SIZE },
>   	[CTA_LABELS_MASK]	= { .type = NLA_BINARY,
>   				    .len = NF_CT_LABELS_MAX_SIZE },
> +	[CTA_TC_INDEX]		= { .type = NLA_U16 },
>   };
>   
>   static int ctnetlink_flush_conntrack(struct net *net,
> @@ -1697,7 +1720,13 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
>   #if defined(CONFIG_NF_CONNTRACK_MARK)
>   	if (cda[CTA_MARK])
>   		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
> -#endif
> +#ifdef CONFIG_NET_SCHED
> +	if (cda[CTA_TC_INDEX])
> +		ct->tc_index = ntohs(nla_get_be16(cda[CTA_TC_INDEX]));
> +#endif /* CONFIG_NET_SCHED */
> +#endif /* CONFIG_NF_CONNTRACK_MARK */
> +
> +
>   
>   	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
>   		err = ctnetlink_change_seq_adj(ct, cda);
> @@ -1824,7 +1853,11 @@ ctnetlink_create_conntrack(struct net *net,
>   #if defined(CONFIG_NF_CONNTRACK_MARK)
>   	if (cda[CTA_MARK])
>   		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
> -#endif
> +#ifdef CONFIG_NET_SCHED
> +	if (cda[CTA_TC_INDEX])
> +		ct->tc_index = ntohs(nla_get_be16(cda[CTA_TC_INDEX]));
> +#endif /* CONFIG_NET_SCHED */
> +#endif /* CONFIG_NF_CONNTRACK_MARK */
>   
>   	/* setup master conntrack: this is a confirmed expectation */
>   	if (cda[CTA_TUPLE_MASTER]) {
> diff --git a/net/netfilter/xt_conntcindex.c b/net/netfilter/xt_conntcindex.c
> new file mode 100644
> index 0000000..838dacf
> --- /dev/null
> +++ b/net/netfilter/xt_conntcindex.c
> @@ -0,0 +1,165 @@
> +/*
> + *	xt_conntcindex - Netfilter module to operate on connection tc_index marks
> + *
> + *	Copyright (C) 2015 Allied Telesis Labs NZ
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> + *
> + * Heavily based on xt_connmark.c
> + */
> +
> +#include <linux/module.h>
> +#include <linux/skbuff.h>
> +#include <net/netfilter/nf_conntrack.h>
> +#include <net/netfilter/nf_conntrack_ecache.h>
> +#include <linux/netfilter/x_tables.h>
> +#include <linux/netfilter/xt_conntcindex.h>
> +
> +MODULE_AUTHOR("Luuk Paulussen <luuk.paulussen@alliedtelesis.co.nz>");
> +MODULE_DESCRIPTION("Xtables: connection tc_index mark operations");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS("ipt_CONNTCINDEX");
> +MODULE_ALIAS("ip6t_CONNTCINDEX");
> +MODULE_ALIAS("ipt_conntcindex");
> +MODULE_ALIAS("ip6t_conntcindex");
> +
> +static unsigned int
> +conntcindex_tg(struct sk_buff *skb, const struct xt_action_param *par)
> +{
> +	const struct xt_conntcindex_tginfo1 *info = par->targinfo;
> +	enum ip_conntrack_info ctinfo;
> +	struct nf_conn *ct;
> +	u_int32_t newmark;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if (ct == NULL)
> +		return XT_CONTINUE;
> +
> +	switch (info->mode) {
> +	case XT_CONNTCINDEX_SET:
> +		newmark = (ct->tc_index & ~info->ctmask) ^ info->ctmark;
> +		if (ct->tc_index != newmark) {
> +			ct->tc_index = newmark;
> +			nf_conntrack_event_cache(IPCT_TCINDEX, ct);
> +		}
> +		break;
> +	case XT_CONNTCINDEX_SAVE:
> +		newmark = (ct->tc_index & ~info->ctmask) ^
> +		          (skb->tc_index & info->nfmask);
> +		if (ct->tc_index != newmark) {
> +			ct->tc_index = newmark;
> +			nf_conntrack_event_cache(IPCT_TCINDEX, ct);
> +		}
> +		break;
> +	case XT_CONNTCINDEX_RESTORE:
> +		newmark = (skb->tc_index & ~info->nfmask) ^
> +		          (ct->tc_index & info->ctmask);
> +		skb->tc_index = newmark;
> +		break;
> +	}
> +
> +	return XT_CONTINUE;
> +}
> +
> +static int conntcindex_tg_check(const struct xt_tgchk_param *par)
> +{
> +	int ret;
> +
> +	ret = nf_ct_l3proto_try_module_get(par->family);
> +	if (ret < 0)
> +		pr_info("cannot load conntrack support for proto=%u\n",
> +			par->family);
> +	return ret;
> +}
> +
> +static void conntcindex_tg_destroy(const struct xt_tgdtor_param *par)
> +{
> +	nf_ct_l3proto_module_put(par->family);
> +}
> +
> +static bool
> +conntcindex_mt(const struct sk_buff *skb, struct xt_action_param *par)
> +{
> +	const struct xt_conntcindex_mtinfo1 *info = par->matchinfo;
> +	enum ip_conntrack_info ctinfo;
> +	const struct nf_conn *ct;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if (ct == NULL)
> +		return false;
> +
> +	return ((ct->tc_index & info->mask) == info->mark) ^ info->invert;
> +}
> +
> +static int conntcindex_mt_check(const struct xt_mtchk_param *par)
> +{
> +	int ret;
> +
> +	ret = nf_ct_l3proto_try_module_get(par->family);
> +	if (ret < 0)
> +		pr_info("cannot load conntrack support for proto=%u\n",
> +			par->family);
> +	return ret;
> +}
> +
> +static void conntcindex_mt_destroy(const struct xt_mtdtor_param *par)
> +{
> +	nf_ct_l3proto_module_put(par->family);
> +}
> +
> +static struct xt_target conntcindex_tg_reg __read_mostly = {
> +	.name           = "CONNTCINDEX",
> +	.revision       = 1,
> +	.family         = NFPROTO_UNSPEC,
> +	.checkentry     = conntcindex_tg_check,
> +	.target         = conntcindex_tg,
> +	.targetsize     = sizeof(struct xt_conntcindex_tginfo1),
> +	.destroy        = conntcindex_tg_destroy,
> +	.me             = THIS_MODULE,
> +};
> +
> +static struct xt_match conntcindex_mt_reg __read_mostly = {
> +	.name           = "conntcindex",
> +	.revision       = 1,
> +	.family         = NFPROTO_UNSPEC,
> +	.checkentry     = conntcindex_mt_check,
> +	.match          = conntcindex_mt,
> +	.matchsize      = sizeof(struct xt_conntcindex_mtinfo1),
> +	.destroy        = conntcindex_mt_destroy,
> +	.me             = THIS_MODULE,
> +};
> +
> +static int __init conntcindex_mt_init(void)
> +{
> +	int ret;
> +
> +	ret = xt_register_target(&conntcindex_tg_reg);
> +	if (ret < 0)
> +		return ret;
> +	ret = xt_register_match(&conntcindex_mt_reg);
> +	if (ret < 0) {
> +		xt_unregister_target(&conntcindex_tg_reg);
> +		return ret;
> +	}
> +	return 0;
> +}
> +
> +static void __exit conntcindex_mt_exit(void)
> +{
> +	xt_unregister_match(&conntcindex_mt_reg);
> +	xt_unregister_target(&conntcindex_tg_reg);
> +}
> +
> +module_init(conntcindex_mt_init);
> +module_exit(conntcindex_mt_exit);
> diff --git a/net/netfilter/xt_tcindex.c b/net/netfilter/xt_tcindex.c
> new file mode 100644
> index 0000000..7de29cd
> --- /dev/null
> +++ b/net/netfilter/xt_tcindex.c
> @@ -0,0 +1,84 @@
> +/*
> + *	xt_tcindex - Netfilter module to match/tag on tc_index mark value
> + *
> + *	(C) 2015 Allied Telesis Labs NZ.
> +  *
> + *	This program is free software; you can redistribute it and/or modify
> + *	it under the terms of the GNU General Public License version 2 as
> + *	published by the Free Software Foundation.
> + *
> + *	Heavily based on xt_mark.c
> + */
> +
> +#include <linux/module.h>
> +#include <linux/skbuff.h>
> +
> +#include <linux/netfilter/xt_tcindex.h>
> +#include <linux/netfilter/x_tables.h>
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Luuk Paulussen <luuk.paulussen@alliedtelesis.co.nz>");
> +MODULE_DESCRIPTION("Xtables: packet tc_index mark operations");
> +MODULE_ALIAS("ipt_tcindex");
> +MODULE_ALIAS("ip6t_tcindex");
> +MODULE_ALIAS("ipt_TCINDEX");
> +MODULE_ALIAS("ip6t_TCINDEX");
> +
> +static unsigned int
> +tcindex_tg(struct sk_buff *skb, const struct xt_action_param *par)
> +{
> +	const struct xt_tcindex_tginfo1 *info = par->targinfo;
> +
> +	skb->tc_index = (skb->tc_index & ~info->mask) ^ info->mark;
> +	return XT_CONTINUE;
> +}
> +
> +static bool
> +tcindex_mt(const struct sk_buff *skb, struct xt_action_param *par)
> +{
> +	const struct xt_tcindex_mtinfo1 *info = par->matchinfo;
> +
> +	return ((skb->tc_index & info->mask) == info->mark) ^ info->invert;
> +}
> +
> +static struct xt_target tcindex_tg_reg __read_mostly = {
> +	.name           = "TCINDEX",
> +	.revision       = 1,
> +	.family         = NFPROTO_UNSPEC,
> +	.target         = tcindex_tg,
> +	.targetsize     = sizeof(struct xt_tcindex_tginfo1),
> +	.me             = THIS_MODULE,
> +};
> +
> +static struct xt_match tcindex_mt_reg __read_mostly = {
> +	.name           = "tcindex",
> +	.revision       = 1,
> +	.family         = NFPROTO_UNSPEC,
> +	.match          = tcindex_mt,
> +	.matchsize      = sizeof(struct xt_tcindex_mtinfo1),
> +	.me             = THIS_MODULE,
> +};
> +
> +static int __init tcindex_mt_init(void)
> +{
> +	int ret;
> +
> +	ret = xt_register_target(&tcindex_tg_reg);
> +	if (ret < 0)
> +		return ret;
> +	ret = xt_register_match(&tcindex_mt_reg);
> +	if (ret < 0) {
> +		xt_unregister_target(&tcindex_tg_reg);
> +		return ret;
> +	}
> +	return 0;
> +}
> +
> +static void __exit tcindex_mt_exit(void)
> +{
> +	xt_unregister_match(&tcindex_mt_reg);
> +	xt_unregister_target(&tcindex_tg_reg);
> +}
> +
> +module_init(tcindex_mt_init);
> +module_exit(tcindex_mt_exit);
Florian Westphal Dec. 6, 2015, 10:45 p.m. UTC | #2
Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
> Hi All,
> 
> I'm still hoping for some feedback on this.  I have some userspace 
> patches around this as well, (to set/show the tc_index in the 
> connection, and to add the marking/matching rules in iptables), but I am 
> holding off on sending them until I know what people think of this 
> idea/implementation first.

I can't say for sure since I don't know enough about tc.

However, AFAICS tc_index seems to be something that should be internal
to tc and not exposed/changeable via iptables.

> Basically it allows 16 bits of marking in skb and connmark for traffic
> control purposes using an existing field in the skb.

Why not extend cls_flow to allow matching ctmark directly via tc
filters instead of requiring conntrack->foo copy to skb->foo?

We also have -j CLASSIFY to set skb->priority and at least cls_flow
seems to be able to match on that (did not test it).
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luuk Paulussen Dec. 7, 2015, 2:19 a.m. UTC | #3
On 12/07/2015 11:45 AM, Florian Westphal wrote:
> Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
>> Hi All,
>>
>> I'm still hoping for some feedback on this.  I have some userspace
>> patches around this as well, (to set/show the tc_index in the
>> connection, and to add the marking/matching rules in iptables), but I am
>> holding off on sending them until I know what people think of this
>> idea/implementation first.
> I can't say for sure since I don't know enough about tc.
>
> However, AFAICS tc_index seems to be something that should be internal
> to tc and not exposed/changeable via iptables.
tc_index is a mark that can be set by certain configurable ingress 
schedulers (dsmark, GRED, ingress) for later classification via the 
tcindex classifier.  This just adds an alternative mechanism for setting 
this mark if those schedulers aren't being used.
* dsmark sets the tc_index value based on the incoming DSCP value
* ingress sets the tc_index value based on other rules (e.g. mark set 
via iptables)
* New code sets tc_index directly based on iptables classification or 
restoring saved value.
>
>> Basically it allows 16 bits of marking in skb and connmark for traffic
>> control purposes using an existing field in the skb.
> Why not extend cls_flow to allow matching ctmark directly via tc
> filters instead of requiring conntrack->foo copy to skb->foo?
The flow classifier doesn't have support for masks on the mark (or other 
fields), so doesn't provide enough control to differentiate between 
outgoing/incoming traffic.  Also, do all packets have an associated 
connection? If we restore the connection mark to the packet mark with a 
mask, then tc will only see the marking that we want it to see, and 
packets that don't have an associated connection will be matched to 
other rules.  Also, the tc_index and fw filters already exist for the 
skb fields.
>
> We also have -j CLASSIFY to set skb->priority and at least cls_flow
> seems to be able to match on that (did not test it).
The functionality we are trying to achieve (for performance reasons) is 
as follows:
1st packet in flow in each direction (slow path):
- Go through list of classification rules and set something (packet 
class) in a connection mark (with a mask) for traffic on this flow in 
this direction.
Other packets in flow in this direction (fast path):
- Restore the part of the mark for this direction from connection and go 
to egress where tc rules can do correct traffic control based on the 
restored mark.

I think that CLASSIFY can only really go through the slow path.

We also want to make use of hierarchical qdiscs, which requires that the 
packet steps through levels of qdiscs, being filtered/classified to the 
correct class at each level, which I think means that priority field 
isn't appropriate, as it might change at different steps.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal Dec. 7, 2015, 3:05 a.m. UTC | #4
Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
> On 12/07/2015 11:45 AM, Florian Westphal wrote:
> > Why not extend cls_flow to allow matching ctmark directly via tc
> > filters instead of requiring conntrack->foo copy to skb->foo?

> The flow classifier doesn't have support for masks on the mark (or other 
> fields), so doesn't provide enough control to differentiate between 
> outgoing/incoming traffic.

Hmm, add it?  Another alternative would be to extend em_meta with
conntrack collectors.

Could also add ability to match on conntrack direction.

> Also, do all packets have an associated
> connection?

No, but it's pretty much the same as skb->mark == 0, no?
(i.e. you could still fall back to some other classification method?)

> > We also have -j CLASSIFY to set skb->priority and at least cls_flow
> > seems to be able to match on that (did not test it).
> The functionality we are trying to achieve (for performance reasons) is 
> as follows:

Thanks for explaining.

> 1st packet in flow in each direction (slow path):

"each direction?"  Does that mean that you might have two distinct
results based on whether skb is incoming or outgoing?
Or does it mean "conntrack direction"?

How is that handled in the fastpath? ctmark split into two halves + masking?

What about:

-m connlabel ! --label classify_in -m conntrack --ctdir ORIGINAL -j IN_CLASSIFIERS
-m connlabel ! --label classify_rep -m conntrack --ctdir REPLY -j REP_CLASSIFIERS
(restore nfmark based on connmark here if needed)

This means three tests & no repeat of "slowpath" after 1st packet even
if ctmark remains at 0.

(Alternative is to disallow a 0 ctmark and just use -m connmark --mark 0
 -j SLOWPATH)

> - Go through list of classification rules and set something (packet 
> class) in a connection mark (with a mask) for traffic on this flow in 
> this direction.

-A IN_CLASSIFIERS --label classify_in --set
-A IN_CLASSIFIERS -m $magic -j CONNMARK ... 42/$mask
/* more slowpath rules */

To reduce rule traversal you can do this:
-N CLASS_FOO
-A CLASS_FOO -j CONNMARK ...
-A IN_CLASSIFIERS $magic --goto CLASS_FOO

Obviously that requires custom chain for each class, but it means
slowpath chain returns early once a decision was made.

> Other packets in flow in this direction (fast path):
> - Restore the part of the mark for this direction from connection and go 
> to egress where tc rules can do correct traffic control based on the 
> restored mark.

If we could read ctmark from tc classifiers it seems this would not be
needed since you could just classify based on the conntrack mark with
fallback for skbs without conntrack entry (you'd also need to handle
this case with the proposed "restore nfmark" scheme since the nfmark
would be 0 (since nothing could be restored).

> I think that CLASSIFY can only really go through the slow path.

Right, xtables architecture limitation :-(
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luuk Paulussen Dec. 7, 2015, 4:24 a.m. UTC | #5
On 12/07/2015 04:05 PM, Florian Westphal wrote:
> Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
>> On 12/07/2015 11:45 AM, Florian Westphal wrote:
>>> Why not extend cls_flow to allow matching ctmark directly via tc
>>> filters instead of requiring conntrack->foo copy to skb->foo?
>> The flow classifier doesn't have support for masks on the mark (or other
>> fields), so doesn't provide enough control to differentiate between
>> outgoing/incoming traffic.
> Hmm, add it?  Another alternative would be to extend em_meta with
> conntrack collectors.
>
> Could also add ability to match on conntrack direction.
I think that the flow classifier is mainly designed for leaf classes.  
It isn't really designed to work with more complex or hierarchical 
qdiscs as it just maps/hashes to a minor class number, so I don't think 
we can use it.
As per other comments further down, we'd prefer to have already 
tagged the packet once we get to tc, which is already fully supported 
with fw/tcindex classifiers.
>> Also, do all packets have an associated
>> connection?
> No, but it's pretty much the same as skb->mark == 0, no?
> (i.e. you could still fall back to some other classification method?)
Our hope/plan is to make the classification decision using xtables (fast 
path falling back to slow path), which breaks if we have to reclassify 
at tc stage.
>
>>> We also have -j CLASSIFY to set skb->priority and at least cls_flow
>>> seems to be able to match on that (did not test it).
>> The functionality we are trying to achieve (for performance reasons) is
>> as follows:
> Thanks for explaining.
>
>> 1st packet in flow in each direction (slow path):
> "each direction?"  Does that mean that you might have two distinct
> results based on whether skb is incoming or outgoing?
> Or does it mean "conntrack direction"?
>
> How is that handled in the fastpath? ctmark split in two halves + masking?
Both actually
  - mark is split into two halves with masking to differentiate based on 
conntrack direction, and then we also have to account for 
incoming/outgoing by installing two rules to match either conntrack 
direction at tc stage.  Generally traffic control is more relevant going 
out onto a WAN link than coming back in, so the rules are different in 
the two directions, but certain flows might originate from the WAN side.
>
> What about:
>
> -m connlabel ! --label classify_in -m conntrack --ctdir ORIGINAL -j IN_CLASSIFIERS
> -m connlabel ! --label classify_rep -m conntrack --ctdir REPLY -j REP_CLASSIFIERS
> (restore nfmark based on connmark here if needed)
>
> This means three tests & no repeat of "slowpath" after 1st packet even
> if ctmark remains at 0.
This could be a useful idea.  It does break in the (reasonably 
infrequent) case where the configuration is changed and the connection 
still exists.  Our plan in that case was to set all the connection marks 
back to 0 rather than deleting the connections.
>
> (Alternative is to disallow a 0 ctmark and just use -m connmark --mark 0
>   -j SLOWAPATH)
This alternative is what we are currently doing. (Non-matching packets 
are marked with a default mark)
>
>> - Go through list of classification rules and set something (packet
>> class) in a connection mark (with a mask) for traffic on this flow in
>> this direction.
> -A IN_CLASSIFIERS --label classify_in --set
> -A IN_CLASSIFIERS -m $magic -j CONNMARK ... 42/$mask
> /* more slowpath rules */
>
> To reduce rule traversal you can do this:
> -N CLASS_FOO
> -A CLASS_FOO -j CONNMARK ...
> -A IN_CLASSIFIERS $magic --goto CLASS_FOO
>
> Obviously that requires custom chain for each class, but it means
> slowpath chain returns early once a decision was made.
We are using techniques like this to reduce rule traversal.  We have a 
reasonably good grasp of what is needed in terms of rules, we just need 
more space...
>> Other packets in flow in this direction (fast path):
>> - Restore the part of the mark for this direction from connection and go
>> to egress where tc rules can do correct traffic control based on the
>> restored mark.
> If we could read ctmark from tc classifiers it seems this would not be
> needed since you could just classify based on the conntrack mark with
> fallback for skbs without conntrack entry (you'd also need to handle
> this case with the proposed "restore nfmark" scheme since the nfmark
> would be 0 (since nothing could be restored).
The idea is that if the fast path fails, the packet falls back to the 
slow path and is marked  (as well as the connection being updated if 
appropriate).  The goal is that no packet gets through the netfilter 
processing without some kind of mark being set. Otherwise you end up 
having to do the full classification in two places (xtables and tc) for 
packets with no connection.

The key point here is that the only reason that we are proposing this 
change is that the existing ct_mark doesn't have enough space left 
alongside other features, so using CONNMARK isn't an option (which is 
where CONNTCINDEX comes in).
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Dec. 9, 2015, 9:07 a.m. UTC | #6
On 12/07/2015 03:19 AM, Luuk Paulussen wrote:
> On 12/07/2015 11:45 AM, Florian Westphal wrote:
>> Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
>>> Hi All,
>>>
>>> I'm still hoping for some feedback on this.  I have some userspace
>>> patches around this as well, (to set/show the tc_index in the
>>> connection, and to add the marking/matching rules in iptables), but I am
>>> holding off on sending them until I know what people think of this
>>> idea/implementation first.
>> I can't say for sure since I don't know enough about tc.
>>
>> However, AFAICS tc_index seems to be something that should be internal
>> to tc and not exposed/changeable via iptables.
> tc_index is a mark that can be set by certain configurable ingress
> schedulers (dsmark, GRED, ingress) for later classification via the
> tcindex classifier.  This just adds an alternative mechanism for setting
> this mark if those schedulers aren't being used.

Fwiw, tc_index can be read/written by cls_bpf (and you can also apply masks
on that field if needed).

> * dsmark sets the tc_index value based on the incoming DSCP value
> * ingress sets the tc_index value based on other rules (e.g. mark set
> via iptables)
> * New code sets tc_index directly based on iptables classification or
> restoring saved value.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luuk Paulussen Dec. 13, 2015, 11 p.m. UTC | #7
On 12/09/2015 10:07 PM, Daniel Borkmann wrote:
> On 12/07/2015 03:19 AM, Luuk Paulussen wrote:
>> On 12/07/2015 11:45 AM, Florian Westphal wrote:
>>> Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
>>>> Hi All,
>>>>
>>>> I'm still hoping for some feedback on this.  I have some userspace
>>>> patches around this as well, (to set/show the tc_index in the
>>>> connection, and to add the marking/matching rules in iptables), but 
>>>> I am
>>>> holding off on sending them until I know what people think of this
>>>> idea/implementation first.
>>> I can't say for sure since I don't know enough about tc.
>>>
>>> However, AFAICS tc_index seems to be something that should be internal
>>> to tc and not exposed/changeable via iptables.
>> tc_index is a mark that can be set by certain configurable ingress
>> schedulers (dsmark, GRED, ingress) for later classification via the
>> tcindex classifier.  This just adds an alternative mechanism for setting
>> this mark if those schedulers aren't being used.
>
> Fwiw, tc_index can be read/written by cls_bpf (and you can also apply 
> masks
> on that field if needed).
I've just been looking into this and it does seem like it might cover a 
small part of what we are trying to do, although misses the key part, 
which is to use connection tracking information to limit full processing 
to the first packet of a flow in each direction. I'm guessing that there 
isn't any bpf support for connection information?

One thing that isn't quite clear to me. Is it possible to use xt_bpf.c 
to set the tc_index field from netfilter?  If this is the case, then it 
does set a precedent
for being able to set this value outside of tc code (but still misses the 
save/restore possibility).

Given that tc_index is simple metadata I'm guessing that filter 
performance over the tcindex classifier wouldn't be significantly better?

>> * dsmark sets the tc_index value based on the incoming DSCP value
>> * ingress sets the tc_index value based on other rules (e.g. mark set
>> via iptables)
>> * New code sets tc_index directly based on iptables classification or
>> restoring saved value.

I'm still looking for an overall idea around whether this patch has a 
chance of being accepted for the kernel.  It feels like none of the 
comments or proposed ideas have addressed the issues that the patch is 
addressing:
1. Save/restore functionality of mark/connmark can significantly 
increase performance for larger rule sets, so is desirable for 
performance reasons.
2. Insufficient space in skb nf mark and connection mark for all 
applications that might want to use it.
3. tc being one of the users of nf mark (via fw filter) has a logical 
alternative in the 16 bit tc_index field, which could be used without 
increasing SKB size.  This doesn't currently have a match/tag target in 
netfilter or an analogue in the connection for save/restore.  It does 
however have a pre-existing classifier in tc code.

So this patch adds tc_index field to the connection and 
match/tag/save/restore targets to netfilter, allowing marking packets 
for tc into this field and save/restore from the connection.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Dec. 14, 2015, 9:50 a.m. UTC | #8
On 12/14/2015 12:00 AM, Luuk Paulussen wrote:
> On 12/09/2015 10:07 PM, Daniel Borkmann wrote:
>> On 12/07/2015 03:19 AM, Luuk Paulussen wrote:
>>> On 12/07/2015 11:45 AM, Florian Westphal wrote:
>>>> Luuk Paulussen <Luuk.Paulussen@alliedtelesis.co.nz> wrote:
>>>>> Hi All,
>>>>>
>>>>> I'm still hoping for some feedback on this.  I have some userspace
>>>>> patches around this as well, (to set/show the tc_index in the
>>>>> connection, and to add the marking/matching rules in iptables), but
>>>>> I am
>>>>> holding off on sending them until I know what people think of this
>>>>> idea/implementation first.
>>>> I can't say for sure since I don't know enough about tc.
>>>>
>>>> However, AFAICS tc_index seems to be something that should be internal
>>>> to tc and not exposed/changeable via iptables.
>>> tc_index is a mark that can be set by certain configurable ingress
>>> schedulers (dsmark, GRED, ingress) for later classification via the
>>> tcindex classifier.  This just adds an alternative mechanism for setting
>>> this mark if those schedulers aren't being used.
>>
>> Fwiw, tc_index can be read/written by cls_bpf (and you can also apply
>> masks
>> on that field if needed).
> I've just been looking into this and it does seem like it might cover a
> small part of what we are trying to do, although misses the key part,
> which is to use connection tracking information to limit full processing
> to the first packet of a flow in each direction. I'm guessing that there
> isn't any bpf support for connection information?

Depends on your requirements, but you could probably implement a minimal
tracker in cls_bpf through eBPF itself. Or alternatively, add a helper
function to retrieve the cttuple? Florian mentioned cls_flow already, so
you could orientate yourself there wrt developing an eBPF helper.

> One thing that isn't quite clear to me. Is it possible to use xt_bpf.c
> to set the tc_index field from netfilter?  If this is the case, then it
> does set a precedent
> for being able to set this value outside of tc code (but still misses the
> save/restore possibility).

xt_bpf covers only classic BPF, so it's unfortunately not possible to
match or set tc_index from there.

> Given that tc_index is simple metadata I'm guessing that filter
> performance over the tcindex classifier wouldn't be significantly better?

I think it depends on how you encode your information, f.e. if you can get
away w/o an index to classid mapping through a hash table, etc. Maybe best
to give it a try on benchmarking.

Cheers,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index fde4068..9b0ab48 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -105,7 +105,11 @@  struct nf_conn {
 
 #if defined(CONFIG_NF_CONNTRACK_MARK)
 	u_int32_t mark;
-#endif
+
+#ifdef CONFIG_NET_SCHED
+	u_int16_t tc_index;
+#endif /* CONFIG_NET_SCHED */
+#endif /* CONFIG_NF_CONNTRACK_MARK */
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 	u_int32_t secmark;
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 1d973d2..fedaaab 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -22,6 +22,7 @@  header-y += xt_CHECKSUM.h
 header-y += xt_CLASSIFY.h
 header-y += xt_CONNMARK.h
 header-y += xt_CONNSECMARK.h
+header-y += xt_CONNTCINDEX.h
 header-y += xt_CT.h
 header-y += xt_DSCP.h
 header-y += xt_HMARK.h
@@ -33,6 +34,7 @@  header-y += xt_NFLOG.h
 header-y += xt_NFQUEUE.h
 header-y += xt_RATEEST.h
 header-y += xt_SECMARK.h
+header-y += xt_TCINDEX.h
 header-y += xt_TCPMSS.h
 header-y += xt_TCPOPTSTRIP.h
 header-y += xt_TEE.h
@@ -46,6 +48,7 @@  header-y += xt_connbytes.h
 header-y += xt_connlabel.h
 header-y += xt_connlimit.h
 header-y += xt_connmark.h
+header-y += xt_conntcindex.h
 header-y += xt_conntrack.h
 header-y += xt_cpu.h
 header-y += xt_dccp.h
@@ -81,6 +84,7 @@  header-y += xt_socket.h
 header-y += xt_state.h
 header-y += xt_statistic.h
 header-y += xt_string.h
+header-y += xt_tcindex.h
 header-y += xt_tcpmss.h
 header-y += xt_tcpudp.h
 header-y += xt_time.h
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 319f471..b211bb8 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -107,6 +107,7 @@  enum ip_conntrack_events {
 	IPCT_NATSEQADJ = IPCT_SEQADJ,
 	IPCT_SECMARK,		/* new security mark has been set */
 	IPCT_LABEL,		/* new connlabel has been set */
+	IPCT_TCINDEX,		/* new tc_index has been set */
 };
 
 enum ip_conntrack_expect_events {
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index c1a4e144..cfdd15f 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -53,6 +53,7 @@  enum ctattr_type {
 	CTA_MARK_MASK,
 	CTA_LABELS,
 	CTA_LABELS_MASK,
+	CTA_TC_INDEX,
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
diff --git a/include/uapi/linux/netfilter/xt_CONNTCINDEX.h b/include/uapi/linux/netfilter/xt_CONNTCINDEX.h
new file mode 100644
index 0000000..96eccf3
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_CONNTCINDEX.h
@@ -0,0 +1,6 @@ 
+#ifndef _XT_CONNTCINDEX_H_target
+#define _XT_CONNTCINDEX_H_target
+
+#include <linux/netfilter/xt_conntcindex.h>
+
+#endif /*_XT_CONNTCINDEX_H_target*/
diff --git a/include/uapi/linux/netfilter/xt_TCINDEX.h b/include/uapi/linux/netfilter/xt_TCINDEX.h
new file mode 100644
index 0000000..a35af8b
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_TCINDEX.h
@@ -0,0 +1,6 @@ 
+#ifndef _XT_TCINDEX_H_target
+#define _XT_TCINDEX_H_target
+
+#include <linux/netfilter/xt_tcindex.h>
+
+#endif /*_XT_TCINDEX_H_target */
diff --git a/include/uapi/linux/netfilter/xt_conntcindex.h b/include/uapi/linux/netfilter/xt_conntcindex.h
new file mode 100644
index 0000000..d82329a
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_conntcindex.h
@@ -0,0 +1,31 @@ 
+#ifndef _XT_CONNTCINDEX_H
+#define _XT_CONNTCINDEX_H
+
+#include <linux/types.h>
+
+/* Copyright (C) 2015 Allied Telesis Labs NZ
+ * by Luuk Paulussen <luuk.paulussen@alliedtelesis.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+enum {
+	XT_CONNTCINDEX_SET = 0,
+	XT_CONNTCINDEX_SAVE,
+	XT_CONNTCINDEX_RESTORE
+};
+
+struct xt_conntcindex_tginfo1 {
+	__u16 ctmark, ctmask, nfmask;
+	__u8 mode;
+};
+
+struct xt_conntcindex_mtinfo1 {
+	__u16 mark, mask;
+	__u8 invert;
+};
+
+#endif /*_XT_CONNTCINDEX_H*/
diff --git a/include/uapi/linux/netfilter/xt_tcindex.h b/include/uapi/linux/netfilter/xt_tcindex.h
new file mode 100644
index 0000000..cb012fa
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_tcindex.h
@@ -0,0 +1,15 @@ 
+#ifndef _XT_TCINDEX_H
+#define _XT_TCINDEX_H
+
+#include <linux/types.h>
+
+struct xt_tcindex_tginfo1 {
+	__u16 mark, mask;
+};
+
+struct xt_tcindex_mtinfo1 {
+	__u16 mark, mask;
+	__u8 invert;
+};
+
+#endif /*_XT_TCINDEX_H*/
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4692782..3bff4dd 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -603,6 +603,35 @@  config NETFILTER_XT_CONNMARK
 	ctmark), similarly to the packet mark (nfmark). Using this
 	target and match, you can set and match on this mark.
 
+config NETFILTER_XT_TCINDEX
+	tristate 'tc_index mark target and match support'
+	depends on NETFILTER_ADVANCED
+	depends on NET_SCHED
+	default n
+	---help---
+	This option adds the "TCINDEX" target and "tcindex" match.
+
+	tcindex matching allows you to match packets based on the "tc_index" value
+	in the packet.
+	The target allows you to create rules in the "mangle" table which alter
+	the tc_index field associated with the packet.
+
+	This is an alternative to setting the tc_index field based on the priority
+	fields of the incoming traffic.  In traffic control, this mark can be matched
+	using the tcindex filter.
+
+config NETFILTER_XT_CONNTCINDEX
+	tristate 'ct tc_index mark target and match support'
+	depends on NF_CONNTRACK_MARK
+	depends on NETFILTER_ADVANCED
+	depends on NET_SCHED
+	default n
+	---help---
+	This option adds the "CONNTCINDEX" target and "conntcindex" match.
+
+	Using this target and match, you can set and match on the tc_index mark in the
+	connection.  This can also be saved/restored to the packet tc_index mark.
+
 config NETFILTER_XT_SET
 	tristate 'set target and match support'
 	depends on IP_SET
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 7638c36..7492ef6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -96,6 +96,8 @@  obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 # combos
 obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_TCINDEX) += xt_tcindex.o
+obj-$(CONFIG_NETFILTER_XT_CONNTCINDEX) += xt_conntcindex.o
 obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
 obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9f52729..a1db545 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -314,6 +314,21 @@  nla_put_failure:
 #define ctnetlink_dump_mark(a, b) (0)
 #endif
 
+#if defined CONFIG_NET_SCHED && defined CONFIG_NF_CONNTRACK_MARK
+static inline int	/* 0 on success, -1 when the attribute could not be added */
+ctnetlink_dump_tcindex(struct sk_buff *skb, const struct nf_conn *ct)
+{	/* Emit ct->tc_index as a 16-bit network-order CTA_TC_INDEX attribute. */
+	if (nla_put_be16(skb, CTA_TC_INDEX, htons(ct->tc_index))) /* 16-bit value: htons(); htonl() would be truncated to the wrong half */
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_tcindex(a, b) (0)
+#endif
+
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 static inline int
 ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
@@ -521,6 +536,7 @@  ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
+	    ctnetlink_dump_tcindex(skb, ct) < 0 ||
 	    ctnetlink_dump_secctx(skb, ct) < 0 ||
 	    ctnetlink_dump_labels(skb, ct) < 0 ||
 	    ctnetlink_dump_id(skb, ct) < 0 ||
@@ -749,7 +765,13 @@  ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	if ((events & (1 << IPCT_MARK) || ct->mark)
 	    && ctnetlink_dump_mark(skb, ct) < 0)
 		goto nla_put_failure;
-#endif
+
+#ifdef CONFIG_NET_SCHED
+	if ((events & (1 << IPCT_TCINDEX) || ct->tc_index)
+	    && ctnetlink_dump_tcindex(skb, ct) < 0)
+		goto nla_put_failure;
+#endif /* CONFIG_NET_SCHED */
+#endif /* CONFIG_NF_CONNTRACK_MARK */
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
@@ -1092,6 +1114,7 @@  static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
 				    .len = NF_CT_LABELS_MAX_SIZE },
 	[CTA_LABELS_MASK]	= { .type = NLA_BINARY,
 				    .len = NF_CT_LABELS_MAX_SIZE },
+	[CTA_TC_INDEX]		= { .type = NLA_U16 },
 };
 
 static int ctnetlink_flush_conntrack(struct net *net,
@@ -1697,7 +1720,13 @@  ctnetlink_change_conntrack(struct nf_conn *ct,
 #if defined(CONFIG_NF_CONNTRACK_MARK)
 	if (cda[CTA_MARK])
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
-#endif
+#ifdef CONFIG_NET_SCHED
+	if (cda[CTA_TC_INDEX])
+		ct->tc_index = ntohs(nla_get_be16(cda[CTA_TC_INDEX]));
+#endif /* CONFIG_NET_SCHED */
+#endif /* CONFIG_NF_CONNTRACK_MARK */
+
+
 
 	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
 		err = ctnetlink_change_seq_adj(ct, cda);
@@ -1824,7 +1853,11 @@  ctnetlink_create_conntrack(struct net *net,
 #if defined(CONFIG_NF_CONNTRACK_MARK)
 	if (cda[CTA_MARK])
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
-#endif
+#ifdef CONFIG_NET_SCHED
+	if (cda[CTA_TC_INDEX])
+		ct->tc_index = ntohs(nla_get_be16(cda[CTA_TC_INDEX]));
+#endif /* CONFIG_NET_SCHED */
+#endif /* CONFIG_NF_CONNTRACK_MARK */
 
 	/* setup master conntrack: this is a confirmed expectation */
 	if (cda[CTA_TUPLE_MASTER]) {
diff --git a/net/netfilter/xt_conntcindex.c b/net/netfilter/xt_conntcindex.c
new file mode 100644
index 0000000..838dacf
--- /dev/null
+++ b/net/netfilter/xt_conntcindex.c
@@ -0,0 +1,165 @@ 
+/*
+ *	xt_conntcindex - Netfilter module to operate on connection tc_index marks
+ *
+ *	Copyright (C) 2015 Allied Telesis Labs NZ
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Heavily based on xt_connmark.c
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_conntcindex.h>
+
+MODULE_AUTHOR("Luuk Paulussen <luuk.paulussen@alliedtelesis.co.nz>");
+MODULE_DESCRIPTION("Xtables: connection tc_index mark operations");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_CONNTCINDEX");
+MODULE_ALIAS("ip6t_CONNTCINDEX");
+MODULE_ALIAS("ipt_conntcindex");
+MODULE_ALIAS("ip6t_conntcindex");
+
+static unsigned int	/* CONNTCINDEX target: every path returns XT_CONTINUE */
+conntcindex_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_conntcindex_tginfo1 *info = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	u_int32_t newmark;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)	/* no conntrack entry for this skb: nothing is read or written */
+		return XT_CONTINUE;
+
+	switch (info->mode) {	/* no default: any other mode value changes nothing */
+	case XT_CONNTCINDEX_SET:	/* clear the ctmask bits of ct->tc_index, then XOR in ctmark */
+		newmark = (ct->tc_index & ~info->ctmask) ^ info->ctmark;
+		if (ct->tc_index != newmark) {	/* store and cache IPCT_TCINDEX only on change */
+			ct->tc_index = newmark;
+			nf_conntrack_event_cache(IPCT_TCINDEX, ct);
+		}
+		break;
+	case XT_CONNTCINDEX_SAVE:	/* packet -> connection */
+		newmark = (ct->tc_index & ~info->ctmask) ^	/* ct bits outside ctmask are kept ... */
+		          (skb->tc_index & info->nfmask);	/* ... XORed with the skb bits selected by nfmask */
+		if (ct->tc_index != newmark) {	/* store and cache IPCT_TCINDEX only on change */
+			ct->tc_index = newmark;
+			nf_conntrack_event_cache(IPCT_TCINDEX, ct);
+		}
+		break;
+	case XT_CONNTCINDEX_RESTORE:	/* connection -> packet; ct is not modified */
+		newmark = (skb->tc_index & ~info->nfmask) ^	/* skb bits outside nfmask are kept ... */
+		          (ct->tc_index & info->ctmask);	/* ... XORed with the ct bits selected by ctmask */
+		skb->tc_index = newmark;
+		break;
+	}
+
+	return XT_CONTINUE;
+}
+
+static int conntcindex_tg_check(const struct xt_tgchk_param *par)
+{
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return ret;
+}
+
+static void conntcindex_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static bool	/* conntcindex match: tests the tc_index stored in the skb's conntrack entry */
+conntcindex_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_conntcindex_mtinfo1 *info = par->matchinfo;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return false;	/* no conntrack entry: no match, info->invert is not applied */
+
+	return ((ct->tc_index & info->mask) == info->mark) ^ info->invert;	/* masked compare XORed with invert (presumably 0 or 1 - confirm against userspace) */
+}
+
+static int conntcindex_mt_check(const struct xt_mtchk_param *par)
+{
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return ret;
+}
+
+static void conntcindex_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target conntcindex_tg_reg __read_mostly = {
+	.name           = "CONNTCINDEX",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.checkentry     = conntcindex_tg_check,
+	.target         = conntcindex_tg,
+	.targetsize     = sizeof(struct xt_conntcindex_tginfo1),
+	.destroy        = conntcindex_tg_destroy,
+	.me             = THIS_MODULE,
+};
+
+static struct xt_match conntcindex_mt_reg __read_mostly = {
+	.name           = "conntcindex",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.checkentry     = conntcindex_mt_check,
+	.match          = conntcindex_mt,
+	.matchsize      = sizeof(struct xt_conntcindex_mtinfo1),
+	.destroy        = conntcindex_mt_destroy,
+	.me             = THIS_MODULE,
+};
+
+static int __init conntcindex_mt_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&conntcindex_tg_reg);
+	if (ret < 0)
+		return ret;
+	ret = xt_register_match(&conntcindex_mt_reg);
+	if (ret < 0) {
+		xt_unregister_target(&conntcindex_tg_reg);
+		return ret;
+	}
+	return 0;
+}
+
+static void __exit conntcindex_mt_exit(void)
+{
+	xt_unregister_match(&conntcindex_mt_reg);
+	xt_unregister_target(&conntcindex_tg_reg);
+}
+
+module_init(conntcindex_mt_init);
+module_exit(conntcindex_mt_exit);
diff --git a/net/netfilter/xt_tcindex.c b/net/netfilter/xt_tcindex.c
new file mode 100644
index 0000000..7de29cd
--- /dev/null
+++ b/net/netfilter/xt_tcindex.c
@@ -0,0 +1,84 @@ 
+/*
+ *	xt_tcindex - Netfilter module to match/tag on tc_index mark value
+ *
+ *	(C) 2015 Allied Telesis Labs NZ.
+  *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ *
+ *	Heavily based on xt_mark.c
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/xt_tcindex.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Luuk Paulussen <luuk.paulussen@alliedtelesis.co.nz>");
+MODULE_DESCRIPTION("Xtables: packet tc_index mark operations");
+MODULE_ALIAS("ipt_tcindex");
+MODULE_ALIAS("ip6t_tcindex");
+MODULE_ALIAS("ipt_TCINDEX");
+MODULE_ALIAS("ip6t_TCINDEX");
+
+static unsigned int	/* TCINDEX target: rewrites skb->tc_index, always returns XT_CONTINUE */
+tcindex_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tcindex_tginfo1 *info = par->targinfo;
+
+	skb->tc_index = (skb->tc_index & ~info->mask) ^ info->mark;	/* clear the mask bits, then XOR in mark */
+	return XT_CONTINUE;
+}
+
+static bool	/* tcindex match: tests the packet's own skb->tc_index */
+tcindex_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_tcindex_mtinfo1 *info = par->matchinfo;
+
+	return ((skb->tc_index & info->mask) == info->mark) ^ info->invert;	/* masked compare XORed with invert (presumably 0 or 1 - confirm against userspace) */
+}
+
+static struct xt_target tcindex_tg_reg __read_mostly = {
+	.name           = "TCINDEX",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.target         = tcindex_tg,
+	.targetsize     = sizeof(struct xt_tcindex_tginfo1),
+	.me             = THIS_MODULE,
+};
+
+static struct xt_match tcindex_mt_reg __read_mostly = {
+	.name           = "tcindex",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.match          = tcindex_mt,
+	.matchsize      = sizeof(struct xt_tcindex_mtinfo1),
+	.me             = THIS_MODULE,
+};
+
+static int __init tcindex_mt_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&tcindex_tg_reg);
+	if (ret < 0)
+		return ret;
+	ret = xt_register_match(&tcindex_mt_reg);
+	if (ret < 0) {
+		xt_unregister_target(&tcindex_tg_reg);
+		return ret;
+	}
+	return 0;
+}
+
+static void __exit tcindex_mt_exit(void)
+{
+	xt_unregister_match(&tcindex_mt_reg);
+	xt_unregister_target(&tcindex_tg_reg);
+}
+
+module_init(tcindex_mt_init);
+module_exit(tcindex_mt_exit);