diff mbox series

[nf-next] netfilter: Add native tproxy support for nf_tables

Message ID 20180620104130.2217-2-ecklm94@gmail.com
State Changes Requested
Delegated to: Pablo Neira
Headers show
Series [nf-next] netfilter: Add native tproxy support for nf_tables | expand

Commit Message

Máté Eckl June 20, 2018, 10:41 a.m. UTC
This patch is built on the commit not applied yet with the title:
	netfilter: Move nf_tproxy_assign_sock to nf_tproxy.h

-- 8< --
A great portion of the code is taken from xt_TPROXY.c

There are some changes compared to the iptables implementation:
 - tproxy statement is not terminal here
 - no transport protocol criterion is necessary to set target ip address

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
---
 include/uapi/linux/netfilter/nf_tables.h |  16 ++
 net/netfilter/Kconfig                    |   8 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_tproxy.c               | 297 +++++++++++++++++++++++
 4 files changed, 322 insertions(+)
 create mode 100644 net/netfilter/nft_tproxy.c

Comments

Florian Westphal June 20, 2018, 11:36 a.m. UTC | #1
Máté Eckl <ecklm94@gmail.com> wrote:
> There are some changes compared to the iptables implementation:
>  - tproxy statement is not terminal here
>  - no transport protocol criterion is necessary to set target ip address

> +	const struct nft_tproxy *priv = nft_expr_priv(expr);
> +	struct sk_buff *skb = pkt->skb;
> +	struct sock *sk = skb->sk;
> +	const struct iphdr *iph = ip_hdr(skb);
> +	struct udphdr _hdr, *hp;
> +	__be32 taddr = 0;
> +	__be16 tport = 0;
> +
> +	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
> +	if (!hp)
> +		regs->verdict.code = NFT_BREAK;

This is missing needed 'return'.

> +	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
> +	if (sk && sk->sk_state == TCP_TIME_WAIT)
> +		/* reopening a TIME_WAIT connection needs special handling */
> +		sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
> +	else if (!sk)
> +		/* no, there's no established connection, check if
> +		 * there's a listener on the redirected addr/port */
> +		sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, hp, iph->protocol,
> +					   iph->saddr, taddr,
> +					   hp->source, tport,
> +					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);

This looks like its subtly broken, inherited from xt_TPROXY.
Above skb_header_pointer uses sizeof(udphdr) only, but nf_tproxy_get_sock_v4
assumes it gets tcphdr (it checks th->doff, and that might be garbage).

So I suggest to remove 'hp' argument from nf_tproxy_get_sock_v4/6 and repeat
the skb_header_pointer() call there, using struct tcphdr size as backend
storage for TCP case.

This will need to be a extra patch vs. nf.git tree.

> +	if (sk && nf_tproxy_sk_is_transparent(sk)) {
> +		nf_tproxy_assign_sock(skb, sk);
> +	}

No need for extra { }, see scripts/checkpatch.pl (no need to follow
every advice that script provides of course, decide for yourself).

> +	/* NOTE: assign_sock consumes our sk reference */
> +	if (sk && nf_tproxy_sk_is_transparent(sk)) {
> +		nf_tproxy_assign_sock(skb, sk);
> +		return;
> +	}
> +
> +	regs->verdict.code = NFT_BREAK;
> +}

So ipv4 and ipv6 behavce differenty?
Why does one set BREAK but not the other?

I *guess* its best to BREAK in case sk wasn't assigned for both.

> +	plen = sizeof(u16);
> +	if (tb[NFTA_TPROXY_REG_PORT]) {
> +		priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
> +		err = nft_validate_register_load(priv->sreg_port, plen);

I think you can remove plen and just pass sizeof(__be16).
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso June 20, 2018, 12:18 p.m. UTC | #2
A few comments on top of Florian's.

On Wed, Jun 20, 2018 at 12:41:29PM +0200, Máté Eckl wrote:
[...]
> +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> +static void nft_tproxy_eval_v6(const struct nft_expr *expr,
> +			    struct nft_regs *regs,
> +			    const struct nft_pktinfo *pkt)

Hm, better place IPv6 code in net/netfilter/nft_tproxy_ipv6.c ?

@Florian, do you prefer this monolitic style maybe?

I mean, I'll be fine either way.

[...]
> +static int nft_tproxy_init(const struct nft_ctx *ctx,
> +			   const struct nft_expr *expr,
> +			   const struct nlattr * const tb[])
> +{

I'm missing nf_defrag_ipv6_enable() calls from your _init() path.

> +	struct nft_tproxy *priv = nft_expr_priv(expr);
> +	unsigned int alen = 0, plen = 0;
> +	int err;
> +
> +	if (!tb[NFTA_TPROXY_FAMILY])
> +		return -EINVAL;
> +
> +	switch(ctx->family) {
> +	case NFPROTO_IPV4:
> +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> +	case NFPROTO_IPV6:
> +#endif
> +	case NFPROTO_INET:
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
> +	if ((priv->family == NFPROTO_IPV4 && ctx->family == NFPROTO_IPV6) ||
> +	    (priv->family == NFPROTO_IPV6 && ctx->family == NFPROTO_IPV4))
> +		return -EINVAL;
> +
> +	switch (priv->family) {
> +	case NFPROTO_IPV4:
> +		alen = FIELD_SIZEOF(union nf_inet_addr, in);
> +		break;
> +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> +	case NFPROTO_IPV6:
> +		alen = FIELD_SIZEOF(union nf_inet_addr, in6);
> +		break;
> +#endif
> +	case NFPROTO_INET:
> +		/* No address is specified here */
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (tb[NFTA_TPROXY_REG_ADDR]) {
> +		priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
> +		err = nft_validate_register_load(priv->sreg_addr, alen);
> +		if (err < 0)
> +			return err;
> +	}
> +
> +	plen = sizeof(u16);
> +	if (tb[NFTA_TPROXY_REG_PORT]) {
> +		priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
> +		err = nft_validate_register_load(priv->sreg_port, plen);
> +		if (err < 0)
> +			return err;
> +	}

How does this work if neither if neither address nor port are set?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal June 20, 2018, 12:40 p.m. UTC | #3
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> A few comments on top of Florian's.
> 
> On Wed, Jun 20, 2018 at 12:41:29PM +0200, Máté Eckl wrote:
> [...]
> > +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> > +static void nft_tproxy_eval_v6(const struct nft_expr *expr,
> > +			    struct nft_regs *regs,
> > +			    const struct nft_pktinfo *pkt)
> 
> Hm, better place IPv6 code in net/netfilter/nft_tproxy_ipv6.c ?
> 
> @Florian, do you prefer this monolitic style maybe?

Yes, I think we do way to many silly tinymodules.
A kernel module < 4k is really silly...

Alternative is to also split the core infra (used by nft and xt_TPROXY)
but I don't want to overengineer this.

> I'm missing nf_defrag_ipv6_enable() calls from your _init() path.

Yes, ineed.
Note that i plan to kill nf_defrag as separate module and replace it
by direct defragmentation calls at one point (just FYI, no action
needed).
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso June 20, 2018, 12:50 p.m. UTC | #4
On Wed, Jun 20, 2018 at 02:40:09PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > A few comments on top of Florian's.
> > 
> > On Wed, Jun 20, 2018 at 12:41:29PM +0200, Máté Eckl wrote:
> > [...]
> > > +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> > > +static void nft_tproxy_eval_v6(const struct nft_expr *expr,
> > > +			    struct nft_regs *regs,
> > > +			    const struct nft_pktinfo *pkt)
> > 
> > Hm, better place IPv6 code in net/netfilter/nft_tproxy_ipv6.c ?
> > 
> > @Florian, do you prefer this monolitic style maybe?
> 
> Yes, I think we do way to many silly tinymodules.
> A kernel module < 4k is really silly...
> 
> Alternative is to also split the core infra (used by nft and xt_TPROXY)
> but I don't want to overengineer this.

That's fine, as long as this doesn't pull direct dependencies, this is
fine.

> > I'm missing nf_defrag_ipv6_enable() calls from your _init() path.
> 
> Yes, ineed.
> Note that i plan to kill nf_defrag as separate module and replace it
> by direct defragmentation calls at one point (just FYI, no action
> needed).

Thanks for reminder.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
kernel test robot June 20, 2018, 3:44 p.m. UTC | #5
Hi Máté,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on nf-next/master]

url:    https://github.com/0day-ci/linux/commits/M-t-Eckl/netfilter-Add-native-tproxy-support-for-nf_tables/20180620-222749
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git master
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 8.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=8.1.0 make.cross ARCH=ia64 

All errors (new ones prefixed by >>):

   net/netfilter/nft_tproxy.c: In function 'nft_tproxy_eval_v4':
>> net/netfilter/nft_tproxy.c:65:3: error: implicit declaration of function 'nf_tproxy_assign_sock'; did you mean 'nf_tproxy_get_sock_v6'? [-Werror=implicit-function-declaration]
      nf_tproxy_assign_sock(skb, sk);
      ^~~~~~~~~~~~~~~~~~~~~
      nf_tproxy_get_sock_v6
   cc1: some warnings being treated as errors

vim +65 net/netfilter/nft_tproxy.c

    16	
    17	static void nft_tproxy_eval_v4(const struct nft_expr *expr,
    18				    struct nft_regs *regs,
    19				    const struct nft_pktinfo *pkt)
    20	{
    21		const struct nft_tproxy *priv = nft_expr_priv(expr);
    22		struct sk_buff *skb = pkt->skb;
    23		struct sock *sk = skb->sk;
    24		const struct iphdr *iph = ip_hdr(skb);
    25		struct udphdr _hdr, *hp;
    26		__be32 taddr = 0;
    27		__be16 tport = 0;
    28	
    29		hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
    30		if (!hp)
    31			regs->verdict.code = NFT_BREAK;
    32	
    33		/* check if there's an ongoing connection on the packet
    34		 * addresses, this happens if the redirect already happened
    35		 * and the current packet belongs to an already established
    36		 * connection */
    37		sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, hp, iph->protocol,
    38					   iph->saddr, iph->daddr,
    39					   hp->source, hp->dest,
    40					   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
    41	
    42		if (priv->sreg_addr)
    43			taddr = regs->data[priv->sreg_addr];
    44		taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
    45	
    46		if (priv->sreg_port) {
    47			tport = regs->data[priv->sreg_port];
    48		}
    49		if (!tport)
    50			tport = hp->dest;
    51	
    52		/* UDP has no TCP_TIME_WAIT state, so we never enter here */
    53		if (sk && sk->sk_state == TCP_TIME_WAIT)
    54			/* reopening a TIME_WAIT connection needs special handling */
    55			sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
    56		else if (!sk)
    57			/* no, there's no established connection, check if
    58			 * there's a listener on the redirected addr/port */
    59			sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, hp, iph->protocol,
    60						   iph->saddr, taddr,
    61						   hp->source, tport,
    62						   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
    63	
    64		if (sk && nf_tproxy_sk_is_transparent(sk)) {
  > 65			nf_tproxy_assign_sock(skb, sk);
    66		}
    67	}
    68	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Máté Eckl June 21, 2018, 2:23 p.m. UTC | #6
On Wed, Jun 20, 2018 at 01:36:49PM +0200, Florian Westphal wrote:
> Máté Eckl <ecklm94@gmail.com> wrote:
> > There are some changes compared to the iptables implementation:
> >  - tproxy statement is not terminal here
> >  - no transport protocol criterion is necessary to set target ip address
> 
> > +	const struct nft_tproxy *priv = nft_expr_priv(expr);
> > +	struct sk_buff *skb = pkt->skb;
> > +	struct sock *sk = skb->sk;
> > +	const struct iphdr *iph = ip_hdr(skb);
> > +	struct udphdr _hdr, *hp;
> > +	__be32 taddr = 0;
> > +	__be16 tport = 0;
> > +
> > +	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
> > +	if (!hp)
> > +		regs->verdict.code = NFT_BREAK;
> 
> This is missing needed 'return'.

Fixed.

> 
> > +	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
> > +	if (sk && sk->sk_state == TCP_TIME_WAIT)
> > +		/* reopening a TIME_WAIT connection needs special handling */
> > +		sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
> > +	else if (!sk)
> > +		/* no, there's no established connection, check if
> > +		 * there's a listener on the redirected addr/port */
> > +		sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, hp, iph->protocol,
> > +					   iph->saddr, taddr,
> > +					   hp->source, tport,
> > +					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
> 
> This looks like its subtly broken, inherited from xt_TPROXY.
> Above skb_header_pointer uses sizeof(udphdr) only, but nf_tproxy_get_sock_v4
> assumes it gets tcphdr (it checks th->doff, and that might be garbage).

I thought about why iptables uses udphdr consequently and I think it does
because we do not nead other than source and destination address and port which
is part of the udp header too at the same position.

I think they paid attention to this. nf_tproxy_get_sock_v4 treats that pointer
as a tcphdr indeed, but it only uses tcp-related attributes and functions if ip
protocol is IPPROTO_TCP, so what you described does not happen with an udp
packet.

> So I suggest to remove 'hp' argument from nf_tproxy_get_sock_v4/6 and repeat
> the skb_header_pointer() call there, using struct tcphdr size as backend
> storage for TCP case.
> 
> This will need to be a extra patch vs. nf.git tree.
> 
> > +	if (sk && nf_tproxy_sk_is_transparent(sk)) {
> > +		nf_tproxy_assign_sock(skb, sk);
> > +	}
> 
> No need for extra { }, see scripts/checkpatch.pl (no need to follow
> every advice that script provides of course, decide for yourself).

I thought I removed all of these. I'll pay more attention to these.

> 
> > +	/* NOTE: assign_sock consumes our sk reference */
> > +	if (sk && nf_tproxy_sk_is_transparent(sk)) {
> > +		nf_tproxy_assign_sock(skb, sk);
> > +		return;
> > +	}
> > +
> > +	regs->verdict.code = NFT_BREAK;
> > +}
> 
> So ipv4 and ipv6 behavce differenty?
> Why does one set BREAK but not the other?
> 
> I *guess* its best to BREAK in case sk wasn't assigned for both.

It's an error of mine. I'll review the v4/v6 once more and eliminate these
differences.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal June 21, 2018, 2:31 p.m. UTC | #7
Máté Eckl <ecklm94@gmail.com> wrote:
> > This looks like its subtly broken, inherited from xt_TPROXY.
> > Above skb_header_pointer uses sizeof(udphdr) only, but nf_tproxy_get_sock_v4
> > assumes it gets tcphdr (it checks th->doff, and that might be garbage).
> 
> I thought about why iptables uses udphdr consequently and I think it does
> because we do not nead other than source and destination address and port which
> is part of the udp header too at the same position.

It does for LISTEN case, see __tcp_hdrlen() usage in
nf_tproxy_get_sock_v4.

> I think they paid attention to this. nf_tproxy_get_sock_v4 treats that pointer
> as a tcphdr indeed, but it only uses tcp-related attributes and functions if ip
> protocol is IPPROTO_TCP, so what you described does not happen with an udp
> packet.

It doesn't happen with an udp packet.  But in case tcp header wasn't in
linar area (skb->data), but in pagefrags (or split), it will be copied
by skb_header_pointer to __udphdr (on stack), so in that case we then
get out-of-bounds read access.

Hence my suggestion to remove 'hp' arg and repeat skb_header_pointer()
call with struct tcphdr.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Máté Eckl June 22, 2018, 7:33 a.m. UTC | #8
On Wed, Jun 20, 2018 at 02:18:25PM +0200, Pablo Neira Ayuso wrote:
> A few comments on top of Florian's.
> 
> On Wed, Jun 20, 2018 at 12:41:29PM +0200, Máté Eckl wrote:
> [...]
> > +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> > +static void nft_tproxy_eval_v6(const struct nft_expr *expr,
> > +			    struct nft_regs *regs,
> > +			    const struct nft_pktinfo *pkt)
> 
> Hm, better place IPv6 code in net/netfilter/nft_tproxy_ipv6.c ?
> 
> @Florian, do you prefer this monolitic style maybe?
> 
> I mean, I'll be fine either way.
> 
> [...]
> > +static int nft_tproxy_init(const struct nft_ctx *ctx,
> > +			   const struct nft_expr *expr,
> > +			   const struct nlattr * const tb[])
> > +{
> 
> I'm missing nf_defrag_ipv6_enable() calls from your _init() path.

I added them.

> > +	struct nft_tproxy *priv = nft_expr_priv(expr);
> > +	unsigned int alen = 0, plen = 0;
> > +	int err;
> > +
> > +	if (!tb[NFTA_TPROXY_FAMILY])
> > +		return -EINVAL;
> > +
> > +	switch(ctx->family) {
> > +	case NFPROTO_IPV4:
> > +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> > +	case NFPROTO_IPV6:
> > +#endif
> > +	case NFPROTO_INET:
> > +		break;
> > +	default:
> > +		return -EOPNOTSUPP;
> > +	}
> > +
> > +	priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
> > +	if ((priv->family == NFPROTO_IPV4 && ctx->family == NFPROTO_IPV6) ||
> > +	    (priv->family == NFPROTO_IPV6 && ctx->family == NFPROTO_IPV4))
> > +		return -EINVAL;
> > +
> > +	switch (priv->family) {
> > +	case NFPROTO_IPV4:
> > +		alen = FIELD_SIZEOF(union nf_inet_addr, in);
> > +		break;
> > +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
> > +	case NFPROTO_IPV6:
> > +		alen = FIELD_SIZEOF(union nf_inet_addr, in6);
> > +		break;
> > +#endif
> > +	case NFPROTO_INET:
> > +		/* No address is specified here */
> > +		break;
> > +	default:
> > +		return -EOPNOTSUPP;
> > +	}
> > +
> > +	if (tb[NFTA_TPROXY_REG_ADDR]) {
> > +		priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
> > +		err = nft_validate_register_load(priv->sreg_addr, alen);
> > +		if (err < 0)
> > +			return err;
> > +	}
> > +
> > +	plen = sizeof(u16);
> > +	if (tb[NFTA_TPROXY_REG_PORT]) {
> > +		priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
> > +		err = nft_validate_register_load(priv->sreg_port, plen);
> > +		if (err < 0)
> > +			return err;
> > +	}
> 
> How does this work if neither if neither address nor port are set?

It uses the original destination ip address and port for socket lookup. Florian
already drew my attention to that it does not make any sense, so I'll remove this
scenario, address or port will have to be specified.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Máté Eckl June 22, 2018, 3:48 p.m. UTC | #9
On Thu, Jun 21, 2018 at 04:31:48PM +0200, Florian Westphal wrote:
> Máté Eckl <ecklm94@gmail.com> wrote:
> > > This looks like its subtly broken, inherited from xt_TPROXY.
> > > Above skb_header_pointer uses sizeof(udphdr) only, but nf_tproxy_get_sock_v4
> > > assumes it gets tcphdr (it checks th->doff, and that might be garbage).
> > 
> > I thought about why iptables uses udphdr consequently and I think it does
> > because we do not nead other than source and destination address and port which
> > is part of the udp header too at the same position.
> 
> It does for LISTEN case, see __tcp_hdrlen() usage in
> nf_tproxy_get_sock_v4.
> 
> > I think they paid attention to this. nf_tproxy_get_sock_v4 treats that pointer
> > as a tcphdr indeed, but it only uses tcp-related attributes and functions if ip
> > protocol is IPPROTO_TCP, so what you described does not happen with an udp
> > packet.
> 
> It doesn't happen with an udp packet.  But in case tcp header wasn't in
> linar area (skb->data), but in pagefrags (or split), it will be copied
> by skb_header_pointer to __udphdr (on stack), so in that case we then
> get out-of-bounds read access.
> 
> Hence my suggestion to remove 'hp' arg and repeat skb_header_pointer()
> call with struct tcphdr.

Ok, I made the patch. Do you agree with this commit message? This linearisation
thing is not clear to me yet.

	netfilter: Refactor nf_tproxy_get_sock_v{4,6}

	This patch fixes a silent out-of-bound read possibility that was present
	because of the misuse of this function.

	Mostly it was called with a struct udphdr *hp which had only the udphdr
	part linearized by the skb_header_pointer, however the
	nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for
	tcp specific flags may be invalid.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal June 22, 2018, 4:24 p.m. UTC | #10
Máté Eckl <ecklm94@gmail.com> wrote:
> On Thu, Jun 21, 2018 at 04:31:48PM +0200, Florian Westphal wrote:
> > Máté Eckl <ecklm94@gmail.com> wrote:
> > > > This looks like its subtly broken, inherited from xt_TPROXY.
> > > > Above skb_header_pointer uses sizeof(udphdr) only, but nf_tproxy_get_sock_v4
> > > > assumes it gets tcphdr (it checks th->doff, and that might be garbage).
> > > 
> > > I thought about why iptables uses udphdr consequently and I think it does
> > > because we do not nead other than source and destination address and port which
> > > is part of the udp header too at the same position.
> > 
> > It does for LISTEN case, see __tcp_hdrlen() usage in
> > nf_tproxy_get_sock_v4.
> > 
> > > I think they paid attention to this. nf_tproxy_get_sock_v4 treats that pointer
> > > as a tcphdr indeed, but it only uses tcp-related attributes and functions if ip
> > > protocol is IPPROTO_TCP, so what you described does not happen with an udp
> > > packet.
> > 
> > It doesn't happen with an udp packet.  But in case tcp header wasn't in
> > linar area (skb->data), but in pagefrags (or split), it will be copied
> > by skb_header_pointer to __udphdr (on stack), so in that case we then
> > get out-of-bounds read access.
> > 
> > Hence my suggestion to remove 'hp' arg and repeat skb_header_pointer()
> > call with struct tcphdr.
> 
> Ok, I made the patch. Do you agree with this commit message? This linearisation
> thing is not clear to me yet.
> 
> 	netfilter: Refactor nf_tproxy_get_sock_v{4,6}
> 
> 	This patch fixes a silent out-of-bound read possibility that was present
> 	because of the misuse of this function.
> 
> 	Mostly it was called with a struct udphdr *hp which had only the udphdr
> 	part linearized by the skb_header_pointer, however the
> 	nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for
> 	tcp specific flags may be invalid.

Yes, looks ok.

sk_buff references packet data in two ways:

1. linear part, this referenced via skb->head and skb->data (head points
to start of buffer, skb->data is moved around as headers are
added/removed).

2. nonlinar part.  This is referenced via skb_shinfo area, and points to
pages of memory, i.e. an skb can be made up of dozens of pages.

It will typically be both for packets aggregated via GRO (receive
offloading) or built on transmit.

In this particular case (LISTEN lookup, we'll normally always see 1)
only, which explains why this wasn't noticed before, in case of 1)
current code is fine.

In second case, accessing skb->data[bignum] may cause out-of-bounds read,
as the data might reside in the "non-linear area", i.e. skb->head memory
is much smaller.

One has to use the 'big hammer', skb_linearize(), to force reallocation
of skb->head + copy of all pages' contents into it (very expensive,
might not work if memory is fragmented and allocation request is large),
or pskb_may_pull() with the amount of bytes you expect to be accessible
via skb->head[] (doesn't do anything if its already accessible, or
causes reallocation of skb->head if its too small, but will only 'pull'
all pages, just whatever was requested).

To get access to a (small) header, you can also use
skb_header_pointer(), which never reallocates skb->head.

If the requested access is already ok (because requested start offset +
size is accessible via skb->head), it just returns a pointer to the
offset in the linear area.

If not, it copies that requested size into the buffer that gets passed
to the function, and returns a pointer to that buffer rather than to
skb->head region.

In this case, we asked for sizeof(udphdr).
If sizeof(tcphdr) (yes, tcphdr) would be ok/in linear area, everything
is fine, the returned udp-header pointer can be re-used/treated like
tcp.

But in the second case, the pointer returned could be the address of the
_udphdr stack-buffer.
It would be fine if only accessing those tcp fields that are <=
sizeof(udphdr), but tcphdr->doff isn't within that region.

If you're interested in the gory details, look at skb_header_pointer and
pskb_may_pull() implementations.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Máté Eckl June 24, 2018, 12:36 p.m. UTC | #11
On Fri, Jun 22, 2018 at 06:24:51PM +0200, Florian Westphal wrote:
> Máté Eckl <ecklm94@gmail.com> wrote:
> > On Thu, Jun 21, 2018 at 04:31:48PM +0200, Florian Westphal wrote:
> > > Máté Eckl <ecklm94@gmail.com> wrote:
> > > > > This looks like its subtly broken, inherited from xt_TPROXY.
> > > > > Above skb_header_pointer uses sizeof(udphdr) only, but nf_tproxy_get_sock_v4
> > > > > assumes it gets tcphdr (it checks th->doff, and that might be garbage).
> > > > 
> > > > I thought about why iptables uses udphdr consequently and I think it does
> > > > because we do not nead other than source and destination address and port which
> > > > is part of the udp header too at the same position.
> > > 
> > > It does for LISTEN case, see __tcp_hdrlen() usage in
> > > nf_tproxy_get_sock_v4.
> > > 
> > > > I think they paid attention to this. nf_tproxy_get_sock_v4 treats that pointer
> > > > as a tcphdr indeed, but it only uses tcp-related attributes and functions if ip
> > > > protocol is IPPROTO_TCP, so what you described does not happen with an udp
> > > > packet.
> > > 
> > > It doesn't happen with an udp packet.  But in case tcp header wasn't in
> > > linar area (skb->data), but in pagefrags (or split), it will be copied
> > > by skb_header_pointer to __udphdr (on stack), so in that case we then
> > > get out-of-bounds read access.
> > > 
> > > Hence my suggestion to remove 'hp' arg and repeat skb_header_pointer()
> > > call with struct tcphdr.
> > 
> > Ok, I made the patch. Do you agree with this commit message? This linearisation
> > thing is not clear to me yet.
> > 
> > 	netfilter: Refactor nf_tproxy_get_sock_v{4,6}
> > 
> > 	This patch fixes a silent out-of-bound read possibility that was present
> > 	because of the misuse of this function.
> > 
> > 	Mostly it was called with a struct udphdr *hp which had only the udphdr
> > 	part linearized by the skb_header_pointer, however the
> > 	nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for
> > 	tcp specific flags may be invalid.
> 
> Yes, looks ok.
> 
> sk_buff references packet data in two ways:
> 
> 1. linear part, this referenced via skb->head and skb->data (head points
> to start of buffer, skb->data is moved around as headers are
> added/removed).
> 
> 2. nonlinar part.  This is referenced via skb_shinfo area, and points to
> pages of memory, i.e. an skb can be made up of dozens of pages.
> 
> It will typically be both for packets aggregated via GRO (receive
> offloading) or built on transmit.
> 
> In this particular case (LISTEN lookup, we'll normally always see 1)
> only, which explains why this wasn't noticed before, in case of 1)
> current code is fine.
> 
> In second case, accessing skb->data[bignum] may cause out-of-bounds read,
> as the data might reside in the "non-linear area", i.e. skb->head memory
> is much smaller.
> 
> One has to use the 'big hammer', skb_linearize(), to force reallocation
> of skb->head + copy of all pages' contents into it (very expensive,
> might not work if memory is fragmented and allocation request is large),
> or pskb_may_pull() with the amount of bytes you expect to be accessible
> via skb->head[] (doesn't do anything if its already accessible, or
> causes reallocation of skb->head if its too small, but will only 'pull'
> all pages, just whatever was requested).
> 
> To get access to a (small) header, you can also use
> skb_header_pointer(), which never reallocates skb->head.
> 
> If the requested access is already ok (because requested start offset +
> size is accessible via skb->head), it just returns a pointer to the
> offset in the linear area.
> 
> If not, it copies that requested size into the buffer that gets passed
> to the function, and returns a pointer to that buffer rather than to
> skb->head region.
> 
> In this case, we asked for sizeof(udphdr).
> If sizeof(tcphdr) (yes, tcphdr) would be ok/in linear area, everything
> is fine, the returned udp-header pointer can be re-used/treated like
> tcp.
> 
> But in the second case, the pointer returned could be the address of the
> _udphdr stack-buffer.
> It would be fine if only accessing those tcp fields that are <=
> sizeof(udphdr), but tcphdr->doff isn't within that region.
> 
> If you're interested in the gory details, look at skb_header_pointer and
> pskb_may_pull() implementations.

Thanks very much for the extensive description.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index c9bf74b94f37..565e1cafb36f 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1250,6 +1250,22 @@  enum nft_nat_attributes {
 };
 #define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
 
+/**
+ * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes
+ *
+ * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers)
+ * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers)
+ * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers)
+ */
+enum nft_tproxy_attributes {
+	NFTA_TPROXY_UNSPEC,
+	NFTA_TPROXY_FAMILY,
+	NFTA_TPROXY_REG_ADDR,
+	NFTA_TPROXY_REG_PORT,
+	__NFTA_TPROXY_MAX
+};
+#define NFTA_TPROXY_MAX		(__NFTA_TPROXY_MAX - 1)
+
 /**
  * enum nft_masq_attributes - nf_tables masquerade expression attributes
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8abcefb8b418..39eed9f32fc8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -631,6 +631,14 @@  config NFT_SOCKET
 	  This option allows matching for the presence or absence of a
 	  corresponding socket and its attributes.
 
+config NFT_TPROXY
+	tristate "Netfilter nf_tables tproxy support"
+	depends on IPV6 || IPV6=n
+	select NF_TPROXY_IPV4
+	select NF_TPROXY_IPV6 if NF_TABLES_IPV6
+	help
+	  This makes transparent proxy support available in nftables.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 44449389e527..2af51df46d71 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -104,6 +104,7 @@  obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NF_OSF)		+= nf_osf.o
 obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
+obj-$(CONFIG_NFT_TPROXY)	+= nft_tproxy.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
new file mode 100644
index 000000000000..bb756d633e1a
--- /dev/null
+++ b/net/netfilter/nft_tproxy.c
@@ -0,0 +1,297 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tproxy.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+#include <linux/if_ether.h>
+
+struct nft_tproxy {
+	enum nft_registers      sreg_addr:8;
+	enum nft_registers      sreg_port:8;
+	u8			family;
+};
+
+static void nft_tproxy_eval_v4(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	struct sock *sk = skb->sk;
+	const struct iphdr *iph = ip_hdr(skb);
+	struct udphdr _hdr, *hp;
+	__be32 taddr = 0;
+	__be16 tport = 0;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (!hp)
+		regs->verdict.code = NFT_BREAK;
+
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
+	sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, hp, iph->protocol,
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
+
+	if (priv->sreg_addr)
+		taddr = regs->data[priv->sreg_addr];
+	taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
+
+	if (priv->sreg_port) {
+		tport = regs->data[priv->sreg_port];
+	}
+	if (!tport)
+		tport = hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
+	else if (!sk)
+		/* no, there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, hp, iph->protocol,
+					   iph->saddr, taddr,
+					   hp->source, tport,
+					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+
+	if (sk && nf_tproxy_sk_is_transparent(sk)) {
+		nf_tproxy_assign_sock(skb, sk);
+	}
+}
+
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
+static void nft_tproxy_eval_v6(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	struct sock *sk = skb->sk;
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct udphdr _hdr, *hp;
+	struct in6_addr taddr = {0};
+	__be16 tport = 0;
+	int thoff = 0;
+	int l4proto;
+
+	l4proto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+	if (l4proto < 0) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
+	sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, hp, l4proto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED);
+
+	if (priv->sreg_addr)
+		memcpy (&taddr, &regs->data[priv->sreg_addr], sizeof(taddr));
+	taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
+
+	if (priv->sreg_port) {
+		tport = regs->data[priv->sreg_port];
+	}
+	if (!tport)
+		tport = hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT) {
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff,
+						 nft_net(pkt),
+						 &taddr,
+						 tport,
+						 sk);
+	}
+	else if (!sk)
+		/* no there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, hp,
+					   l4proto, &iph->saddr, &taddr,
+					   hp->source, tport,
+					   nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_sk_is_transparent(sk)) {
+		nf_tproxy_assign_sock(skb, sk);
+		return;
+	}
+
+	regs->verdict.code = NFT_BREAK;
+}
+#endif
+
+static void nft_tproxy_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		switch (priv->family) {
+		case NFPROTO_IPV4:
+		case NFPROTO_INET:
+			nft_tproxy_eval_v4(expr, regs, pkt);
+			break;
+		default:
+			regs->verdict.code = NFT_BREAK;
+			break;
+		}
+		break;
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
+	case NFPROTO_IPV6:
+		switch (priv->family) {
+		case NFPROTO_IPV6:
+		case NFPROTO_INET:
+			nft_tproxy_eval_v6(expr, regs, pkt);
+			break;
+		default:
+			regs->verdict.code = NFT_BREAK;
+			break;
+		}
+		break;
+#endif
+	}
+}
+
+static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
+	[NFTA_TPROXY_FAMILY]   = { .type = NLA_U32 },
+	[NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
+	[NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
+};
+
+static int nft_tproxy_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_tproxy *priv = nft_expr_priv(expr);
+	unsigned int alen = 0, plen = 0;
+	int err;
+
+	if (!tb[NFTA_TPROXY_FAMILY])
+		return -EINVAL;
+
+	switch(ctx->family) {
+	case NFPROTO_IPV4:
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
+	case NFPROTO_IPV6:
+#endif
+	case NFPROTO_INET:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
+	if ((priv->family == NFPROTO_IPV4 && ctx->family == NFPROTO_IPV6) ||
+	    (priv->family == NFPROTO_IPV6 && ctx->family == NFPROTO_IPV4))
+		return -EINVAL;
+
+	switch (priv->family) {
+	case NFPROTO_IPV4:
+		alen = FIELD_SIZEOF(union nf_inet_addr, in);
+		break;
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6)
+	case NFPROTO_IPV6:
+		alen = FIELD_SIZEOF(union nf_inet_addr, in6);
+		break;
+#endif
+	case NFPROTO_INET:
+		/* No address is specified here */
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (tb[NFTA_TPROXY_REG_ADDR]) {
+		priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
+		err = nft_validate_register_load(priv->sreg_addr, alen);
+		if (err < 0)
+			return err;
+	}
+
+	plen = sizeof(u16);
+	if (tb[NFTA_TPROXY_REG_PORT]) {
+		priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
+		err = nft_validate_register_load(priv->sreg_port, plen);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int nft_tproxy_dump(struct sk_buff *skb,
+			   const struct nft_expr *expr)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+	if(nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family)))
+		return -1;
+
+	if (priv->sreg_addr) {
+		if (nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr))
+			return -1;
+	}
+
+	if (priv->sreg_port) {
+		if (nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port))
+			return -1;
+	}
+
+	return 0;
+}
+
+static struct nft_expr_type nft_tproxy_type;
+static const struct nft_expr_ops nft_tproxy_ops = {
+	.type		= &nft_tproxy_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
+	.eval		= nft_tproxy_eval,
+	.init		= nft_tproxy_init,
+	.dump		= nft_tproxy_dump,
+};
+
+static struct nft_expr_type nft_tproxy_type __read_mostly = {
+	.name		= "tproxy",
+	.ops		= &nft_tproxy_ops,
+	.policy		= nft_tproxy_policy,
+	.maxattr	= NFTA_TPROXY_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_tproxy_module_init(void)
+{
+	return nft_register_expr(&nft_tproxy_type);
+}
+
+static void __exit nft_tproxy_module_exit(void)
+{
+	nft_unregister_expr(&nft_tproxy_type);
+}
+
+module_init(nft_tproxy_module_init);
+module_exit(nft_tproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables tproxy support module");
+MODULE_ALIAS_NFT_EXPR("tproxy");