diff mbox

[v3,1/1] netfilter: Add fail-open support.

Message ID 20120522121048.880.22605.sendpatchset@localhost.localdomain
State Superseded
Headers show

Commit Message

Krishna Kumar May 22, 2012, 12:10 p.m. UTC
Implement a new "fail-open" mode where packets are not dropped
upon a queue-full condition. This mode can be individually enabled
or disabled per queue using the netlink NFQA_CFG_FLAGS and
NFQA_CFG_MASK attributes.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: Vivek Kashyap <vivk@us.ibm.com>
Signed-off-by: Sridhar Samudrala <samudrala@us.ibm.com>
---
 include/linux/netfilter/nfnetlink_queue.h |    5 ++
 net/netfilter/core.c                      |   37 +++++++++++++++++++-
 net/netfilter/nf_queue.c                  |   15 ++++++--
 net/netfilter/nfnetlink_queue.c           |   37 ++++++++++++++++++--
 4 files changed, 87 insertions(+), 7 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Florian Westphal May 22, 2012, 2:38 p.m. UTC | #1
Krishna Kumar <krkumar2@in.ibm.com> wrote:
> Implement a new "fail-open" mode where packets are not dropped
> upon queue-full condition. This mode can be individually enabled
> or disabled per queue using netlink NFAQ_CFG_FLAGS & NFAQ_CFG_MASK
> attributes.
> 
>  	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
> +	NFQA_CFG_MASK,			/* identify which flags to change */
> +	NFQA_CFG_FLAGS,			/* value of these flags (__u32) */

__be32?
I see that QUEUE_MAXLEN gets ntohl treatment, too....

>  	__NFQA_CFG_MAX
>  };
>  #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
>  
> +/* Flags for NFQA_CFG_FLAGS */
> +#define NFQA_CFG_F_FAIL_OPEN			(1 << 0)
> +
>  #endif /* _NFNETLINK_QUEUE_H */
> diff -ruNp org/net/netfilter/core.c new/net/netfilter/core.c
> --- org/net/netfilter/core.c	2012-05-22 08:45:32.651608253 +0530
> +++ new/net/netfilter/core.c	2012-05-22 17:35:51.294216873 +0530
> @@ -163,6 +163,31 @@ repeat:
>  	return NF_ACCEPT;
>  }
>  
> +/*
> + * Handler was not able to enqueue the packet, and returned ENOSPC
> + * since "fail-open" was enabled. We temporarily accept the skb, or
> + * each segment for a segmented skb, and then free up the header.
> + */
> +static void handle_fail_open(struct sk_buff *skb,
> +			     int (*okfn)(struct sk_buff *))
> +{
> +	struct sk_buff *segs;
> +	bool gso;
> +
> +	segs = skb->next ? : skb;
> +	gso = skb->next != NULL;
> +
> +	do {
> +		struct sk_buff *nskb = segs->next;
> +
> +		segs->next = NULL;
> +		okfn(segs);
> +		segs = nskb;
> +	} while (segs);
> +
> +	if (gso)
> +		kfree_skb(skb);
> +}

I don't understand why this is needed at all.
Conceptually, what you're doing is identical to the
--nfqueue-bypass feature, so it should be enough to change

> @@ -199,10 +226,18 @@ next_hook:
>  			if (err == -ESRCH &&
>  			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
>  				goto next_hook;
> +			if (err == -ENOSPC) {
> +				failopen = 1;
> +				goto next_hook;
> +			}
>  			kfree_skb(skb);

to

 if (err == -ENOSPC || (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)))
  	goto next_hook;

[ or, take advantage of the existing -ECANCELED and have the queueing
  backend return that if queue is full and fail-open is enabled ]

Yes, that means that if the userspace ruleset is
-j NFQUEUE
-j DROP

then your packets will be dropped even if the userspace application
enables failopen.

But that's a feature, since you could also do
-j NFQUEUE
-m limit ... -j LOG --log-prefix "queue overflow"

or play extra games wrt. "drop if established, CONNMARK
for future bypass if --ctstate NEW", etc.

If it's a requirement for you that userspace can force ACCEPT
regardless of the ruleset, then perhaps it might be better to
add a separate "default verdict" option and invoke
nf_reinject() directly from the queueing backend instead
of passing the fail-open information back to nf_hook_slow?

> +		flags = nla_data(nfqa[NFQA_CFG_FLAGS]);
> +		mask = nla_data(nfqa[NFQA_CFG_MASK]);

nla_get_be32()?
[ _u32 would make more sense to me, but other attributes are be32 too,
  so I'm ok with it ]

> -		if (err == 0)
> +
> +		if (err == 0) {
>  			queued++;
> -		else
> +		} else if (err == -ENOSPC) {
> +			/* Queue failed due to queue-full and handler is
> +			 * in "fail-open" mode.
> +			 */
> +			segs->next = nskb;
> +			skb->next = segs;
> +			break;
> +		} else {
>  			kfree_skb(segs);
> +		}
>  		segs = nskb;
>  	} while (segs);
>  
> -	if (queued) {
> +	if (queued && err != -ENOSPC) {
>  		kfree_skb(skb);
>  		return 0;
>  	}

Similarly, this shouldn't be needed any more either, since you
no longer need to check for -ENOSPC (the existing --queue-bypass
behaviour should handle your case, too).
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar May 23, 2012, 6:45 a.m. UTC | #2
Florian Westphal <fw@strlen.de> wrote on 05/22/2012 08:08:58 PM:

Thanks for your review.

> >
> >     NFQA_CFG_QUEUE_MAXLEN,      /* __u32 */
> > +   NFQA_CFG_MASK,         /* identify which flags to change */
> > +   NFQA_CFG_FLAGS,         /* value of these flags (__u32) */
>
> __be32?

Yes, I will change this.

> I see that QUEUE_MAXLEN gets ntohl treatment, too....
>
> > +static void handle_fail_open(struct sk_buff *skb,
> > +              int (*okfn)(struct sk_buff *))
> > +{
> > +   struct sk_buff *segs;
> > +   bool gso;
> > +
> > +   segs = skb->next ? : skb;
> > +   gso = skb->next != NULL;
> > +
> > +   do {
> > +      struct sk_buff *nskb = segs->next;
> > +
> > +      segs->next = NULL;
> > +      okfn(segs);
> > +      segs = nskb;
> > +   } while (segs);
> > +
> > +   if (gso)
> > +      kfree_skb(skb);
> > +}
>
> I don't understand why this is needed at all.
> Conceptually, what you're doing is identical to the
> --nfqueue-bypass feature, so it should be enough to change

Yes, I could use the same code. However, I am not clear about the
ESRCH path, since it doesn't seem to handle GSO skbs correctly. See
nf_queue() below, where it calls kfree_skb() even for -ESRCH:

	if (err == 0) {
		nf_bridge_adjust_segmented_data(segs);
		err = __nf_queue(segs, elem, pf, hook, indev,
					outdev, okfn, queuenum);
	}
	if (err == 0)
		queued++;
	else
		kfree_skb(segs);

Maybe this needs a check for ESRCH and return back to hook_slow
to work correctly? If this is the case, I can submit a patch to
fix this, and piggy-back ESRCH for fail-open too.

> > @@ -199,10 +226,18 @@ next_hook:
> >           if (err == -ESRCH &&
> >              (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
> >              goto next_hook;
> > +         if (err == -ENOSPC) {
> > +            failopen = 1;
> > +            goto next_hook;
> > +         }
> >           kfree_skb(skb);
>
> to
>
>  if (err == -ENOSPC || (err == -ESRCH && (verdict &
> NF_VERDICT_FLAG_QUEUE_BYPASS))
>      goto next_hook;

This would work if the skb was not GSO. So maybe the above
fix for ESRCH will let me share the code for fail-open too?

> [ or, take advantage of the existing -ECANCELED and have the queueing
>   backend return that if queue is full and fail-open is enabled ]
>
> Yes, that means that if the userspace ruleset is
> -j NFQUEUE
> -j DROP
>
> then your packets will be dropped even if the userspace application
> enables failopen.
>
> But thats a feature, since you could also do
> -j NFQUEUE
> -m limt ... -j LOG --log-prefix "queue overflow"
>
> or play extra games wrt. "drop if established, CONNMARK
> for future bypass if --ctstate NEW", etc.
>
> If its a requirment for you that userspace can force ACCEPT
> regardless of ruleset, then perhaps it might be better to
> add a separate "default verdict" option and invoke
> nf_reinject() directly from the qeueueing backend instead
> of passing the fail-open information back to nf_hook_slow?
>
> > +      flags = nla_data(nfqa[NFQA_CFG_FLAGS]);
> > +      mask = nla_data(nfqa[NFQA_CFG_MASK]);
>
> nla_get_be32()?
> [ _u32 would make more sense to me, but other attributes are be32 too,
>   so I'm ok with it ]

Yes, I will change to be32.

> > -      if (err == 0)
> > +
> > +      if (err == 0) {
> >           queued++;
> > -      else
> > +      } else if (err == -ENOSPC) {
> > +         /* Queue failed due to queue-full and handler is
> > +          * in "fail-open" mode.
> > +          */
> > +         segs->next = nskb;
> > +         skb->next = segs;
> > +         break;
> > +      } else {
> >           kfree_skb(segs);
> > +      }
> >        segs = nskb;
> >     } while (segs);
> >
> > -   if (queued) {
> > +   if (queued && err != -ENOSPC) {
> >        kfree_skb(skb);
> >        return 0;
> >     }
>
> Similarily, this shouldn't be needed either any more since you
> no longer need to check for -ENOSPC (existing --queue-bypass behaviour
> should handle your case, too).

A slight change could be needed here, since queued is 0 for the
--queue-bypass case (and hence it returns -ESRCH), while it could be >0
for fail-open, where we still want to return an error as some segments
were not queued.

Thanks,
- KK

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal May 23, 2012, 7:54 a.m. UTC | #3
Krishna Kumar2 <krkumar2@in.ibm.com> wrote:
> Florian Westphal <fw@strlen.de> wrote on 05/22/2012 08:08:58 PM:
> > I don't understand why this is needed at all.
> > Conceptually, what you're doing is identical to the
> > --nfqueue-bypass feature, so it should be enough to change
> 
> Yes, I could use the same code. However I am not clear about the
> ESRCH path since it doesn't seem to handle GSO skb correctly? See
> nf_queue below where it calls kfree_skb even for -ESRCH:
> 
> 	if (err == 0) {
> 		nf_bridge_adjust_segmented_data(segs);
> 		err = __nf_queue(segs, elem, pf, hook, indev,
> 					outdev, okfn, queuenum);
> 	}
> 	if (err == 0)
> 		queued++;
> 	else
> 		kfree_skb(segs);
> 
> Maybe this needs a check for ESRCH and return back to hook_slow
> to work correctly? If this is the case, I can submit a patch to
> fix this, and piggy-back ESRCH for fail-open too.

The idea for queue-bypass was to free the original (gso) skb
if we were able to queue at least one packet, i.e. the original
skb only continues traversal if queue bypassing is enabled
and no single segment could be queued.

If it is a requirement for you that any remaining segments
that could not be queued continue traversal, then yes,
the existing code won't work for you.

I don't think it's necessary to piggyback ESRCH too, since
userspace should not bind/unbind the queue continuously
(i.e., a -ESRCH after some segments have been queued should
 be a rare condition).

However, if the code sharing is not too much of a burden
then it would be good to do it anyway.

> > > @@ -199,10 +226,18 @@ next_hook:
> > >           if (err == -ESRCH &&
> > >              (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
> > >              goto next_hook;
> > > +         if (err == -ENOSPC) {
> > > +            failopen = 1;
> > > +            goto next_hook;
> > > +         }
> > >           kfree_skb(skb);
> >
> > to
> >
> >  if (err == -ENOSPC || (err == -ESRCH && (verdict &
> > NF_VERDICT_FLAG_QUEUE_BYPASS))
> >      goto next_hook;
> 
> This would work if the skb was not GSO. So maybe the above
> fix for ESRCH will let me share the code for fail-open too?

See above; it works for GSO too if we could not queue a single segment.
I think that iff you need to handle the "some segments queued" case
and put that in here then handling -ESRCH with it too would be good.

> > [ or, take advantage of the existing -ECANCELED and have the queueing
> >   backend return that if queue is full and fail-open is enabled ]
> >
> > Yes, that means that if the userspace ruleset is
> > -j NFQUEUE
> > -j DROP
> >
> > then your packets will be dropped even if the userspace application
> > enables failopen.
> >
> > But thats a feature, since you could also do
> > -j NFQUEUE
> > -m limt ... -j LOG --log-prefix "queue overflow"
> >
> > or play extra games wrt. "drop if established, CONNMARK
> > for future bypass if --ctstate NEW", etc.
> >
> > If its a requirment for you that userspace can force ACCEPT
> > regardless of ruleset, then perhaps it might be better to
> > add a separate "default verdict" option and invoke
> > nf_reinject() directly from the qeueueing backend instead
> > of passing the fail-open information back to nf_hook_slow?
> >
> > > +      flags = nla_data(nfqa[NFQA_CFG_FLAGS]);
> > > +      mask = nla_data(nfqa[NFQA_CFG_MASK]);
> >
> > nla_get_be32()?
> > [ _u32 would make more sense to me, but other attributes are be32 too,
> >   so I'm ok with it ]
> 
> Yes, I will change to be32.
> 
> > > -      if (err == 0)
> > > +
> > > +      if (err == 0) {
> > >           queued++;
> > > -      else
> > > +      } else if (err == -ENOSPC) {
> > > +         /* Queue failed due to queue-full and handler is
> > > +          * in "fail-open" mode.
> > > +          */
> > > +         segs->next = nskb;
> > > +         skb->next = segs;
> > > +         break;
> > > +      } else {
> > >           kfree_skb(segs);
> > > +      }
> > >        segs = nskb;
> > >     } while (segs);
> > >
> > > -   if (queued) {
> > > +   if (queued && err != -ENOSPC) {
> > >        kfree_skb(skb);
> > >        return 0;
> > >     }
> >
> > Similarily, this shouldn't be needed either any more since you
> > no longer need to check for -ENOSPC (existing --queue-bypass behaviour
> > should handle your case, too).
> 
> Slight change could be needed here, since queued is 0 for --queue-bypass
> case (and hence return -ESRCH), while it could be >0 for fail-open,
> where we still want to return error as some segments were not queued.

Right, queued will almost certainly be 0 if the userspace listener is
not there anymore.

I wonder if it would be possible to just nf_reinject() directly
from nfqnl_enqueue_packet().

[ This means we lose the ability to add rules after NFQUEUE to detect
  the overflow case, but it would avoid adding code to nf_hook_slow et
  al. ]

Let's wait and see what others think.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Krishna Kumar May 23, 2012, 2:11 p.m. UTC | #4
Florian Westphal <fw@strlen.de> wrote on 05/23/2012 01:24:56 PM:

>
> > Maybe this needs a check for ESRCH and return back to hook_slow
> > to work correctly? If this is the case, I can submit a patch to
> > fix this, and piggy-back ESRCH for fail-open too.
>
> The idea for queue-bypass was to free the original (gso) skb
> if we were able to queue at least one packet, i.e. the original
> skb only continues traversal if queue bypassing is enabled
> and no single segment could be queued.
>
> If it is a requirement for you that any remaining segments
> that could not be queued continue traversal, then yes,
> the existing code won't work for you.

Yes, all segments need to be processed for this option. I will
check if it is possible to do any code-sharing, and post the
patches tomorrow with your other feedback incorporated.

Thanks for your review,
- KK

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff -ruNp org/include/linux/netfilter/nfnetlink_queue.h new/include/linux/netfilter/nfnetlink_queue.h
--- org/include/linux/netfilter/nfnetlink_queue.h	2012-05-22 08:45:32.648606721 +0530
+++ new/include/linux/netfilter/nfnetlink_queue.h	2012-05-22 16:53:05.035405303 +0530
@@ -84,8 +84,13 @@  enum nfqnl_attr_config {
 	NFQA_CFG_CMD,			/* nfqnl_msg_config_cmd */
 	NFQA_CFG_PARAMS,		/* nfqnl_msg_config_params */
 	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
+	NFQA_CFG_MASK,			/* identify which flags to change */
+	NFQA_CFG_FLAGS,			/* value of these flags (__u32) */
 	__NFQA_CFG_MAX
 };
 #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
 
+/* Flags for NFQA_CFG_FLAGS */
+#define NFQA_CFG_F_FAIL_OPEN			(1 << 0)
+
 #endif /* _NFNETLINK_QUEUE_H */
diff -ruNp org/net/netfilter/core.c new/net/netfilter/core.c
--- org/net/netfilter/core.c	2012-05-22 08:45:32.651608253 +0530
+++ new/net/netfilter/core.c	2012-05-22 17:35:51.294216873 +0530
@@ -163,6 +163,31 @@  repeat:
 	return NF_ACCEPT;
 }
 
+/*
+ * Handler was not able to enqueue the packet, and returned ENOSPC
+ * since "fail-open" was enabled. We temporarily accept the skb, or
+ * each segment for a segmented skb, and then free up the header.
+ */
+static void handle_fail_open(struct sk_buff *skb,
+			     int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *segs;
+	bool gso;
+
+	segs = skb->next ? : skb;
+	gso = skb->next != NULL;
+
+	do {
+		struct sk_buff *nskb = segs->next;
+
+		segs->next = NULL;
+		okfn(segs);
+		segs = nskb;
+	} while (segs);
+
+	if (gso)
+		kfree_skb(skb);
+}
 
 /* Returns 1 if okfn() needs to be executed by the caller,
  * -EPERM for NF_DROP, 0 otherwise. */
@@ -174,6 +199,7 @@  int nf_hook_slow(u_int8_t pf, unsigned i
 {
 	struct list_head *elem;
 	unsigned int verdict;
+	int failopen = 0;
 	int ret = 0;
 
 	/* We may already have this, but read-locks nest anyway */
@@ -184,7 +210,8 @@  next_hook:
 	verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
 			     outdev, &elem, okfn, hook_thresh);
 	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
-		ret = 1;
+		if (!failopen) /* don't use the default verdict if 'failopen' */
+			ret = 1;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
 		kfree_skb(skb);
 		ret = NF_DROP_GETERR(verdict);
@@ -199,10 +226,18 @@  next_hook:
 			if (err == -ESRCH &&
 			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
 				goto next_hook;
+			if (err == -ENOSPC) {
+				failopen = 1;
+				goto next_hook;
+			}
 			kfree_skb(skb);
 		}
 	}
 	rcu_read_unlock();
+
+	if (!ret && failopen)
+		handle_fail_open(skb, okfn);
+
 	return ret;
 }
 EXPORT_SYMBOL(nf_hook_slow);
diff -ruNp org/net/netfilter/nfnetlink_queue.c new/net/netfilter/nfnetlink_queue.c
--- org/net/netfilter/nfnetlink_queue.c	2012-05-22 08:45:32.652606825 +0530
+++ new/net/netfilter/nfnetlink_queue.c	2012-05-22 16:51:21.876842922 +0530
@@ -52,6 +52,7 @@  struct nfqnl_instance {
 
 	u_int16_t queue_num;			/* number of this queue */
 	u_int8_t copy_mode;
+	u_int32_t flags;			/* Set using NFQA_CFG_FLAGS */
 /*
  * Following fields are dirtied for each queued packet,
  * keep them in same cache line if possible.
@@ -431,9 +432,14 @@  nfqnl_enqueue_packet(struct nf_queue_ent
 		goto err_out_free_nskb;
 	}
 	if (queue->queue_total >= queue->queue_maxlen) {
-		queue->queue_dropped++;
-		net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
-				     queue->queue_total);
+		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+			/* Accept the packet temporarily skipping rules */
+			err = -ENOSPC;
+		} else {
+			queue->queue_dropped++;
+			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
+					     queue->queue_total);
+		}
 		goto err_out_free_nskb;
 	}
 	entry->id = ++queue->id_sequence;
@@ -858,6 +864,31 @@  nfqnl_recv_config(struct sock *ctnl, str
 		spin_unlock_bh(&queue->lock);
 	}
 
+	if (nfqa[NFQA_CFG_FLAGS]) {
+		u_int32_t *flags, *mask;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+
+		if (!nfqa[NFQA_CFG_MASK]) {
+			/* A mask is needed to tell which flags are being
+			 * changed.
+			 * */
+			ret = -EINVAL;
+			goto err_out_unlock;
+		}
+
+		flags = nla_data(nfqa[NFQA_CFG_FLAGS]);
+		mask = nla_data(nfqa[NFQA_CFG_MASK]);
+
+		spin_lock_bh(&queue->lock);
+		queue->flags &= ~ntohl(*mask);
+		queue->flags |= ntohl(*flags) & ntohl(*mask);
+		spin_unlock_bh(&queue->lock);
+	}
+
 err_out_unlock:
 	rcu_read_unlock();
 	return ret;
diff -ruNp org/net/netfilter/nf_queue.c new/net/netfilter/nf_queue.c
--- org/net/netfilter/nf_queue.c	2012-05-22 08:45:32.649606572 +0530
+++ new/net/netfilter/nf_queue.c	2012-05-22 14:21:19.578299181 +0530
@@ -268,14 +268,23 @@  int nf_queue(struct sk_buff *skb,
 			err = __nf_queue(segs, elem, pf, hook, indev,
 					   outdev, okfn, queuenum);
 		}
-		if (err == 0)
+
+		if (err == 0) {
 			queued++;
-		else
+		} else if (err == -ENOSPC) {
+			/* Queue failed due to queue-full and handler is
+			 * in "fail-open" mode.
+			 */
+			segs->next = nskb;
+			skb->next = segs;
+			break;
+		} else {
 			kfree_skb(segs);
+		}
 		segs = nskb;
 	} while (segs);
 
-	if (queued) {
+	if (queued && err != -ENOSPC) {
 		kfree_skb(skb);
 		return 0;
 	}