Message ID | 20120522121048.880.22605.sendpatchset@localhost.localdomain |
---|---|
State | Superseded |
Headers | show |
Krishna Kumar <krkumar2@in.ibm.com> wrote: > Implement a new "fail-open" mode where packets are not dropped > upon queue-full condition. This mode can be individually enabled > or disabled per queue using netlink NFAQ_CFG_FLAGS & NFAQ_CFG_MASK > attributes. > > NFQA_CFG_QUEUE_MAXLEN, /* __u32 */ > + NFQA_CFG_MASK, /* identify which flags to change */ > + NFQA_CFG_FLAGS, /* value of these flags (__u32) */ __be32? I see that QUEUE_MAXLEN gets ntohl treatment, too.... > __NFQA_CFG_MAX > }; > #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) > > +/* Flags for NFQA_CFG_FLAGS */ > +#define NFQA_CFG_F_FAIL_OPEN (1 << 0) > + > #endif /* _NFNETLINK_QUEUE_H */ > diff -ruNp org/net/netfilter/core.c new/net/netfilter/core.c > --- org/net/netfilter/core.c 2012-05-22 08:45:32.651608253 +0530 > +++ new/net/netfilter/core.c 2012-05-22 17:35:51.294216873 +0530 > @@ -163,6 +163,31 @@ repeat: > return NF_ACCEPT; > } > > +/* > + * Handler was not able to enqueue the packet, and returned ENOSPC > + * since "fail-open" was enabled. We temporarily accept the skb, or > + * each segment for a segmented skb, and then free up the header. > + */ > +static void handle_fail_open(struct sk_buff *skb, > + int (*okfn)(struct sk_buff *)) > +{ > + struct sk_buff *segs; > + bool gso; > + > + segs = skb->next ? : skb; > + gso = skb->next != NULL; > + > + do { > + struct sk_buff *nskb = segs->next; > + > + segs->next = NULL; > + okfn(segs); > + segs = nskb; > + } while (segs); > + > + if (gso) > + kfree_skb(skb); > +} I don't understand why this is needed at all. Conceptually, what you're doing is identical to the --nfqueue-bypass feature, so it should be enough to change > @@ -199,10 +226,18 @@ next_hook: > if (err == -ESRCH && > (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) > goto next_hook; > + if (err == -ENOSPC) { > + failopen = 1; > + goto next_hook; > + } > kfree_skb(skb); to if (err == -ENOSPC || (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; [ or, take advantage of the existing -ECANCELED and have the queueing backend return that if queue is full and fail-open is enabled ] Yes, that means that if the userspace ruleset is -j NFQUEUE -j DROP then your packets will be dropped even if the userspace application enables failopen. But thats a feature, since you could also do -j NFQUEUE -m limt ... -j LOG --log-prefix "queue overflow" or play extra games wrt. "drop if established, CONNMARK for future bypass if --ctstate NEW", etc. If its a requirment for you that userspace can force ACCEPT regardless of ruleset, then perhaps it might be better to add a separate "default verdict" option and invoke nf_reinject() directly from the qeueueing backend instead of passing the fail-open information back to nf_hook_slow? > + flags = nla_data(nfqa[NFQA_CFG_FLAGS]); > + mask = nla_data(nfqa[NFQA_CFG_MASK]); nla_get_be32()? [ _u32 would make more sense to me, but other attributes are be32 too, so I'm ok with it ] > - if (err == 0) > + > + if (err == 0) { > queued++; > - else > + } else if (err == -ENOSPC) { > + /* Queue failed due to queue-full and handler is > + * in "fail-open" mode. > + */ > + segs->next = nskb; > + skb->next = segs; > + break; > + } else { > kfree_skb(segs); > + } > segs = nskb; > } while (segs); > > - if (queued) { > + if (queued && err != -ENOSPC) { > kfree_skb(skb); > return 0; > } Similarily, this shouldn't be needed either any more since you no longer need to check for -ENOSPC (existing --queue-bypass behaviour should handle your case, too). -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Florian Westphal <fw@strlen.de> wrote on 05/22/2012 08:08:58 PM: Thanks for your review. > > > > NFQA_CFG_QUEUE_MAXLEN, /* __u32 */ > > + NFQA_CFG_MASK, /* identify which flags to change */ > > + NFQA_CFG_FLAGS, /* value of these flags (__u32) */ > > __be32? Yes, I will change this. > I see that QUEUE_MAXLEN gets ntohl treatment, too.... > > > +static void handle_fail_open(struct sk_buff *skb, > > + int (*okfn)(struct sk_buff *)) > > +{ > > + struct sk_buff *segs; > > + bool gso; > > + > > + segs = skb->next ? : skb; > > + gso = skb->next != NULL; > > + > > + do { > > + struct sk_buff *nskb = segs->next; > > + > > + segs->next = NULL; > > + okfn(segs); > > + segs = nskb; > > + } while (segs); > > + > > + if (gso) > > + kfree_skb(skb); > > +} > > I don't understand why this is needed at all. > Conceptually, what you're doing is identical to the > --nfqueue-bypass feature, so it should be enough to change Yes, I could use the same code. However I am not clear about the ESRCH path since it doesn't seem to handle GSO skb correctly? See nf_queue below where it calls kfree_skb even for -ESRCH: if (err == 0) { nf_bridge_adjust_segmented_data(segs); err = __nf_queue(segs, elem, pf, hook, indev, outdev, okfn, queuenum); } if (err == 0) queued++; else kfree_skb(segs); Maybe this needs a check for ESRCH and return back to hook_slow to work correctly? If this is the case, I can submit a patch to fix this, and piggy-back ESRCH for fail-open too. > > @@ -199,10 +226,18 @@ next_hook: > > if (err == -ESRCH && > > (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) > > goto next_hook; > > + if (err == -ENOSPC) { > > + failopen = 1; > > + goto next_hook; > > + } > > kfree_skb(skb); > > to > > if (err == -ENOSPC || (err == -ESRCH && (verdict & > NF_VERDICT_FLAG_QUEUE_BYPASS)) > goto next_hook; This would work if the skb was not GSO. So maybe the above fix for ESRCH will let me share the code for fail-open too? > [ or, take advantage of the existing -ECANCELED and have the queueing > backend return that if queue is full and fail-open is enabled ] > > Yes, that means that if the userspace ruleset is > -j NFQUEUE > -j DROP > > then your packets will be dropped even if the userspace application > enables failopen. > > But thats a feature, since you could also do > -j NFQUEUE > -m limt ... -j LOG --log-prefix "queue overflow" > > or play extra games wrt. "drop if established, CONNMARK > for future bypass if --ctstate NEW", etc. > > If its a requirment for you that userspace can force ACCEPT > regardless of ruleset, then perhaps it might be better to > add a separate "default verdict" option and invoke > nf_reinject() directly from the qeueueing backend instead > of passing the fail-open information back to nf_hook_slow? > > > + flags = nla_data(nfqa[NFQA_CFG_FLAGS]); > > + mask = nla_data(nfqa[NFQA_CFG_MASK]); > > nla_get_be32()? > [ _u32 would make more sense to me, but other attributes are be32 too, > so I'm ok with it ] Yes, I will change to be32. > > - if (err == 0) > > + > > + if (err == 0) { > > queued++; > > - else > > + } else if (err == -ENOSPC) { > > + /* Queue failed due to queue-full and handler is > > + * in "fail-open" mode. > > + */ > > + segs->next = nskb; > > + skb->next = segs; > > + break; > > + } else { > > kfree_skb(segs); > > + } > > segs = nskb; > > } while (segs); > > > > - if (queued) { > > + if (queued && err != -ENOSPC) { > > kfree_skb(skb); > > return 0; > > } > > Similarily, this shouldn't be needed either any more since you > no longer need to check for -ENOSPC (existing --queue-bypass behaviour > should handle your case, too). Slight change could be needed here, since queued is 0 for --queue-bypass case (and hence return -ESRCH), while it could be >0 for fail-open, where we still want to return error as some segments were not queued. Thanks, - KK -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Krishna Kumar2 <krkumar2@in.ibm.com> wrote: > Florian Westphal <fw@strlen.de> wrote on 05/22/2012 08:08:58 PM: > > I don't understand why this is needed at all. > > Conceptually, what you're doing is identical to the > > --nfqueue-bypass feature, so it should be enough to change > > Yes, I could use the same code. However I am not clear about the > ESRCH path since it doesn't seem to handle GSO skb correctly? See > nf_queue below where it calls kfree_skb even for -ESRCH: > > if (err == 0) { > nf_bridge_adjust_segmented_data(segs); > err = __nf_queue(segs, elem, pf, hook, indev, > outdev, okfn, queuenum); > } > if (err == 0) > queued++; > else > kfree_skb(segs); > > Maybe this needs a check for ESRCH and return back to hook_slow > to work correctly? If this is the case, I can submit a patch to > fix this, and piggy-back ESRCH for fail-open too. The idea for queue-bypass was to free the original (gso) skb if we were able to queue at least one packet, i.e. the original skb only continues traversal if queue bypassing is enabled and no single segment could be queued. If it is a requirement for you that any remaining segments that could not be queued continue traversal, then yes, the existing code won't work for you. I don't think its necessary to piggyback ESRCH too; since userspace should not bind/unbind the queue continuously (i.e., a -ESRCH after some segments have been queued should be a rare condition). However, if the code sharing is not too much of a burden then it would be good to do it anyway. > > > @@ -199,10 +226,18 @@ next_hook: > > > if (err == -ESRCH && > > > (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) > > > goto next_hook; > > > + if (err == -ENOSPC) { > > > + failopen = 1; > > > + goto next_hook; > > > + } > > > kfree_skb(skb); > > > > to > > > > if (err == -ENOSPC || (err == -ESRCH && (verdict & > > NF_VERDICT_FLAG_QUEUE_BYPASS)) > > goto next_hook; > > This would work if the skb was not GSO. So maybe the above > fix for ESRCH will let me share the code for fail-open too? See above; it works for GSO too if we could not queue a single segment. I think that iff you need to handle the "some segments queued" case and put that in here then handling -ESRCH with it too would be good. > > [ or, take advantage of the existing -ECANCELED and have the queueing > > backend return that if queue is full and fail-open is enabled ] > > > > Yes, that means that if the userspace ruleset is > > -j NFQUEUE > > -j DROP > > > > then your packets will be dropped even if the userspace application > > enables failopen. > > > > But thats a feature, since you could also do > > -j NFQUEUE > > -m limt ... -j LOG --log-prefix "queue overflow" > > > > or play extra games wrt. "drop if established, CONNMARK > > for future bypass if --ctstate NEW", etc. > > > > If its a requirment for you that userspace can force ACCEPT > > regardless of ruleset, then perhaps it might be better to > > add a separate "default verdict" option and invoke > > nf_reinject() directly from the qeueueing backend instead > > of passing the fail-open information back to nf_hook_slow? > > > > > + flags = nla_data(nfqa[NFQA_CFG_FLAGS]); > > > + mask = nla_data(nfqa[NFQA_CFG_MASK]); > > > > nla_get_be32()? > > [ _u32 would make more sense to me, but other attributes are be32 too, > > so I'm ok with it ] > > Yes, I will change to be32. > > > > - if (err == 0) > > > + > > > + if (err == 0) { > > > queued++; > > > - else > > > + } else if (err == -ENOSPC) { > > > + /* Queue failed due to queue-full and handler is > > > + * in "fail-open" mode. > > > + */ > > > + segs->next = nskb; > > > + skb->next = segs; > > > + break; > > > + } else { > > > kfree_skb(segs); > > > + } > > > segs = nskb; > > > } while (segs); > > > > > > - if (queued) { > > > + if (queued && err != -ENOSPC) { > > > kfree_skb(skb); > > > return 0; > > > } > > > > Similarily, this shouldn't be needed either any more since you > > no longer need to check for -ENOSPC (existing --queue-bypass behaviour > > should handle your case, too). > > Slight change could be needed here, since queued is 0 for --queue-bypass > case (and hence return -ESRCH), while it could be >0 for fail-open, > where we still want to return error as some segments were not queued. Right, queued will almost certainly be 0 if the userspace listener is not there anymore. I wonder if it would be possible to just nf_reinject() directly from nfqnl_enqueue_packet(). [ This means we loose the ability to add rules after NFQUEUE to detect the overflow case, but it would avoid adding code to nf_hook_slow et al. ] Lets wait what others think. -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Florian Westphal <fw@strlen.de> wrote on 05/23/2012 01:24:56 PM: > > > Maybe this needs a check for ESRCH and return back to hook_slow > > to work correctly? If this is the case, I can submit a patch to > > fix this, and piggy-back ESRCH for fail-open too. > > The idea for queue-bypass was to free the original (gso) skb > if we were able to queue at least one packet, i.e. the original > skb only continues traversal if queue bypassing is enabled > and no single segment could be queued. > > If it is a requirement for you that any remaining segments > that could not be queued continue traversal, then yes, > the existing code won't work for you. Yes, all segments need to be processed for this option. I will check if it is possible to do any code-sharing, and post the patches tomorrow with your other feedback incorporated. Thanks for your review, - KK -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff -ruNp org/include/linux/netfilter/nfnetlink_queue.h new/include/linux/netfilter/nfnetlink_queue.h --- org/include/linux/netfilter/nfnetlink_queue.h 2012-05-22 08:45:32.648606721 +0530 +++ new/include/linux/netfilter/nfnetlink_queue.h 2012-05-22 16:53:05.035405303 +0530 @@ -84,8 +84,13 @@ enum nfqnl_attr_config { NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ NFQA_CFG_QUEUE_MAXLEN, /* __u32 */ + NFQA_CFG_MASK, /* identify which flags to change */ + NFQA_CFG_FLAGS, /* value of these flags (__u32) */ __NFQA_CFG_MAX }; #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) +/* Flags for NFQA_CFG_FLAGS */ +#define NFQA_CFG_F_FAIL_OPEN (1 << 0) + #endif /* _NFNETLINK_QUEUE_H */ diff -ruNp org/net/netfilter/core.c new/net/netfilter/core.c --- org/net/netfilter/core.c 2012-05-22 08:45:32.651608253 +0530 +++ new/net/netfilter/core.c 2012-05-22 17:35:51.294216873 +0530 @@ -163,6 +163,31 @@ repeat: return NF_ACCEPT; } +/* + * Handler was not able to enqueue the packet, and returned ENOSPC + * since "fail-open" was enabled. We temporarily accept the skb, or + * each segment for a segmented skb, and then free up the header. + */ +static void handle_fail_open(struct sk_buff *skb, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *segs; + bool gso; + + segs = skb->next ? : skb; + gso = skb->next != NULL; + + do { + struct sk_buff *nskb = segs->next; + + segs->next = NULL; + okfn(segs); + segs = nskb; + } while (segs); + + if (gso) + kfree_skb(skb); +} /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */ @@ -174,6 +199,7 @@ int nf_hook_slow(u_int8_t pf, unsigned i { struct list_head *elem; unsigned int verdict; + int failopen = 0; int ret = 0; /* We may already have this, but read-locks nest anyway */ @@ -184,7 +210,8 @@ next_hook: verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, outdev, &elem, okfn, hook_thresh); if (verdict == NF_ACCEPT || verdict == NF_STOP) { - ret = 1; + if (!failopen) /* don't use the default verdict if 'failopen' */ + ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { kfree_skb(skb); ret = NF_DROP_GETERR(verdict); @@ -199,10 +226,18 @@ next_hook: if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; + if (err == -ENOSPC) { + failopen = 1; + goto next_hook; + } kfree_skb(skb); } } rcu_read_unlock(); + + if (!ret && failopen) + handle_fail_open(skb, okfn); + return ret; } EXPORT_SYMBOL(nf_hook_slow); diff -ruNp org/net/netfilter/nfnetlink_queue.c new/net/netfilter/nfnetlink_queue.c --- org/net/netfilter/nfnetlink_queue.c 2012-05-22 08:45:32.652606825 +0530 +++ new/net/netfilter/nfnetlink_queue.c 2012-05-22 16:51:21.876842922 +0530 @@ -52,6 +52,7 @@ struct nfqnl_instance { u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; + u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ /* * Following fields are dirtied for each queued packet, * keep them in same cache line if possible. @@ -431,9 +432,14 @@ nfqnl_enqueue_packet(struct nf_queue_ent goto err_out_free_nskb; } if (queue->queue_total >= queue->queue_maxlen) { - queue->queue_dropped++; - net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", - queue->queue_total); + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + /* Accept the packet temporarily skipping rules */ + err = -ENOSPC; + } else { + queue->queue_dropped++; + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + } goto err_out_free_nskb; } entry->id = ++queue->id_sequence; @@ -858,6 +864,31 @@ nfqnl_recv_config(struct sock *ctnl, str spin_unlock_bh(&queue->lock); } + if (nfqa[NFQA_CFG_FLAGS]) { + u_int32_t *flags, *mask; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + + if (!nfqa[NFQA_CFG_MASK]) { + /* A mask is needed to tell which flags are being + * changed. + * */ + ret = -EINVAL; + goto err_out_unlock; + } + + flags = nla_data(nfqa[NFQA_CFG_FLAGS]); + mask = nla_data(nfqa[NFQA_CFG_MASK]); + + spin_lock_bh(&queue->lock); + queue->flags &= ~ntohl(*mask); + queue->flags |= ntohl(*flags) & ntohl(*mask); + spin_unlock_bh(&queue->lock); + } + err_out_unlock: rcu_read_unlock(); return ret; diff -ruNp org/net/netfilter/nf_queue.c new/net/netfilter/nf_queue.c --- org/net/netfilter/nf_queue.c 2012-05-22 08:45:32.649606572 +0530 +++ new/net/netfilter/nf_queue.c 2012-05-22 14:21:19.578299181 +0530 @@ -268,14 +268,23 @@ int nf_queue(struct sk_buff *skb, err = __nf_queue(segs, elem, pf, hook, indev, outdev, okfn, queuenum); } - if (err == 0) + + if (err == 0) { queued++; - else + } else if (err == -ENOSPC) { + /* Queue failed due to queue-full and handler is + * in "fail-open" mode. + */ + segs->next = nskb; + skb->next = segs; + break; + } else { kfree_skb(segs); + } segs = nskb; } while (segs); - if (queued) { + if (queued && err != -ENOSPC) { kfree_skb(skb); return 0; }