@@ -71,6 +71,18 @@ struct nft_ebpf {
extern const struct nft_expr_ops nft_ebpf_fast_ops;
+struct nft_jit_data_from_user { /* per-rule jitter result; NOTE(review): presumably the reply read back from the usermode helper -- confirm against the reader */
+ int ebpf_fd; /* fd to get the bpf program from, or < 0 if the jitter reported an error */
+ u32 expr_count; /* number of translated (jitted) expressions of the rule */
+};
+
+#if IS_ENABLED(CONFIG_NF_TABLES_JIT)
+int nft_jit_commit(struct net *net); /* called at the start of nf_tables_commit(); < 0 aborts the commit */
+#else /* !CONFIG_NF_TABLES_JIT */
+static inline int nft_jit_commit(struct net *net) { return 0; } /* no-op stub so the caller needs no #ifdef */
+#endif /* CONFIG_NF_TABLES_JIT */
+int nf_tables_jit_work(const struct sk_buff *nlskb, struct nft_ebpf *e); /* NOTE(review): declared unconditionally but only built with CONFIG_NF_TABLES_JIT */
+
extern struct static_key_false nft_counters_enabled;
extern struct static_key_false nft_trace_enabled;
@@ -473,6 +473,13 @@ config NF_TABLES_NETDEV
help
This option enables support for the "netdev" table.
+config NF_TABLES_JIT
+ bool "Netfilter nf_tables jit infrastructure"
+ depends on BPF
+ help
+ This option enables support for translating nf_tables
+ expressions to eBPF programs.
+
config NFT_NUMGEN
tristate "Netfilter nf_tables number generator module"
help
@@ -76,8 +76,12 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
- nf_tables_jit.o
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
+
+obj-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit/
+nf_tables-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit.o
+nf_tables-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit/nf_tables_jit_kern.o
+nf_tables-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit/nf_tables_jit_umh.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
@@ -6092,6 +6092,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
+ int ret;
+
+ ret = nft_jit_commit(net);
+ if (ret < 0)
+ return ret;
/* 1. Allocate space for next generation rules_gen_X[] */
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
@@ -93,19 +93,46 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
return true;
}
+/* Dirty hack: pass nft_pktinfo in skb->cb[]
+ *
+ * nft_ebpf_fast_eval() zeroes this structure inside the skb control block,
+ * fills it from the nft_pktinfo and the skb, runs the bpf program and then
+ * restores the saved control block contents.  The structure must fit in
+ * QDISC_CB_PRIV_LEN bytes (enforced by a BUILD_BUG_ON in the evaluator).
+ */
+struct nft_jit_args_inet_cb {
+ /* cb[0] */
+ u16 thoff; /* pkt->xt.thoff if pkt->tprot_set; 0: unset */
+ u16 lloff; /* skb_mac_header_len() if the mac header was set; 0: unset */
+
+ /* cb[1] */
+ u16 l4proto; /* pkt->tprot, written together with thoff; thoff = 0? unset */
+ u16 reserved; /* only ever zeroed by the evaluator */
+
+ /* 12 bytes left */
+};
+
static void nft_ebpf_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_ebpf *priv = nft_expr_priv(expr);
+ struct nft_jit_args_inet_cb *jit_args;
struct bpf_skb_data_end cb_saved;
int ret;
+ BUILD_BUG_ON(sizeof(struct nft_jit_args_inet_cb) > QDISC_CB_PRIV_LEN);
+
memcpy(&cb_saved, pkt->skb->cb, sizeof(cb_saved));
+
+ jit_args = (void *)bpf_skb_cb(pkt->skb);
+ memset(jit_args, 0, sizeof(*jit_args));
+
+ if (skb_mac_header_was_set(pkt->skb))
+ jit_args->lloff = skb_mac_header_len(pkt->skb);
+
+ if (pkt->tprot_set) {
+ jit_args->thoff = pkt->xt.thoff;
+ jit_args->l4proto = pkt->tprot;
+ }
+
bpf_compute_data_pointers(pkt->skb);
ret = BPF_PROG_RUN(priv->prog, pkt->skb);
-
memcpy(pkt->skb->cb, &cb_saved, sizeof(cb_saved));
switch (ret) {
@@ -119,9 +146,9 @@ static void nft_ebpf_fast_eval(const struct nft_expr *expr,
default:
pr_debug("Unknown verdict %d\n", ret);
regs->verdict.code = NF_DROP;
- break;
}
}
+
DEFINE_STATIC_KEY_FALSE(nft_counters_enabled);
static noinline void nft_update_chain_stats(const struct nft_chain *chain,
@@ -1,13 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
+#include <linux/filter.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+#include <linux/file.h>
+
+/*
+ * Serialize @rule into @skb as an NFT_MSG_NEWRULE netlink message: nfgenmsg
+ * header, table name, chain name, rule handle and the nested list of all
+ * expressions of the rule.
+ *
+ * Returns 0 on success.  Returns -EMSGSIZE if the message header or the
+ * expression nest cannot be started, otherwise the error reported by the
+ * failing nla_put_*() / nft_expr_dump() call.  On failure the partially
+ * written message is left in @skb.
+ */
+static int nft_jit_dump_ruleinfo(struct sk_buff *skb,
+				 const struct nft_ctx *ctx, const struct nft_rule *rule)
+{
+	const struct nft_expr *cur, *end;
+	struct nlattr *nest;
+	struct nfgenmsg *gen;
+	struct nlmsghdr *hdr;
+	int err;
+
+	hdr = nlmsg_put(skb, ctx->portid, ctx->seq,
+			nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWRULE),
+			sizeof(struct nfgenmsg), 0);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	gen = nlmsg_data(hdr);
+	gen->nfgen_family = ctx->family;
+	gen->version = NFNETLINK_V0;
+	gen->res_id = htons(ctx->net->nft.base_seq & 0xffff);
+
+	/* Rule identity: stop at the first attribute that does not fit. */
+	err = nla_put_string(skb, NFTA_RULE_TABLE, ctx->table->name);
+	if (err >= 0)
+		err = nla_put_string(skb, NFTA_RULE_CHAIN, ctx->chain->name);
+	if (err >= 0)
+		err = nla_put_be64(skb, NFTA_RULE_HANDLE,
+				   cpu_to_be64(rule->handle), NFTA_RULE_PAD);
+	if (err < 0)
+		return err;
+
+	nest = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	nft_rule_for_each_expr(cur, end, rule) {
+		err = nft_expr_dump(skb, NFTA_LIST_ELEM, cur);
+		if (err)
+			return err;
+	}
+
+	nla_nest_end(skb, nest);
+	nlmsg_end(skb, hdr);
+	return 0;
+}
struct nft_ebpf_expression {
struct nft_expr e;
struct nft_ebpf priv;
};
+/*
+ * Try to replace the rule of a NEWRULE transaction by a jitted variant.
+ *
+ * The rule is dumped into @skb in netlink format and handed to
+ * nf_tables_jit_work().  If no bpf program comes back, the transaction is
+ * left untouched and 0 is returned.  Otherwise a new rule blob is built:
+ * one 'ebpf' expression first, then every original expression that was not
+ * covered by the program, then a copy of the userdata (if any).  The new
+ * rule replaces the original one both in the rule list and in the
+ * transaction; the original stays referenced from the ebpf expression.
+ *
+ * Returns 0 on success or when nothing was jitted, a negative errno if the
+ * dump or the jit step fails, -ENOMEM if the new rule cannot be allocated
+ * (the program reference is dropped in that case).
+ */
+static int nft_jit_rule(struct nft_trans *trans, struct sk_buff *skb)
+{
+	const struct nft_rule *orig = nft_trans_rule(trans);
+	struct nft_ebpf_expression ebpf = { 0 };
+	const struct nft_userdata *udata = NULL;
+	unsigned int bytes = sizeof(ebpf);
+	const struct nft_expr *src, *end;
+	struct nft_rule *jitted;
+	struct nft_expr *dst;
+	int err, seen = 0;
+
+	err = nft_jit_dump_ruleinfo(skb, &trans->ctx, orig);
+	if (err >= 0)
+		err = nf_tables_jit_work(skb, &ebpf.priv);
+	if (err < 0)
+		return err;
+
+	/* Nothing was translated: keep the rule for the interpreter. */
+	if (!ebpf.priv.prog)
+		return 0;
+
+	ebpf.priv.original = orig;
+
+	/*
+	 * NOTE(review): udata->len + 1 bytes are reserved and later copied
+	 * starting at the nft_userdata header; confirm this covers the whole
+	 * userdata blob (header byte plus payload).
+	 */
+	if (orig->udata) {
+		udata = nft_userdata(orig);
+		bytes += udata->len + 1;
+	}
+
+	jitted = kmalloc(sizeof(*jitted) + orig->dlen + bytes, GFP_KERNEL);
+	if (!jitted) {
+		bpf_prog_put(ebpf.priv.prog);
+		return -ENOMEM;
+	}
+
+	memcpy(jitted, orig, sizeof(*orig));
+	jitted->dlen = orig->dlen + sizeof(ebpf);
+
+	/* The ebpf expression always comes first in the new blob. */
+	dst = nft_expr_first(jitted);
+	memcpy(dst, &ebpf, sizeof(ebpf));
+	dst->ops = &nft_ebpf_fast_ops;
+	bytes = sizeof(ebpf);
+
+	/* Append the expressions the program does not cover. */
+	nft_rule_for_each_expr(src, end, orig) {
+		if (++seen <= ebpf.priv.expressions)
+			continue; /* expression was jitted */
+
+		dst = nft_expr_next(dst);
+		memcpy(dst, src, src->ops->size);
+		bytes += src->ops->size;
+	}
+
+	/* dlen must be final before nft_userdata() is applied to the new rule. */
+	jitted->dlen = bytes;
+	if (udata)
+		memcpy(nft_userdata(jitted), udata, udata->len + 1);
+
+	list_replace_rcu(&nft_trans_rule(trans)->list, &jitted->list);
+	nft_trans_rule(trans) = jitted;
+
+	return 0;
+}
+
+/**
+ * nft_jit_commit - try to jit the rules added by the pending transaction
+ * @net: network namespace whose nft commit list is walked
+ *
+ * Every NFT_MSG_NEWRULE entry on the commit list is handed to
+ * nft_jit_rule().  A single scratch skb is allocated up front and reused
+ * for the netlink dump of each rule.
+ *
+ * Return: 0 on success (including a transaction that adds no rule at all),
+ * -ENOMEM if the scratch skb cannot be allocated, or the negative error
+ * returned by nft_jit_rule(); processing stops at the first failure.
+ */
+int nft_jit_commit(struct net *net)
+{
+	struct nft_trans *trans;
+	struct sk_buff *skb;
+	/* Must start at 0: the loop may never reach nft_jit_rule(). */
+	int ret = 0;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	list_for_each_entry(trans, &net->nft.commit_list, list) {
+		if (trans->msg_type != NFT_MSG_NEWRULE)
+			continue;
+
+		ret = nft_jit_rule(trans, skb);
+		if (ret < 0)
+			break;
+
+		/*
+		 * Empty the scratch skb for the next rule.  skb_trim() resets
+		 * skb->len together with the tail pointer; resetting only the
+		 * tail would leave skb->len accumulating across rules.
+		 */
+		skb_trim(skb, 0);
+	}
+
+	kfree_skb(skb);
+	return ret;
+}
+
static const struct nla_policy nft_ebpf_policy[NFTA_EBPF_MAX + 1] = {
[NFTA_EBPF_FD] = { .type = NLA_S32 },
[NFTA_EBPF_ID] = { .type = NLA_U32 },
new file mode 100644
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
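+# Build the nf_tables jit usermode helper as a host program and embed the resulting binary as a read-only blob.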
+
+hostprogs-y := nf_tables_jit_umh
+nf_tables_jit_umh-objs := main.o
+HOSTCFLAGS += -I. -Itools/include/
+
+quiet_cmd_copy_umh = GEN $@
+ cmd_copy_umh = echo ':' > $(obj)/.nf_tables_jit_umh.o.cmd; \
+ $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+ -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
+ --rename-section .data=.rodata $< $@
+
+$(obj)/nf_tables_jit_umh.o: $(obj)/nf_tables_jit_umh
+ $(call cmd,copy_umh)
+
+obj-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit.o
+nf_tables_jit-objs += nf_tables_jit_kern.o nf_tables_jit_umh.o
new file mode 100644
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <unistd.h>
+
+/*
+ * Placeholder jit helper: for every chunk of data read from stdin, answer
+ * on stdout with a fixed "translation failed" response (fd = -1, count = 0).
+ *
+ * Exit status: 0 when stdin reaches end-of-file, 1 on a read error,
+ * 2 if the response cannot be written completely.
+ */
+int main(void)
+{
+	static struct {
+		int fd, count;
+	} response;
+
+	response.fd = -1;
+	for (;;) {
+		char buf[8192];
+		ssize_t len = read(0, buf, sizeof(buf));
+
+		if (len < 0)
+			return 1;
+		/*
+		 * End-of-file: the peer closed its end.  Without this check
+		 * the loop would spin, emitting a response per iteration
+		 * until write() eventually fails.
+		 */
+		if (len == 0)
+			return 0;
+		if (write(1, &response, sizeof(response)) != sizeof(response))
+			return 2;
+	}
+}
new file mode 100644
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/umh.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+#define UMH_start _binary_net_netfilter_nf_tables_jit_nf_tables_jit_umh_start
+#define UMH_end _binary_net_netfilter_nf_tables_jit_nf_tables_jit_umh_end
+
+extern char UMH_start;
+extern char UMH_end;
+
+static struct umh_info info;
+
+/*
+ * Start the usermode helper from the blob delimited by UMH_start/UMH_end;
+ * the result of fork_usermode_blob() is passed through unchanged.
+ */
+static int nft_jit_load_umh(void)
+{
+	const ptrdiff_t blob_len = &UMH_end - &UMH_start;
+
+	return fork_usermode_blob(&UMH_start, blob_len, &info);
+}
+
+/*
+ * Make sure the usermode helper is running.  @nlskb and @e are not used yet.
+ *
+ * Returns 0 if the pipe to the helper is already set up or was set up by
+ * loading the helper now, the error from nft_jit_load_umh() if loading
+ * fails, or -EINVAL (with a warning) if loading succeeded but left no pipe.
+ */
+int nf_tables_jit_work(const struct sk_buff *nlskb, struct nft_ebpf *e)
+{
+	int err;
+
+	/* Helper already running: nothing to do. */
+	if (info.pipe_to_umh)
+		return 0;
+
+	err = nft_jit_load_umh();
+	if (err)
+		return err;
+
+	if (WARN_ON(!info.pipe_to_umh))
+		return -EINVAL;
+
+	return 0;
+}
This adds a JIT helper infrastructure to translate nft expressions to ebpf programs. From the commit phase, we spawn the jit module (a userspace program), and then provide the rules that came in this transaction to that program via a pipe (in nf_tables netlink format). The userspace helper translates the rules if possible, and installs the program(s) via the bpf syscall. For each rule a small response containing the corresponding file descriptor (can be -1 on failure) and an attribute count (how many expressions were jitted) gets sent back to the kernel via the pipe. If translation fails, the rule will be processed by the nf_tables interpreter (as before this patch). If translation succeeded, nf_tables fetches the bpf program using the file descriptor identifier, allocates a new rule blob containing the new 'ebpf' expression (and possible trailing un-translated expressions). It then replaces the original rule in the transaction log with the new 'ebpf-rule'. The original rule is retained in a private area inside the ebpf expression to be able to present the original expressions to userspace when 'nft list ruleset' is called. For easier review, this contains the kernel-side only. nf_tables_jit_work() will not do anything, yet. Unresolved issues: - maps and sets. It might be possible to add a new ebpf map type that just wraps the nft set infrastructure for lookups. This would allow nft userspace to continue to work as-is while not requiring a new ebpf helper. - we should eventually support translating multiple (adjacent) rules into a single program. If we do this, the kernel will need to track the mapping of rules to programs (to re-jit when a rule is changed). This isn't implemented so far, but can be added later. We will also need to dump the 'next' generation of the to-be-translated table. The kernel has this information, so it's only a matter of serializing it back to userspace from the commit phase.
Signed-off-by: Florian Westphal <fw@strlen.de> --- include/net/netfilter/nf_tables_core.h | 12 ++ net/netfilter/Kconfig | 7 ++ net/netfilter/Makefile | 8 +- net/netfilter/nf_tables_api.c | 5 + net/netfilter/nf_tables_core.c | 31 ++++- net/netfilter/nf_tables_jit.c | 139 +++++++++++++++++++++++ net/netfilter/nf_tables_jit/Makefile | 18 +++ net/netfilter/nf_tables_jit/main.c | 21 ++++ net/netfilter/nf_tables_jit/nf_tables_jit_kern.c | 33 ++++++ 9 files changed, 270 insertions(+), 4 deletions(-) create mode 100644 net/netfilter/nf_tables_jit/Makefile create mode 100644 net/netfilter/nf_tables_jit/main.c create mode 100644 net/netfilter/nf_tables_jit/nf_tables_jit_kern.c