diff mbox

[v13,net-next,07/11] bpf: verifier (add ability to receive verification log)

Message ID 1410914370-29883-8-git-send-email-ast@plumgrid.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Alexei Starovoitov Sept. 17, 2014, 12:39 a.m. UTC
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
    struct {
	...
	__u32         log_level; /* verbosity level of eBPF verifier */
	__u32         log_size;  /* size of user buffer */
	__aligned_u64 log_buf;   /* user supplied 'char *buffer' */
    };
};

when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.

'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:

  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
  BPF_LD_MAP_FD(BPF_REG_1, 0),
  BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
  BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
  BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
  BPF_EXIT_INSN(),

will be rejected with the following multi-line message in log_buf:

  0: (7a) *(u64 *)(r10 -8) = 0
  1: (bf) r2 = r10
  2: (07) r2 += -8
  3: (b7) r1 = 0
  4: (85) call 1
  5: (15) if r0 == 0x0 goto pc+1
   R0=map_ptr R10=fp
  6: (7a) *(u64 *)(r0 +4) = 0
  misaligned access off 4 size 8

The format of the output can change at any time as verifier evolves.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    3 +
 kernel/bpf/syscall.c     |    2 +-
 kernel/bpf/verifier.c    |  235 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 239 insertions(+), 1 deletion(-)

Comments

Daniel Borkmann Sept. 17, 2014, 6:51 a.m. UTC | #1
On 09/17/2014 02:39 AM, Alexei Starovoitov wrote:
> add optional attributes for BPF_PROG_LOAD syscall:
> union bpf_attr {
>      struct {
> 	...
> 	__u32         log_level; /* verbosity level of eBPF verifier */
> 	__u32         log_size;  /* size of user buffer */
> 	__aligned_u64 log_buf;   /* user supplied 'char *buffer' */
>      };
> };
>
> when log_level > 0 the verifier will return its verification log in the user
> supplied buffer 'log_buf' which can be used by program author to analyze why
> verifier rejected given program.
>
> 'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
> provides several examples of these messages, like the program:
>
>    BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
>    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
>    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
>    BPF_LD_MAP_FD(BPF_REG_1, 0),
>    BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
>    BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
>    BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
>    BPF_EXIT_INSN(),
>
> will be rejected with the following multi-line message in log_buf:
>
>    0: (7a) *(u64 *)(r10 -8) = 0
>    1: (bf) r2 = r10
>    2: (07) r2 += -8
>    3: (b7) r1 = 0
>    4: (85) call 1
>    5: (15) if r0 == 0x0 goto pc+1
>     R0=map_ptr R10=fp
>    6: (7a) *(u64 *)(r0 +4) = 0
>    misaligned access off 4 size 8
>
> The format of the output can change at any time as verifier evolves.
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> ---
>   include/uapi/linux/bpf.h |    3 +
>   kernel/bpf/syscall.c     |    2 +-
>   kernel/bpf/verifier.c    |  235 ++++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 239 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 424f442016e7..31b0ac208a52 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -138,6 +138,9 @@ union bpf_attr {
>   		__u32		insn_cnt;
>   		__aligned_u64	insns;
>   		__aligned_u64	license;
> +		__u32		log_level;	/* verbosity level of verifier */
> +		__u32		log_size;	/* size of user buffer */
> +		__aligned_u64	log_buf;	/* user supplied buffer */
>   	};
>   } __attribute__((aligned(8)));
>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 67b5e29f183e..c7be7163bd11 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
>   }
>
>   /* last field in 'union bpf_attr' used by this command */
> -#define	BPF_PROG_LOAD_LAST_FIELD license
> +#define	BPF_PROG_LOAD_LAST_FIELD log_buf

I was looking to find a use case for this item, but couldn't find anything, so
this seems to be dead code?

Was it, so that each time you extend an uapi structure like above that you would
only access the structure up to BPF_PROG_LOAD_LAST_FIELD? That might not work for
old binaries using this ABI running on newer kernels where there are different
expectations of what BPF_PROG_LOAD_LAST_FIELD has been at the time of compilation.

>   static int bpf_prog_load(union bpf_attr *attr)
>   {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index d6f9c3d6b4d7..871edc1f2e1f 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -125,9 +125,244 @@
>    * are set to NOT_INIT to indicate that they are no longer readable.
>    */
>
> +/* single container for all structs
> + * one verifier_env per bpf_check() call
> + */
> +struct verifier_env {
> +};
> +
> +/* verbose verifier prints what it's seeing
> + * bpf_check() is called under lock, so no race to access these global vars
> + */
> +static u32 log_level, log_size, log_len;
> +static char *log_buf;
> +
> +static DEFINE_MUTEX(bpf_verifier_lock);
> +
> +/* log_level controls verbosity level of eBPF verifier.
> + * verbose() is used to dump the verification trace to the log, so the user
> + * can figure out what's wrong with the program
> + */
> +static void verbose(const char *fmt, ...)
> +{
> +	va_list args;
> +
> +	if (log_level == 0 || log_len >= log_size - 1)
> +		return;
> +
> +	va_start(args, fmt);
> +	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
> +	va_end(args);
> +}
> +
> +static const char *const bpf_class_string[] = {
> +	[BPF_LD]    = "ld",
> +	[BPF_LDX]   = "ldx",
> +	[BPF_ST]    = "st",
> +	[BPF_STX]   = "stx",
> +	[BPF_ALU]   = "alu",
> +	[BPF_JMP]   = "jmp",
> +	[BPF_RET]   = "BUG",
> +	[BPF_ALU64] = "alu64",
> +};
> +
> +static const char *const bpf_alu_string[] = {
> +	[BPF_ADD >> 4]  = "+=",
> +	[BPF_SUB >> 4]  = "-=",
> +	[BPF_MUL >> 4]  = "*=",
> +	[BPF_DIV >> 4]  = "/=",
> +	[BPF_OR  >> 4]  = "|=",
> +	[BPF_AND >> 4]  = "&=",
> +	[BPF_LSH >> 4]  = "<<=",
> +	[BPF_RSH >> 4]  = ">>=",
> +	[BPF_NEG >> 4]  = "neg",
> +	[BPF_MOD >> 4]  = "%=",
> +	[BPF_XOR >> 4]  = "^=",
> +	[BPF_MOV >> 4]  = "=",
> +	[BPF_ARSH >> 4] = "s>>=",
> +	[BPF_END >> 4]  = "endian",
> +};
> +
> +static const char *const bpf_ldst_string[] = {
> +	[BPF_W >> 3]  = "u32",
> +	[BPF_H >> 3]  = "u16",
> +	[BPF_B >> 3]  = "u8",
> +	[BPF_DW >> 3] = "u64",
> +};
> +
> +static const char *const bpf_jmp_string[] = {
> +	[BPF_JA >> 4]   = "jmp",
> +	[BPF_JEQ >> 4]  = "==",
> +	[BPF_JGT >> 4]  = ">",
> +	[BPF_JGE >> 4]  = ">=",
> +	[BPF_JSET >> 4] = "&",
> +	[BPF_JNE >> 4]  = "!=",
> +	[BPF_JSGT >> 4] = "s>",
> +	[BPF_JSGE >> 4] = "s>=",
> +	[BPF_CALL >> 4] = "call",
> +	[BPF_EXIT >> 4] = "exit",
> +};
> +
> +static void print_bpf_insn(struct bpf_insn *insn)
> +{
> +	u8 class = BPF_CLASS(insn->code);
> +
> +	if (class == BPF_ALU || class == BPF_ALU64) {
> +		if (BPF_SRC(insn->code) == BPF_X)
> +			verbose("(%02x) %sr%d %s %sr%d\n",
> +				insn->code, class == BPF_ALU ? "(u32) " : "",
> +				insn->dst_reg,
> +				bpf_alu_string[BPF_OP(insn->code) >> 4],
> +				class == BPF_ALU ? "(u32) " : "",
> +				insn->src_reg);
> +		else
> +			verbose("(%02x) %sr%d %s %s%d\n",
> +				insn->code, class == BPF_ALU ? "(u32) " : "",
> +				insn->dst_reg,
> +				bpf_alu_string[BPF_OP(insn->code) >> 4],
> +				class == BPF_ALU ? "(u32) " : "",
> +				insn->imm);
> +	} else if (class == BPF_STX) {
> +		if (BPF_MODE(insn->code) == BPF_MEM)
> +			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
> +				insn->code,
> +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
> +				insn->dst_reg,
> +				insn->off, insn->src_reg);
> +		else if (BPF_MODE(insn->code) == BPF_XADD)
> +			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
> +				insn->code,
> +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
> +				insn->dst_reg, insn->off,
> +				insn->src_reg);
> +		else
> +			verbose("BUG_%02x\n", insn->code);
> +	} else if (class == BPF_ST) {
> +		if (BPF_MODE(insn->code) != BPF_MEM) {
> +			verbose("BUG_st_%02x\n", insn->code);
> +			return;
> +		}
> +		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
> +			insn->code,
> +			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
> +			insn->dst_reg,
> +			insn->off, insn->imm);
> +	} else if (class == BPF_LDX) {
> +		if (BPF_MODE(insn->code) != BPF_MEM) {
> +			verbose("BUG_ldx_%02x\n", insn->code);
> +			return;
> +		}
> +		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
> +			insn->code, insn->dst_reg,
> +			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
> +			insn->src_reg, insn->off);
> +	} else if (class == BPF_LD) {
> +		if (BPF_MODE(insn->code) == BPF_ABS) {
> +			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
> +				insn->code,
> +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
> +				insn->imm);
> +		} else if (BPF_MODE(insn->code) == BPF_IND) {
> +			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
> +				insn->code,
> +				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
> +				insn->src_reg, insn->imm);
> +		} else if (BPF_MODE(insn->code) == BPF_IMM) {
> +			verbose("(%02x) r%d = 0x%x\n",
> +				insn->code, insn->dst_reg, insn->imm);
> +		} else {
> +			verbose("BUG_ld_%02x\n", insn->code);
> +			return;
> +		}
> +	} else if (class == BPF_JMP) {
> +		u8 opcode = BPF_OP(insn->code);
> +
> +		if (opcode == BPF_CALL) {
> +			verbose("(%02x) call %d\n", insn->code, insn->imm);
> +		} else if (insn->code == (BPF_JMP | BPF_JA)) {
> +			verbose("(%02x) goto pc%+d\n",
> +				insn->code, insn->off);
> +		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
> +			verbose("(%02x) exit\n", insn->code);
> +		} else if (BPF_SRC(insn->code) == BPF_X) {
> +			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
> +				insn->code, insn->dst_reg,
> +				bpf_jmp_string[BPF_OP(insn->code) >> 4],
> +				insn->src_reg, insn->off);
> +		} else {
> +			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
> +				insn->code, insn->dst_reg,
> +				bpf_jmp_string[BPF_OP(insn->code) >> 4],
> +				insn->imm, insn->off);
> +		}
> +	} else {
> +		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
> +	}
> +}
> +
>   int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
>   {
> +	char __user *log_ubuf = NULL;
> +	struct verifier_env *env;
>   	int ret = -EINVAL;
>
> +	if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
> +		return -E2BIG;
> +
> +	/* 'struct verifier_env' can be global, but since it's not small,
> +	 * allocate/free it every time bpf_check() is called
> +	 */
> +	env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
> +	if (!env)
> +		return -ENOMEM;
> +
> +	/* grab the mutex to protect few globals used by verifier */
> +	mutex_lock(&bpf_verifier_lock);

So only because of the verifier error log (which are global vars here) we
now have to hold a eBPF-related mutex lock each time when attaching a program?

Also, if you really have to do the verifier error log, can't we spare ourself
most part of the textifying parts if you would encode the verifier log into a
normal structure array with eBPF specific error codes and then do all this
pretty printing in user space? Why is that impossible? I really think it's odd.

> +	if (attr->log_level || attr->log_buf || attr->log_size) {
> +		/* user requested verbose verifier output
> +		 * and supplied buffer to store the verification trace
> +		 */
> +		log_level = attr->log_level;
> +		log_ubuf = (char __user *) (unsigned long) attr->log_buf;
> +		log_size = attr->log_size;
> +		log_len = 0;
> +
> +		ret = -EINVAL;
> +		/* log_* values have to be sane */
> +		if (log_size < 128 || log_size > UINT_MAX >> 8 ||
> +		    log_level == 0 || log_ubuf == NULL)
> +			goto free_env;
> +
> +		ret = -ENOMEM;
> +		log_buf = vmalloc(log_size);
> +		if (!log_buf)
> +			goto free_env;
> +	} else {
> +		log_level = 0;
> +	}
> +
> +	/* ret = do_check(env); */
> +
> +	if (log_level && log_len >= log_size - 1) {
> +		BUG_ON(log_len >= log_size);
> +		/* verifier log exceeded user supplied buffer */
> +		ret = -ENOSPC;
> +		/* fall through to return what was recorded */
> +	}
> +
> +	/* copy verifier log back to user space including trailing zero */
> +	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
> +		ret = -EFAULT;
> +		goto free_log_buf;
> +	}
> +
> +
> +free_log_buf:
> +	if (log_level)
> +		vfree(log_buf);
> +free_env:
> +	kfree(env);
> +	mutex_unlock(&bpf_verifier_lock);
>   	return ret;
>   }
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 17, 2014, 4:08 p.m. UTC | #2
On Tue, Sep 16, 2014 at 11:51 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
>>
>>   /* last field in 'union bpf_attr' used by this command */
>> -#define        BPF_PROG_LOAD_LAST_FIELD license
>> +#define        BPF_PROG_LOAD_LAST_FIELD log_buf
>
>
> I was looking to find a use case for this item, but couldn't find anything,
> so
> this seems to be dead code?

See CHECK_ATTR() macro and comment next to it in patch #1

> Was it, so that each time you extend an uapi structure like above that you
> would
> only access the structure up to BPF_PROG_LOAD_LAST_FIELD? That might not
> work for
> old binaries using this ABI running on newer kernels where there are
> different
> expectations of what BPF_PROG_LOAD_LAST_FIELD has been at the time of
> compilation.

exactly the opposite.
CHECK_ATTR() is checking that all fields beyond last for given
command are zero, so we can extend bpf_attr with new fields
added after last.
Transition from patch 4 to patch 7 and the hunk you quoted are
demonstrating exactly that. Say, userspace was compiled
with bpf_attr as defined in patch 4 and it populated fields all the way
till 'license', and kernel is compiled with patch 7. Kernel does:
union bpf_attr attr = {};
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
copy_from_user(&attr, uattr, size)
so newer fields (all the way till log_buf) stay zero and kernel
behavior is the same as it was in patch 4.
So older user space works as-is with newer kernel.
Another example:
say, we want to add another flag to lookup() method added in patch 3,
we just add another 'flags' field after 'value' field and adjust:
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
Older user apps stay binary compatible with newer kernel.
Does this explain things?

>> +
>> +       /* grab the mutex to protect few globals used by verifier */
>> +       mutex_lock(&bpf_verifier_lock);
>
>
> So only because of the verifier error log (which are global vars here) we
> now have to hold a eBPF-related mutex lock each time when attaching a
> program?

correct.
it's done on purpose to simplify verifier code.
User app is blocked in bpf syscall until verifier checks the program.
Not a big deal. I don't expect a lot of concurrent program loading.
If it somehow becomes an issue, we can fix it, but for now I think
less lines of verifier code is definitely a better trade off.

> Also, if you really have to do the verifier error log, can't we spare
> ourself
> most part of the textifying parts if you would encode the verifier log into
> a
> normal structure array with eBPF specific error codes and then do all this
> pretty printing in user space? Why is that impossible? I really think it's
> odd.

I thought I explained this already...
verifier log is not at all "an array of specific error codes".
verifier is printing the trace and state of what it's seeing
while analyzing the program. Very branchy program may
generate a trace log several times larger than program size.
pretty text is the most convenient way to pass it to user.
Theoretically we can come with some obscure log format for
this internal verifier state, but what do we get ? only additional
complexity. This obscure format will change just as text will
change, because verifier will evolve. If you're looking for a way
to fix this output into ABI, it's not possible. Verifier will
become more advanced in the future and its trace whether
in text or in obscure encoded structs will change.
Therefore text is much simpler option.
Also consider the learning curve for somebody planning
to add new features to verifier. This trace log is a perfect way
to understand how verifier is working. Try simple program
with multiple branches and see what kind of log is dumped.

Another example:
To make verifier easier to review, in this patch set I didn't
include 'state pruning optimization' patch. That patch
will change the trace log, because it changes the way
verifier is working. If we had to introduce struct
encoding of trace log, it would need to be changed already.
So pretty text is the simplest and most convenient way.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 17, 2014, 5:03 p.m. UTC | #3
On 09/17/2014 06:08 PM, Alexei Starovoitov wrote:
> On Tue, Sep 16, 2014 at 11:51 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
>>>
>>>    /* last field in 'union bpf_attr' used by this command */
>>> -#define        BPF_PROG_LOAD_LAST_FIELD license
>>> +#define        BPF_PROG_LOAD_LAST_FIELD log_buf
>>
>> I was looking to find a use case for this item, but couldn't find anything,
>> so
>> this seems to be dead code?
>
> See CHECK_ATTR() macro and comment next to it in patch #1
>
>> Was it, so that each time you extend an uapi structure like above that you
>> would
>> only access the structure up to BPF_PROG_LOAD_LAST_FIELD? That might not
>> work for
>> old binaries using this ABI running on newer kernels where there are
>> different
>> expectations of what BPF_PROG_LOAD_LAST_FIELD has been at the time of
>> compilation.
>
> exactly the opposite.
> CHECK_ATTR() is checking that all fields beyond last for given
> command are zero, so we can extend bpf_attr with new fields
> added after last.
> Transition from patch 4 to patch 7 and the hunk you quoted are
> demonstrating exactly that. Say, userspace was compiled
> with bpf_attr as defined in patch 4 and it populated fields all the way
> till 'license', and kernel is compiled with patch 7. Kernel does:
> union bpf_attr attr = {};
> /* copy attributes from user space, may be less than sizeof(bpf_attr) */
> copy_from_user(&attr, uattr, size)
> so newer fields (all the way till log_buf) stay zero and kernel
> behavior is the same as it was in patch 4.
> So older user space works as-is with newer kernel.

Ok, I see. Let's say, since the introduction of this syscall you have
added a couple of features and thus extended union bpf_attr where it
grew in size over time.

You built your shiny new binary on that uapi setting, and later on
decide to run it on an older kernel. What will happen is that in your
bpf syscall handler you will return with -EINVAL on that kernel right
away since the size of union bpf_attr is greater.

That would mean over time when new features will get added, applications
that want to make sure to run on _all_ kernels where the bpf syscall is
available have to make sure to either use the _initial_ version of
union bpf_attr in order to not get an -EINVAL, or worse they have
to probe through a syscall different versions of union bpf_attr if they
wish to make use of a particular feature until they do not get an -EINVAL
anymore.

I guess that might be problematic for an application developer that
wants to ship its application across different distributions usually
running different kernels. At least those people might then consider
holding a private copy of the _initial_ version of union bpf_attr in
their own source code, but it's not pleasant I guess.

I know you seem to have the constraint to run on NET-less systems, but
netlink could partially resolve that in the sense that it would allow
to at least load an eBPF program with initial feature set. Couldn't
there be some mechanism to make this interaction more pleasant? E.g.
in BPF extensions we can ask the kernel up to what extension it
supports and accordingly adapt to it up front. I know it's just a
/trivial/ example but have you thought about something on that kind for
the syscall?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 17, 2014, 7:17 p.m. UTC | #4
On 09/17/2014 07:03 PM, Daniel Borkmann wrote:
> On 09/17/2014 06:08 PM, Alexei Starovoitov wrote:
>> On Tue, Sep 16, 2014 at 11:51 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
>>>>
>>>>    /* last field in 'union bpf_attr' used by this command */
>>>> -#define        BPF_PROG_LOAD_LAST_FIELD license
>>>> +#define        BPF_PROG_LOAD_LAST_FIELD log_buf
>>>
>>> I was looking to find a use case for this item, but couldn't find anything,
>>> so
>>> this seems to be dead code?
>>
>> See CHECK_ATTR() macro and comment next to it in patch #1
>>
>>> Was it, so that each time you extend an uapi structure like above that you
>>> would
>>> only access the structure up to BPF_PROG_LOAD_LAST_FIELD? That might not
>>> work for
>>> old binaries using this ABI running on newer kernels where there are
>>> different
>>> expectations of what BPF_PROG_LOAD_LAST_FIELD has been at the time of
>>> compilation.
>>
>> exactly the opposite.
>> CHECK_ATTR() is checking that all fields beyond last for given
>> command are zero, so we can extend bpf_attr with new fields
>> added after last.
>> Transition from patch 4 to patch 7 and the hunk you quoted are
>> demonstrating exactly that. Say, userspace was compiled
>> with bpf_attr as defined in patch 4 and it populated fields all the way
>> till 'license', and kernel is compiled with patch 7. Kernel does:
>> union bpf_attr attr = {};
>> /* copy attributes from user space, may be less than sizeof(bpf_attr) */
>> copy_from_user(&attr, uattr, size)
>> so newer fields (all the way till log_buf) stay zero and kernel
>> behavior is the same as it was in patch 4.
>> So older user space works as-is with newer kernel.
>
> Ok, I see. Let's say, since the introduction of this syscall you have
> added a couple of features and thus extended union bpf_attr where it
> grew in size over time.
>
> You built your shiny new binary on that uapi setting, and later on
> decide to run it on an older kernel. What will happen is that in your
> bpf syscall handler you will return with -EINVAL on that kernel right
> away since the size of union bpf_attr is greater.
>
> That would mean over time when new features will get added, applications
> that want to make sure to run on _all_ kernels where the bpf syscall is
> available have to make sure to either use the _initial_ version of
> union bpf_attr in order to not get an -EINVAL, or worse they have
> to probe through a syscall different versions of union bpf_attr if they
> wish to make use of a particular feature until they do not get an -EINVAL
> anymore.
>
> I guess that might be problematic for an application developer that
> wants to ship its application across different distributions usually
> running different kernels. At least those people might then consider
> holding a private copy of the _initial_ version of union bpf_attr in
> their own source code, but it's not pleasant I guess.
>
> I know you seem to have the constraint to run on NET-less systems, but
> netlink could partially resolve that in the sense that it would allow
> to at least load an eBPF program with initial feature set. Couldn't
> there be some mechanism to make this interaction more pleasant? E.g.
> in BPF extensions we can ask the kernel up to what extension it
> supports and accordingly adapt to it up front. I know it's just a
> /trivial/ example but have you thought about something on that kind for
> the syscall?

Hm, thinking out loudly ... perhaps this could be made a library problem.
Such that the library which wraps the syscall needs to be aware of a
marker where the initial version ends, and if the application doesn't
make use of any of the new features, it would just pass in the length up
to the marker as size attribute into the syscall. Similarly, if new
features are always added to the end of a structure and the library
truncates the overall-length after the last used member we might have
a chance to load something on older kernels, haven't tried that though.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 17, 2014, 7:37 p.m. UTC | #5
On Wed, Sep 17, 2014 at 12:17 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 09/17/2014 07:03 PM, Daniel Borkmann wrote:
>>
>> You built your shiny new binary on that uapi setting, and later on
>> decide to run it on an older kernel. What will happen is that in your
>> bpf syscall handler you will return with -EINVAL on that kernel right
>> away since the size of union bpf_attr is greater.

that's a correct description of the code in this set.
What I mentioned in cover letter during v7 was:
"I've decided not to bother with forward compatibility for now.
We can address it later the way perf_event_open did."
I was trying not to open this can of worms :)

>> That would mean over time when new features will get added, applications
>> that want to make sure to run on _all_ kernels where the bpf syscall is
>> available have to make sure to either use the _initial_ version of
>> union bpf_attr in order to not get an -EINVAL, or worse they have
>> to probe though a syscall different versions of union bpf_attr if they
>> wish to make use of a particular feature until they do not get an -EINVAL
>> anymore.

Correct as well.
I say that would be the least of user space concerns.
If user app is built with newer bpf_attr and relies on it for some
functionality, it would need a lot more than probing.
Depending how big the new bpf_attr feature is, the app might
not have a workaround for older kernels at all.
So seeing EINVAL on older kernel might be a preferred way.

>> in BPF extensions we can ask the kernel up to what extension it
>> supports and accordingly adapt to it up front. I know it's just a
>> /trivial/ example but have you thought about something on that kind for
>> the syscall?

A 2nd option would be to do what perf_copy_attr() does:
when newer struct is coming from user space, kernel checks
that space beyond known last field is all zeros.
We can do the same, but I'm not convinced that it will
help userspace much. Newer user space can only be
_compiled_ with newer bpf_attr. It still cannot use any new
features and it needs to make sure that all new fields are zero.
I'm not sure how it helps.
perf_event_open() also returns size. In that case it's helpful,
since it's one structure. In ebpf case we have multiple
structures under one union, so returning size of the whole
bpf_attr doesn't help one particular command.

> Hm, thinking out loudly ... perhaps this could be made a library problem.
> Such that the library which wraps the syscall needs to be aware of a
> marker where the initial version ends, and if the application doesn't
> make use of any of the new features, it would just pass in the length up
> to the marker as size attribute into the syscall. Similarly, if new
> features are always added to the end of a structure and the library
> truncates the overall-length after the last used member we might have
> a chance to load something on older kernels, haven't tried that though.

that's a 3rd option. I think it's cleaner than 2nd, since it solves it
completely from user space.
It can even be smarter than that. If this syscall wrapper library
sees that newer features are used and it can workaround them:
it can chop size and pass older fields into the older kernel
and when it returns, do a workaround based on newer fields.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 17, 2014, 11:45 p.m. UTC | #6
On Wed, Sep 17, 2014 at 12:37 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>
>> Hm, thinking out loudly ... perhaps this could be made a library problem.
>> Such that the library which wraps the syscall needs to be aware of a
>> marker where the initial version ends, and if the application doesn't
>> make use of any of the new features, it would just pass in the length up
>> to the marker as size attribute into the syscall. Similarly, if new
>> features are always added to the end of a structure and the library
>> truncates the overall-length after the last used member we might have
>> a chance to load something on older kernels, haven't tried that though.
>
> that's a 3rd option. I think it's cleaner than 2nd, since it solves it
> completely from user space.
> It can even be smarter than that. If this syscall wrapper library
> sees that newer features are used and it can workaround them:
> it can chop size and pass older fields into the older kernel
> and when it returns, do a workaround based on newer fields.

the more I think about 'new user space + old kernel' problem,
the more certain I am that kernel should not try to help
user space, since most of the time it's not going to be enough,
but additional code in kernel would need to be maintained.

syscall commands and size of bpf_attr is the least of problems.
New map_type and prog_type will be added, new helper
functions will be available to programs.
One would think that md5 of uapi/linux/bpf.h would be
enough to say that user app is compatible... In reality,
it's not. The 'state pruning' verifier optimization I've talked
about will not change a single bit in bpf.h, but it will be
able to recognize more programs as safe.
A program developed on a new kernel with more
advanced verifier will load just fine on new kernel, but
this valid program will not load on old kernel, only because
verifier is not smart enough. Now we would need a version
of verifier exposed all the way to user space?!
imo that's too much. I think for eBPF infra kernel
should only guarantee backwards compatibility
(old user space must work with new kernel) and that's it.
That's what this patch is trying to do.
Thoughts?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 18, 2014, 6:44 a.m. UTC | #7
On 09/18/2014 01:45 AM, Alexei Starovoitov wrote:
> On Wed, Sep 17, 2014 at 12:37 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>>
>>> Hm, thinking out loudly ... perhaps this could be made a library problem.
>>> Such that the library which wraps the syscall needs to be aware of a
>>> marker where the initial version ends, and if the application doesn't
>>> make use of any of the new features, it would just pass in the length up
>>> to the marker as size attribute into the syscall. Similarly, if new
>>> features are always added to the end of a structure and the library
>>> truncates the overall-length after the last used member we might have
>>> a chance to load something on older kernels, haven't tried that though.
>>
>> that's a 3rd option. I think it's cleaner than 2nd, since it solves it
>> completely from user space.
>> It can even be smarter than that. If this syscall wrapper library
>> sees that newer features are used and it can workaround them:
>> it can chop size and pass older fields into the older kernel
>> and when it returns, do a workaround based on newer fields.
>
> the more I think about 'new user space + old kernel' problem,
> the more certain I am that kernel should not try to help
> user space, since most of the time it's not going to be enough,
> but additional code in kernel would need to be maintained.
>
> syscall commands and size of bpf_attr is the least of problems.
> New map_type and prog_type will be added, new helper
> functions will be available to programs.
> One would think that md5 of uapi/linux/bpf.h would be
> enough to say that user app is compatible... In reality,
> it's not. The 'state pruning' verifier optimization I've talked
> about will not change a single bit in bpf.h, but it will be
> able to recognize more programs as safe.
> A program developed on a new kernel with more
> advanced verifier will load just fine on new kernel, but
> this valid program will not load on old kernel, only because
> verifier is not smart enough. Now we would need a version
> of verifier exposed all the way to user space?!
> imo that's too much. I think for eBPF infra kernel
> should only guarantee backwards compatibility
> (old user space must work with new kernel) and that's it.
> That's what this patch is trying to do.
> Thoughts?

Sure, you will never get a full compatibility on that regard
while backwards compatibility needs to be guaranteed on the
other hand. I looked at perf_copy_attr() implementation and I
think that we should mimic it in a very similar way as it
exactly solves what we need.

For example, it will return with -EINVAL for (size > PAGE_SIZE)
and (size < PERF_ATTR_SIZE_VER0) where PAGE_SIZE has been chosen
as an arbitrary hard upper limit where it is believed that it will
never grow beyond that large limit in future.

So this is a looser constraint than what we currently do,
that is, -EINVAL on (size > sizeof(attr)) where attr is the
currently known size of a specific kernel. That would at least
be a start, you won't be able to cover everything though, but
it would allow to address the issue raised when running with
a basic feature set.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 18, 2014, 2:34 p.m. UTC | #8
On Wed, Sep 17, 2014 at 11:44 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 09/18/2014 01:45 AM, Alexei Starovoitov wrote:
>>
>> On Wed, Sep 17, 2014 at 12:37 PM, Alexei Starovoitov <ast@plumgrid.com>
>> wrote:
>>>
>>>
>>>> Hm, thinking out loudly ... perhaps this could be made a library
>>>> problem.
>>>> Such that the library which wraps the syscall needs to be aware of a
>>>> marker where the initial version ends, and if the application doesn't
>>>> make use of any of the new features, it would just pass in the length up
>>>> to the marker as size attribute into the syscall. Similarly, if new
>>>> features are always added to the end of a structure and the library
>>>> truncates the overall-length after the last used member we might have
>>>> a chance to load something on older kernels, haven't tried that though.
>>>
>>>
>>> that's a 3rd option. I think it's cleaner than 2nd, since it solves it
>>> completely from user space.
>>> It can even be smarter than that. If this syscall wrapper library
>>> sees that newer features are used and it can workaround them:
>>> it can chop size and pass older fields into the older kernel
>>> and when it returns, do a workaround based on newer fields.
>>
>>
>> the more I think about 'new user space + old kernel' problem,
>> the more certain I am that kernel should not try to help
>> user space, since most of the time it's not going to be enough,
>> but additional code in kernel would need to be maintained.
>>
>> syscall commands and size of bpf_attr is the least of problems.
>> New map_type and prog_type will be added, new helper
>> functions will be available to programs.
>> One would think that md5 of uapi/linux/bpf.h would be
>> enough to say that user app is compatible... In reality,
>> it's not. The 'state pruning' verifier optimization I've talked
>> about will not change a single bit in bpf.h, but it will be
>> able to recognize more programs as safe.
>> A program developed on a new kernel with more
>> advanced verifier will load just fine on new kernel, but
>> this valid program will not load on old kernel, only because
>> verifier is not smart enough. Now we would need a version
>> of verifier exposed all the way to user space?!
>> imo that's too much. I think for eBPF infra kernel
>> should only guarantee backwards compatibility
>> (old user space must work with new kernel) and that's it.
>> That's what this patch is trying to do.
>> Thoughts?
>
>
> Sure, you will never get a full compatibility on that regard
> while backwards compatibility needs to be guaranteed on the
> other hand. I looked at perf_copy_attr() implementation and I
> think that we should mimic it in a very similar way as it
> exactly solves what we need.
>
> For example, it will return with -EINVAL for (size > PAGE_SIZE)
> and (size < PERF_ATTR_SIZE_VER0) where PAGE_SIZE has been chosen
> as an arbitrary hard upper limit where it is believed that it will
> never grow beyond that large limit in future.
>
> So this is a more loose constraint than what we currently do,
> that is, -EINVAL on (size > sizeof(attr)) where attr is the
> currently known size of a specific kernel. That would at least
> be a start, you won't be able to cover everything though, but
> it would allow to address the issue raised when running with
> a basic feature set.

you missed my point. We should not 'do a start', since it
doesn't help user space in the long run and only makes
kernel more complex.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 18, 2014, 2:50 p.m. UTC | #9
On 09/18/2014 04:34 PM, Alexei Starovoitov wrote:
> On Wed, Sep 17, 2014 at 11:44 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
...
>> Sure, you will never get a full compatibility on that regard
>> while backwards compatibility needs to be guaranteed on the
>> other hand. I looked at perf_copy_attr() implementation and I
>> think that we should mimic it in a very similar way as it
>> exactly solves what we need.
>>
>> For example, it will return with -EINVAL for (size > PAGE_SIZE)
>> and (size < PERF_ATTR_SIZE_VER0) where PAGE_SIZE has been chosen
>> as an arbitrary hard upper limit where it is believed that it will
>> never grow beyond that large limit in future.
>>
>> So this is a more loose constraint than what we currently do,
>> that is, -EINVAL on (size > sizeof(attr)) where attr is the
>> currently known size of a specific kernel. That would at least
>> be a start, you won't be able to cover everything though, but
>> it would allow to address the issue raised when running with
>> a basic feature set.
>
> you missed my point. We should not 'do a start', since it
> doesn't help user space in the long run and only makes
> kernel more complex.

Sorry, I don't think I missed your point. But if you see things
differently, fair enough, it was just a suggestion.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Sept. 18, 2014, 3:24 p.m. UTC | #10
On Thu, Sep 18, 2014 at 7:50 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 09/18/2014 04:34 PM, Alexei Starovoitov wrote:
>>
>> On Wed, Sep 17, 2014 at 11:44 PM, Daniel Borkmann <dborkman@redhat.com>
>> wrote:
>
> ...
>>>
>>> Sure, you will never get a full compatibility on that regard
>>> while backwards compatibility needs to be guaranteed on the
>>> other hand. I looked at perf_copy_attr() implementation and I
>>> think that we should mimic it in a very similar way as it
>>> exactly solves what we need.
>>>
>>> For example, it will return with -EINVAL for (size > PAGE_SIZE)
>>> and (size < PERF_ATTR_SIZE_VER0) where PAGE_SIZE has been chosen
>>> as an arbitrary hard upper limit where it is believed that it will
>>> never grow beyond that large limit in future.
>>>
>>> So this is a more loose constraint than what we currently do,
>>> that is, -EINVAL on (size > sizeof(attr)) where attr is the
>>> currently known size of a specific kernel. That would at least
>>> be a start, you won't be able to cover everything though, but
>>> it would allow to address the issue raised when running with
>>> a basic feature set.
>>
>>
>> you missed my point. We should not 'do a start', since it
>> doesn't help user space in the long run and only makes
>> kernel more complex.
>
>
> Sorry, I don't think I missed your point. But if you see things
> differently, fair enough, it was just a suggestion.

now you probably think I'm shutting you up. Sorry. Was not
my intention. Let me rephrase what I meant:
I think we should decide right now whether
'new user + old kernel' is really a problem we're going to
solve or not. If we decide to solve it, we need to have
a plan to solve it all the way. Partial fix for size of bpf_attr
is not a plan. It's something that is not addressing the problem
completely. A little bit of help is not useful for userspace. It
would need to deal with new types, verifier differences and
other things that I mentioned earlier. So we either decide
that we're going to spend time to solve all of them (not
necessarily today, but over the long haul) or we're not doing
any of it.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 18, 2014, 5:28 p.m. UTC | #11
On 09/18/2014 05:24 PM, Alexei Starovoitov wrote:
...
> solve or not. If we decide to solve it, we need to have
> a plan to solve it all the way. Partial fix for size of bpf_attr
> is not a plan. It's something that is not addressing the problem
> completely. Little bit of help is not useful for userspace. It
> would need to deal with new types, verifier differences and
> other things that I mentioned earlier.

Hm, I don't think it would be a strict requirement to solve it
all the way, and I think that perf_event_open() with perf_copy_attr()
is not trying to do so either. It, however, is trying on a ``best
effort basis'' to still load something if new features are unused
by the binary (I guess you saw the comment in perf_copy_attr()).

If, e.g. due to new types, we fail at the verifier stage, sure,
that's life, since we only have a backwards-compatibility guarantee,
but in case we tried to use features we support, we're still able
to load the eBPF program while right now, we're rejecting it right
up-front. That's just my $0.02 ...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 424f442016e7..31b0ac208a52 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -138,6 +138,9 @@  union bpf_attr {
 		__u32		insn_cnt;
 		__aligned_u64	insns;
 		__aligned_u64	license;
+		__u32		log_level;	/* verbosity level of verifier */
+		__u32		log_size;	/* size of user buffer */
+		__aligned_u64	log_buf;	/* user supplied buffer */
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 67b5e29f183e..c7be7163bd11 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -458,7 +458,7 @@  struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD license
+#define	BPF_PROG_LOAD_LAST_FIELD log_buf
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6f9c3d6b4d7..871edc1f2e1f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,9 +125,244 @@ 
  * are set to NOT_INIT to indicate that they are no longer readable.
  */
 
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+	va_list args;
+
+	if (log_level == 0 || log_len >= log_size - 1)
+		return;
+
+	va_start(args, fmt);
+	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+	va_end(args);
+}
+
+static const char *const bpf_class_string[] = {
+	[BPF_LD]    = "ld",
+	[BPF_LDX]   = "ldx",
+	[BPF_ST]    = "st",
+	[BPF_STX]   = "stx",
+	[BPF_ALU]   = "alu",
+	[BPF_JMP]   = "jmp",
+	[BPF_RET]   = "BUG",
+	[BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+	[BPF_ADD >> 4]  = "+=",
+	[BPF_SUB >> 4]  = "-=",
+	[BPF_MUL >> 4]  = "*=",
+	[BPF_DIV >> 4]  = "/=",
+	[BPF_OR  >> 4]  = "|=",
+	[BPF_AND >> 4]  = "&=",
+	[BPF_LSH >> 4]  = "<<=",
+	[BPF_RSH >> 4]  = ">>=",
+	[BPF_NEG >> 4]  = "neg",
+	[BPF_MOD >> 4]  = "%=",
+	[BPF_XOR >> 4]  = "^=",
+	[BPF_MOV >> 4]  = "=",
+	[BPF_ARSH >> 4] = "s>>=",
+	[BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+	[BPF_W >> 3]  = "u32",
+	[BPF_H >> 3]  = "u16",
+	[BPF_B >> 3]  = "u8",
+	[BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+	[BPF_JA >> 4]   = "jmp",
+	[BPF_JEQ >> 4]  = "==",
+	[BPF_JGT >> 4]  = ">",
+	[BPF_JGE >> 4]  = ">=",
+	[BPF_JSET >> 4] = "&",
+	[BPF_JNE >> 4]  = "!=",
+	[BPF_JSGT >> 4] = "s>",
+	[BPF_JSGE >> 4] = "s>=",
+	[BPF_CALL >> 4] = "call",
+	[BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_SRC(insn->code) == BPF_X)
+			verbose("(%02x) %sr%d %s %sr%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->src_reg);
+		else
+			verbose("(%02x) %sr%d %s %s%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->imm);
+	} else if (class == BPF_STX) {
+		if (BPF_MODE(insn->code) == BPF_MEM)
+			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->src_reg);
+		else if (BPF_MODE(insn->code) == BPF_XADD)
+			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		else
+			verbose("BUG_%02x\n", insn->code);
+	} else if (class == BPF_ST) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose("BUG_st_%02x\n", insn->code);
+			return;
+		}
+		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+			insn->code,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->dst_reg,
+			insn->off, insn->imm);
+	} else if (class == BPF_LDX) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose("BUG_ldx_%02x\n", insn->code);
+			return;
+		}
+		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+			insn->code, insn->dst_reg,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->src_reg, insn->off);
+	} else if (class == BPF_LD) {
+		if (BPF_MODE(insn->code) == BPF_ABS) {
+			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IND) {
+			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->src_reg, insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IMM) {
+			verbose("(%02x) r%d = 0x%x\n",
+				insn->code, insn->dst_reg, insn->imm);
+		} else {
+			verbose("BUG_ld_%02x\n", insn->code);
+			return;
+		}
+	} else if (class == BPF_JMP) {
+		u8 opcode = BPF_OP(insn->code);
+
+		if (opcode == BPF_CALL) {
+			verbose("(%02x) call %d\n", insn->code, insn->imm);
+		} else if (insn->code == (BPF_JMP | BPF_JA)) {
+			verbose("(%02x) goto pc%+d\n",
+				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+			verbose("(%02x) exit\n", insn->code);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->src_reg, insn->off);
+		} else {
+			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->imm, insn->off);
+		}
+	} else {
+		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+	}
+}
+
 int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 {
+	char __user *log_ubuf = NULL;
+	struct verifier_env *env;
 	int ret = -EINVAL;
 
+	if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+		return -E2BIG;
+
+	/* 'struct verifier_env' can be global, but since it's not small,
+	 * allocate/free it every time bpf_check() is called
+	 */
+	env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+	if (!env)
+		return -ENOMEM;
+
+	/* grab the mutex to protect few globals used by verifier */
+	mutex_lock(&bpf_verifier_lock);
+
+	if (attr->log_level || attr->log_buf || attr->log_size) {
+		/* user requested verbose verifier output
+		 * and supplied buffer to store the verification trace
+		 */
+		log_level = attr->log_level;
+		log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+		log_size = attr->log_size;
+		log_len = 0;
+
+		ret = -EINVAL;
+		/* log_* values have to be sane */
+		if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+		    log_level == 0 || log_ubuf == NULL)
+			goto free_env;
+
+		ret = -ENOMEM;
+		log_buf = vmalloc(log_size);
+		if (!log_buf)
+			goto free_env;
+	} else {
+		log_level = 0;
+	}
+
+	/* ret = do_check(env); */
+
+	if (log_level && log_len >= log_size - 1) {
+		BUG_ON(log_len >= log_size);
+		/* verifier log exceeded user supplied buffer */
+		ret = -ENOSPC;
+		/* fall through to return what was recorded */
+	}
+
+	/* copy verifier log back to user space including trailing zero */
+	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+		ret = -EFAULT;
+		goto free_log_buf;
+	}
+
+
+free_log_buf:
+	if (log_level)
+		vfree(log_buf);
+free_env:
+	kfree(env);
+	mutex_unlock(&bpf_verifier_lock);
 	return ret;
 }