diff mbox

[v5,net-next,2/3,RFC] seccomp: convert seccomp to use extended BPF

Message ID 1393971437-4129-3-git-send-email-ast@plumgrid.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Alexei Starovoitov March 4, 2014, 10:17 p.m. UTC
use sk_convert_filter() to convert seccomp BPF into extended BPF

05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
  seccomp_rule_add_exact(ctx,...
  seccomp_rule_add_exact(ctx,...
  rc = seccomp_load(ctx);
  for (i = 0; i < 10000000; i++)
     syscall(199, 100);

--x86_64--
old BPF: 8.6 seconds
    73.38%    bench  [kernel.kallsyms]  [k] sk_run_filter
    10.70%    bench  libc-2.15.so       [.] syscall
     5.09%    bench  [kernel.kallsyms]  [k] seccomp_bpf_load
     1.97%    bench  [kernel.kallsyms]  [k] system_call
ext BPF: 5.7 seconds
    66.20%    bench  [kernel.kallsyms]  [k] sk_run_filter_ext
    16.75%    bench  libc-2.15.so       [.] syscall
     3.31%    bench  [kernel.kallsyms]  [k] system_call
     2.88%    bench  [kernel.kallsyms]  [k] __secure_computing

--i386---
old BPF: 5.4 sec
ext BPF: 3.8 sec

BPF filters generated by seccomp are very branchy, so ext BPF
performance is better than old BPF.

Performance gains will be even higher when extended BPF JIT
is committed.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>

---
This patch is an RFC to use extended BPF in seccomp.
Change it to do it conditionally with bpf_ext_enable knob ?
---
 include/linux/seccomp.h |    1 -
 kernel/seccomp.c        |  112 +++++++++++++++++++++--------------------------
 net/core/filter.c       |    5 ---
 3 files changed, 51 insertions(+), 67 deletions(-)

Comments

Alexei Starovoitov March 5, 2014, 3:11 a.m. UTC | #1
On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> use sk_convert_filter() to convert seccomp BPF into extended BPF
>
> 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
>   seccomp_rule_add_exact(ctx,...
>   seccomp_rule_add_exact(ctx,...
>   rc = seccomp_load(ctx);
>   for (i = 0; i < 10000000; i++)
>      syscall(199, 100);
>
> --x86_64--
> old BPF: 8.6 seconds
>     73.38%    bench  [kernel.kallsyms]  [k] sk_run_filter
>     10.70%    bench  libc-2.15.so       [.] syscall
>      5.09%    bench  [kernel.kallsyms]  [k] seccomp_bpf_load
>      1.97%    bench  [kernel.kallsyms]  [k] system_call
> ext BPF: 5.7 seconds
>     66.20%    bench  [kernel.kallsyms]  [k] sk_run_filter_ext
>     16.75%    bench  libc-2.15.so       [.] syscall
>      3.31%    bench  [kernel.kallsyms]  [k] system_call
>      2.88%    bench  [kernel.kallsyms]  [k] __secure_computing
>
> --i386---
> old BPF: 5.4 sec
> ext BPF: 3.8 sec
>
> BPF filters generated by seccomp are very branchy, so ext BPF
> performance is better than old BPF.
>
> Performance gains will be even higher when extended BPF JIT
> is committed.
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
>
> ---
> This patch is an RFC to use extended BPF in seccomp.
> Change it to do it conditionally with bpf_ext_enable knob ?

Kees, Will,

I've played with libseccomp to test this patch and just found
your other testsuite for bpf+seccomp:
https://github.com/redpig/seccomp
It passes as well on x86_64 and i386.

Not sure how real all 'seccomp' and libseccomp' bpf filters.
Some of them are very short, some very long.
If average number of filtered syscalls is > 10, then upcoming
'ebpf table' will give another performance boost, since single table
lookup will be faster than long sequence of 'if' conditions.

Please review.

Thanks
Alexei

> ---
>  include/linux/seccomp.h |    1 -
>  kernel/seccomp.c        |  112 +++++++++++++++++++++--------------------------
>  net/core/filter.c       |    5 ---
>  3 files changed, 51 insertions(+), 67 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 6f19cfd1840e..4054b0994071 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
>  #ifdef CONFIG_SECCOMP_FILTER
>  extern void put_seccomp_filter(struct task_struct *tsk);
>  extern void get_seccomp_filter(struct task_struct *tsk);
> -extern u32 seccomp_bpf_load(int off);
>  #else  /* CONFIG_SECCOMP_FILTER */
>  static inline void put_seccomp_filter(struct task_struct *tsk)
>  {
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index b7a10048a32c..346c597b44cc 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -55,60 +55,27 @@ struct seccomp_filter {
>         atomic_t usage;
>         struct seccomp_filter *prev;
>         unsigned short len;  /* Instruction count */
> -       struct sock_filter insns[];
> +       struct sock_filter_ext insns[];
>  };
>
>  /* Limit any path through the tree to 256KB worth of instructions. */
>  #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>
> -/**
> - * get_u32 - returns a u32 offset into data
> - * @data: a unsigned 64 bit value
> - * @index: 0 or 1 to return the first or second 32-bits
> - *
> - * This inline exists to hide the length of unsigned long.  If a 32-bit
> - * unsigned long is passed in, it will be extended and the top 32-bits will be
> - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
> - * properly returned.
> - *
> +/*
>   * Endianness is explicitly ignored and left for BPF program authors to manage
>   * as per the specific architecture.
>   */
> -static inline u32 get_u32(u64 data, int index)
> -{
> -       return ((u32 *)&data)[index];
> -}
> -
> -/* Helper for bpf_load below. */
> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
> -/**
> - * bpf_load: checks and returns a pointer to the requested offset
> - * @off: offset into struct seccomp_data to load from
> - *
> - * Returns the requested 32-bits of data.
> - * seccomp_check_filter() should assure that @off is 32-bit aligned
> - * and not out of bounds.  Failure to do so is a BUG.
> - */
> -u32 seccomp_bpf_load(int off)
> +static void populate_seccomp_data(struct seccomp_data *sd)
>  {
>         struct pt_regs *regs = task_pt_regs(current);
> -       if (off == BPF_DATA(nr))
> -               return syscall_get_nr(current, regs);
> -       if (off == BPF_DATA(arch))
> -               return syscall_get_arch(current, regs);
> -       if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
> -               unsigned long value;
> -               int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
> -               int index = !!(off % sizeof(u64));
> -               syscall_get_arguments(current, regs, arg, 1, &value);
> -               return get_u32(value, index);
> -       }
> -       if (off == BPF_DATA(instruction_pointer))
> -               return get_u32(KSTK_EIP(current), 0);
> -       if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
> -               return get_u32(KSTK_EIP(current), 1);
> -       /* seccomp_check_filter should make this impossible. */
> -       BUG();
> +       int i;
> +
> +       sd->nr = syscall_get_nr(current, regs);
> +       sd->arch = syscall_get_arch(current, regs);
> +       for (i = 0; i < 6; i++)
> +               syscall_get_arguments(current, regs, i, 1,
> +                                     (unsigned long *)&sd->args[i]);
> +       sd->instruction_pointer = KSTK_EIP(current);
>  }
>
>  /**
> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>
>                 switch (code) {
>                 case BPF_S_LD_W_ABS:
> -                       ftest->code = BPF_S_ANC_SECCOMP_LD_W;
> +                       ftest->code = BPF_LDX | BPF_W | BPF_ABS;
>                         /* 32-bit aligned and not out of bounds. */
>                         if (k >= sizeof(struct seccomp_data) || k & 3)
>                                 return -EINVAL;
>                         continue;
>                 case BPF_S_LD_W_LEN:
> -                       ftest->code = BPF_S_LD_IMM;
> +                       ftest->code = BPF_LD | BPF_IMM;
>                         ftest->k = sizeof(struct seccomp_data);
>                         continue;
>                 case BPF_S_LDX_W_LEN:
> -                       ftest->code = BPF_S_LDX_IMM;
> +                       ftest->code = BPF_LDX | BPF_IMM;
>                         ftest->k = sizeof(struct seccomp_data);
>                         continue;
>                 /* Explicitly include allowed calls. */
> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>                 case BPF_S_JMP_JGT_X:
>                 case BPF_S_JMP_JSET_K:
>                 case BPF_S_JMP_JSET_X:
> +                       sk_decode_filter(ftest, ftest);
>                         continue;
>                 default:
>                         return -EINVAL;
> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>  static u32 seccomp_run_filters(int syscall)
>  {
>         struct seccomp_filter *f;
> +       struct seccomp_data sd;
>         u32 ret = SECCOMP_RET_ALLOW;
>
>         /* Ensure unexpected behavior doesn't result in failing open. */
>         if (WARN_ON(current->seccomp.filter == NULL))
>                 return SECCOMP_RET_KILL;
>
> +       populate_seccomp_data(&sd);
> +
>         /*
>          * All filters in the list are evaluated and the lowest BPF return
>          * value always takes priority (ignoring the DATA).
>          */
>         for (f = current->seccomp.filter; f; f = f->prev) {
> -               u32 cur_ret = sk_run_filter(NULL, f->insns);
> +               u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
>                 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>                         ret = cur_ret;
>         }
> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>         struct seccomp_filter *filter;
>         unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
>         unsigned long total_insns = fprog->len;
> +       struct sock_filter *fp;
> +       int new_len;
>         long ret;
>
>         if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>                                      CAP_SYS_ADMIN) != 0)
>                 return -EACCES;
>
> -       /* Allocate a new seccomp_filter */
> -       filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
> -                        GFP_KERNEL|__GFP_NOWARN);
> -       if (!filter)
> +       fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
> +       if (!fp)
>                 return -ENOMEM;
> -       atomic_set(&filter->usage, 1);
> -       filter->len = fprog->len;
>
>         /* Copy the instructions from fprog. */
>         ret = -EFAULT;
> -       if (copy_from_user(filter->insns, fprog->filter, fp_size))
> -               goto fail;
> +       if (copy_from_user(fp, fprog->filter, fp_size))
> +               goto free_prog;
>
>         /* Check and rewrite the fprog via the skb checker */
> -       ret = sk_chk_filter(filter->insns, filter->len);
> +       ret = sk_chk_filter(fp, fprog->len);
>         if (ret)
> -               goto fail;
> +               goto free_prog;
>
>         /* Check and rewrite the fprog for seccomp use */
> -       ret = seccomp_check_filter(filter->insns, filter->len);
> +       ret = seccomp_check_filter(fp, fprog->len);
>         if (ret)
> -               goto fail;
> +               goto free_prog;
> +
> +       /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
> +       ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
> +       if (ret)
> +               goto free_prog;
> +
> +       /* Allocate a new seccomp_filter */
> +       filter = kzalloc(sizeof(struct seccomp_filter) +
> +                        sizeof(struct sock_filter_ext) * new_len,
> +                        GFP_KERNEL|__GFP_NOWARN);
> +       if (!filter)
> +               goto free_prog;
> +
> +       ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);
> +       if (ret)
> +               goto free_filter;
> +       atomic_set(&filter->usage, 1);
> +       filter->len = new_len;
>
>         /*
>          * If there is an existing filter, make it the prev and don't drop its
> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>         filter->prev = current->seccomp.filter;
>         current->seccomp.filter = filter;
>         return 0;
> -fail:
> +
> +free_filter:
>         kfree(filter);
> +free_prog:
> +       kfree(fp);
>         return ret;
>  }
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1494421486b7..f144a6a7d939 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -388,11 +388,6 @@ load_b:
>                                 A = 0;
>                         continue;
>                 }
> -#ifdef CONFIG_SECCOMP_FILTER
> -               case BPF_S_ANC_SECCOMP_LD_W:
> -                       A = seccomp_bpf_load(fentry->k);
> -                       continue;
> -#endif
>                 default:
>                         WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
>                                        fentry->code, fentry->jt,
> --
> 1.7.9.5
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kees Cook March 5, 2014, 9:42 p.m. UTC | #2
On Tue, Mar 4, 2014 at 7:11 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> use sk_convert_filter() to convert seccomp BPF into extended BPF
>>
>> 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
>>   seccomp_rule_add_exact(ctx,...
>>   seccomp_rule_add_exact(ctx,...
>>   rc = seccomp_load(ctx);
>>   for (i = 0; i < 10000000; i++)
>>      syscall(199, 100);
>>
>> --x86_64--
>> old BPF: 8.6 seconds
>>     73.38%    bench  [kernel.kallsyms]  [k] sk_run_filter
>>     10.70%    bench  libc-2.15.so       [.] syscall
>>      5.09%    bench  [kernel.kallsyms]  [k] seccomp_bpf_load
>>      1.97%    bench  [kernel.kallsyms]  [k] system_call
>> ext BPF: 5.7 seconds
>>     66.20%    bench  [kernel.kallsyms]  [k] sk_run_filter_ext
>>     16.75%    bench  libc-2.15.so       [.] syscall
>>      3.31%    bench  [kernel.kallsyms]  [k] system_call
>>      2.88%    bench  [kernel.kallsyms]  [k] __secure_computing
>>
>> --i386---
>> old BPF: 5.4 sec
>> ext BPF: 3.8 sec
>>
>> BPF filters generated by seccomp are very branchy, so ext BPF
>> performance is better than old BPF.
>>
>> Performance gains will be even higher when extended BPF JIT
>> is committed.

Very cool! Have you had a chance to compare on ARM too?

>> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
>>
>> ---
>> This patch is an RFC to use extended BPF in seccomp.
>> Change it to do it conditionally with bpf_ext_enable knob ?
>
> Kees, Will,
>
> I've played with libseccomp to test this patch and just found
> your other testsuite for bpf+seccomp:
> https://github.com/redpig/seccomp
> It passes as well on x86_64 and i386.

Great! If it's passing those tests, then things should be in good shape.

> Not sure how real all 'seccomp' and libseccomp' bpf filters.
> Some of them are very short, some very long.
> If average number of filtered syscalls is > 10, then upcoming
> 'ebpf table' will give another performance boost, since single table
> lookup will be faster than long sequence of 'if' conditions.

To take advantage of that, will seccomp need a new (prctl) interface
to pass in a seccomp ebpf?

> Please review.
>
> Thanks
> Alexei
>
>> ---
>>  include/linux/seccomp.h |    1 -
>>  kernel/seccomp.c        |  112 +++++++++++++++++++++--------------------------
>>  net/core/filter.c       |    5 ---
>>  3 files changed, 51 insertions(+), 67 deletions(-)
>>
>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> index 6f19cfd1840e..4054b0994071 100644
>> --- a/include/linux/seccomp.h
>> +++ b/include/linux/seccomp.h
>> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
>>  #ifdef CONFIG_SECCOMP_FILTER
>>  extern void put_seccomp_filter(struct task_struct *tsk);
>>  extern void get_seccomp_filter(struct task_struct *tsk);
>> -extern u32 seccomp_bpf_load(int off);
>>  #else  /* CONFIG_SECCOMP_FILTER */
>>  static inline void put_seccomp_filter(struct task_struct *tsk)
>>  {
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index b7a10048a32c..346c597b44cc 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -55,60 +55,27 @@ struct seccomp_filter {
>>         atomic_t usage;
>>         struct seccomp_filter *prev;
>>         unsigned short len;  /* Instruction count */
>> -       struct sock_filter insns[];
>> +       struct sock_filter_ext insns[];
>>  };
>>
>>  /* Limit any path through the tree to 256KB worth of instructions. */
>>  #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>>
>> -/**
>> - * get_u32 - returns a u32 offset into data
>> - * @data: a unsigned 64 bit value
>> - * @index: 0 or 1 to return the first or second 32-bits
>> - *
>> - * This inline exists to hide the length of unsigned long.  If a 32-bit
>> - * unsigned long is passed in, it will be extended and the top 32-bits will be
>> - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
>> - * properly returned.
>> - *
>> +/*
>>   * Endianness is explicitly ignored and left for BPF program authors to manage
>>   * as per the specific architecture.
>>   */
>> -static inline u32 get_u32(u64 data, int index)
>> -{
>> -       return ((u32 *)&data)[index];
>> -}
>> -
>> -/* Helper for bpf_load below. */
>> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
>> -/**
>> - * bpf_load: checks and returns a pointer to the requested offset
>> - * @off: offset into struct seccomp_data to load from
>> - *
>> - * Returns the requested 32-bits of data.
>> - * seccomp_check_filter() should assure that @off is 32-bit aligned
>> - * and not out of bounds.  Failure to do so is a BUG.
>> - */
>> -u32 seccomp_bpf_load(int off)
>> +static void populate_seccomp_data(struct seccomp_data *sd)
>>  {
>>         struct pt_regs *regs = task_pt_regs(current);
>> -       if (off == BPF_DATA(nr))
>> -               return syscall_get_nr(current, regs);
>> -       if (off == BPF_DATA(arch))
>> -               return syscall_get_arch(current, regs);
>> -       if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
>> -               unsigned long value;
>> -               int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
>> -               int index = !!(off % sizeof(u64));
>> -               syscall_get_arguments(current, regs, arg, 1, &value);
>> -               return get_u32(value, index);
>> -       }
>> -       if (off == BPF_DATA(instruction_pointer))
>> -               return get_u32(KSTK_EIP(current), 0);
>> -       if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
>> -               return get_u32(KSTK_EIP(current), 1);
>> -       /* seccomp_check_filter should make this impossible. */
>> -       BUG();
>> +       int i;
>> +
>> +       sd->nr = syscall_get_nr(current, regs);
>> +       sd->arch = syscall_get_arch(current, regs);
>> +       for (i = 0; i < 6; i++)
>> +               syscall_get_arguments(current, regs, i, 1,
>> +                                     (unsigned long *)&sd->args[i]);
>> +       sd->instruction_pointer = KSTK_EIP(current);
>>  }
>>
>>  /**
>> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>
>>                 switch (code) {
>>                 case BPF_S_LD_W_ABS:
>> -                       ftest->code = BPF_S_ANC_SECCOMP_LD_W;
>> +                       ftest->code = BPF_LDX | BPF_W | BPF_ABS;

So, with this change, the removal of BPF_S_ANC_SECCOMP_LD_W, and the
unconditional use of populate_seccomp_data(), I'm surprised there
isn't a larger hit on small filters. Currently, seccomp only loads
what it needs into the filter (via BPF_S_ANC_SECCOMP_LD_W + offset via
seccomp_bpf_load). Your benchmarks seem to show that this isn't a
problem, though. Is the ebpf gain just that much larger than the
additional unconditional loads happening in populate_seccomp_data()?

>>                         /* 32-bit aligned and not out of bounds. */
>>                         if (k >= sizeof(struct seccomp_data) || k & 3)
>>                                 return -EINVAL;
>>                         continue;
>>                 case BPF_S_LD_W_LEN:
>> -                       ftest->code = BPF_S_LD_IMM;
>> +                       ftest->code = BPF_LD | BPF_IMM;
>>                         ftest->k = sizeof(struct seccomp_data);
>>                         continue;
>>                 case BPF_S_LDX_W_LEN:
>> -                       ftest->code = BPF_S_LDX_IMM;
>> +                       ftest->code = BPF_LDX | BPF_IMM;
>>                         ftest->k = sizeof(struct seccomp_data);
>>                         continue;
>>                 /* Explicitly include allowed calls. */
>> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>                 case BPF_S_JMP_JGT_X:
>>                 case BPF_S_JMP_JSET_K:
>>                 case BPF_S_JMP_JSET_X:
>> +                       sk_decode_filter(ftest, ftest);
>>                         continue;
>>                 default:
>>                         return -EINVAL;
>> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>  static u32 seccomp_run_filters(int syscall)
>>  {
>>         struct seccomp_filter *f;
>> +       struct seccomp_data sd;
>>         u32 ret = SECCOMP_RET_ALLOW;
>>
>>         /* Ensure unexpected behavior doesn't result in failing open. */
>>         if (WARN_ON(current->seccomp.filter == NULL))
>>                 return SECCOMP_RET_KILL;
>>
>> +       populate_seccomp_data(&sd);
>> +
>>         /*
>>          * All filters in the list are evaluated and the lowest BPF return
>>          * value always takes priority (ignoring the DATA).
>>          */
>>         for (f = current->seccomp.filter; f; f = f->prev) {
>> -               u32 cur_ret = sk_run_filter(NULL, f->insns);
>> +               u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
>>                 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>>                         ret = cur_ret;
>>         }
>> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>>         struct seccomp_filter *filter;
>>         unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
>>         unsigned long total_insns = fprog->len;
>> +       struct sock_filter *fp;
>> +       int new_len;
>>         long ret;
>>
>>         if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
>> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>>                                      CAP_SYS_ADMIN) != 0)
>>                 return -EACCES;
>>
>> -       /* Allocate a new seccomp_filter */
>> -       filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
>> -                        GFP_KERNEL|__GFP_NOWARN);
>> -       if (!filter)
>> +       fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
>> +       if (!fp)
>>                 return -ENOMEM;
>> -       atomic_set(&filter->usage, 1);
>> -       filter->len = fprog->len;
>>
>>         /* Copy the instructions from fprog. */
>>         ret = -EFAULT;
>> -       if (copy_from_user(filter->insns, fprog->filter, fp_size))
>> -               goto fail;
>> +       if (copy_from_user(fp, fprog->filter, fp_size))
>> +               goto free_prog;
>>
>>         /* Check and rewrite the fprog via the skb checker */
>> -       ret = sk_chk_filter(filter->insns, filter->len);
>> +       ret = sk_chk_filter(fp, fprog->len);
>>         if (ret)
>> -               goto fail;
>> +               goto free_prog;
>>
>>         /* Check and rewrite the fprog for seccomp use */
>> -       ret = seccomp_check_filter(filter->insns, filter->len);
>> +       ret = seccomp_check_filter(fp, fprog->len);
>>         if (ret)
>> -               goto fail;
>> +               goto free_prog;
>> +
>> +       /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
>> +       ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
>> +       if (ret)
>> +               goto free_prog;
>> +
>> +       /* Allocate a new seccomp_filter */
>> +       filter = kzalloc(sizeof(struct seccomp_filter) +
>> +                        sizeof(struct sock_filter_ext) * new_len,
>> +                        GFP_KERNEL|__GFP_NOWARN);
>> +       if (!filter)
>> +               goto free_prog;
>> +
>> +       ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);

Seems like it'd be more readable to set filter->len = new_len first
and use &filter->len as the last argument here. I would find that more
readable, but if nothing else needs changing in the series, this is
fine to leave as-is.

>> +       if (ret)
>> +               goto free_filter;
>> +       atomic_set(&filter->usage, 1);
>> +       filter->len = new_len;
>>
>>         /*
>>          * If there is an existing filter, make it the prev and don't drop its
>> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>>         filter->prev = current->seccomp.filter;
>>         current->seccomp.filter = filter;
>>         return 0;
>> -fail:
>> +
>> +free_filter:
>>         kfree(filter);
>> +free_prog:
>> +       kfree(fp);
>>         return ret;
>>  }
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 1494421486b7..f144a6a7d939 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -388,11 +388,6 @@ load_b:
>>                                 A = 0;
>>                         continue;
>>                 }
>> -#ifdef CONFIG_SECCOMP_FILTER
>> -               case BPF_S_ANC_SECCOMP_LD_W:
>> -                       A = seccomp_bpf_load(fentry->k);
>> -                       continue;
>> -#endif
>>                 default:
>>                         WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
>>                                        fentry->code, fentry->jt,
>> --
>> 1.7.9.5
>>

Cool!

Reviewed-by: Kees Cook <keescook@chromium.org>

-Kees
Alexei Starovoitov March 6, 2014, 2 a.m. UTC | #3
On Wed, Mar 5, 2014 at 1:42 PM, Kees Cook <keescook@chromium.org> wrote:
> On Tue, Mar 4, 2014 at 7:11 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>>> use sk_convert_filter() to convert seccomp BPF into extended BPF
>>>
>>> 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
>>>   seccomp_rule_add_exact(ctx,...
>>>   seccomp_rule_add_exact(ctx,...
>>>   rc = seccomp_load(ctx);
>>>   for (i = 0; i < 10000000; i++)
>>>      syscall(199, 100);
>>>
>>> --x86_64--
>>> old BPF: 8.6 seconds
>>>     73.38%    bench  [kernel.kallsyms]  [k] sk_run_filter
>>>     10.70%    bench  libc-2.15.so       [.] syscall
>>>      5.09%    bench  [kernel.kallsyms]  [k] seccomp_bpf_load
>>>      1.97%    bench  [kernel.kallsyms]  [k] system_call
>>> ext BPF: 5.7 seconds
>>>     66.20%    bench  [kernel.kallsyms]  [k] sk_run_filter_ext
>>>     16.75%    bench  libc-2.15.so       [.] syscall
>>>      3.31%    bench  [kernel.kallsyms]  [k] system_call
>>>      2.88%    bench  [kernel.kallsyms]  [k] __secure_computing
>>>
>>> --i386---
>>> old BPF: 5.4 sec
>>> ext BPF: 3.8 sec
>>>
>>> BPF filters generated by seccomp are very branchy, so ext BPF
>>> performance is better than old BPF.
>>>
>>> Performance gains will be even higher when extended BPF JIT
>>> is committed.
>
> Very cool! Have you had a chance to compare on ARM too?

tried arm and was surprised to see 7% regression. grr
turned out gcc wasn't unrolling this loop:
       for (i = 0; i < 6; i++)
               syscall_get_arguments(current, regs, i, 1, ...);
which was causing memcpy() of 4 bytes to be called in the hot path
for my micro-benchmark.
So have to manually unroll it.

Performance is the following:
--arm32-- short filter
old BPF: 4.0 sec
 39.92%  bench  [kernel.kallsyms]  [k] vector_swi
 16.60%  bench  [kernel.kallsyms]  [k] sk_run_filter
 14.66%  bench  libc-2.17.so       [.] syscall
  5.42%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
  5.10%  bench  [kernel.kallsyms]  [k] __secure_computing
new BPF: 3.7 sec
 35.93%  bench  [kernel.kallsyms]  [k] vector_swi
 21.89%  bench  libc-2.17.so       [.] syscall
 13.45%  bench  [kernel.kallsyms]  [k] sk_run_filter_ext
  6.25%  bench  [kernel.kallsyms]  [k] __secure_computing
  3.96%  bench  [kernel.kallsyms]  [k] syscall_trace_exit

--arm32-- large filter
old BPF: 13.5 sec
 73.88%  bench  [kernel.kallsyms]  [k] sk_run_filter
 10.29%  bench  [kernel.kallsyms]  [k] vector_swi
  6.46%  bench  libc-2.17.so       [.] syscall
  2.94%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
  1.19%  bench  [kernel.kallsyms]  [k] __secure_computing
  0.87%  bench  [kernel.kallsyms]  [k] sys_getuid
new BPF: 13.5 sec
 76.08%  bench  [kernel.kallsyms]  [k] sk_run_filter_ext
 10.98%  bench  [kernel.kallsyms]  [k] vector_swi
  5.87%  bench  libc-2.17.so       [.] syscall
  1.77%  bench  [kernel.kallsyms]  [k] __secure_computing
  0.93%  bench  [kernel.kallsyms]  [k] sys_getuid

So will resend the V6 for this patch set.

>
>>> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
>>>
>>> ---
>>> This patch is an RFC to use extended BPF in seccomp.
>>> Change it to do it conditionally with bpf_ext_enable knob ?
>>
>> Kees, Will,
>>
>> I've played with libseccomp to test this patch and just found
>> your other testsuite for bpf+seccomp:
>> https://github.com/redpig/seccomp
>> It passes as well on x86_64 and i386.
>
> Great! If it's passing those tests, then things should be in good shape.
>
>> Not sure how real all 'seccomp' and libseccomp' bpf filters.
>> Some of them are very short, some very long.
>> If average number of filtered syscalls is > 10, then upcoming
>> 'ebpf table' will give another performance boost, since single table
>> lookup will be faster than long sequence of 'if' conditions.
>
> To take advantage of that, will seccomp need a new (prctl) interface
> to pass in a seccomp ebpf?

I hope not.
I think it's better to have inband signaling for ext vs old program.
Like 'struct sock_fprog -> len' must be < 4096 for old programs.
if len == 4096, this can mean that this is extended bpf program in a new format.
This way prctl() for seccomp and socket attach mechanisms can stay the same.

>> Please review.
>>
>> Thanks
>> Alexei
>>
>>> ---
>>>  include/linux/seccomp.h |    1 -
>>>  kernel/seccomp.c        |  112 +++++++++++++++++++++--------------------------
>>>  net/core/filter.c       |    5 ---
>>>  3 files changed, 51 insertions(+), 67 deletions(-)
>>>
>>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>>> index 6f19cfd1840e..4054b0994071 100644
>>> --- a/include/linux/seccomp.h
>>> +++ b/include/linux/seccomp.h
>>> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
>>>  #ifdef CONFIG_SECCOMP_FILTER
>>>  extern void put_seccomp_filter(struct task_struct *tsk);
>>>  extern void get_seccomp_filter(struct task_struct *tsk);
>>> -extern u32 seccomp_bpf_load(int off);
>>>  #else  /* CONFIG_SECCOMP_FILTER */
>>>  static inline void put_seccomp_filter(struct task_struct *tsk)
>>>  {
>>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>>> index b7a10048a32c..346c597b44cc 100644
>>> --- a/kernel/seccomp.c
>>> +++ b/kernel/seccomp.c
>>> @@ -55,60 +55,27 @@ struct seccomp_filter {
>>>         atomic_t usage;
>>>         struct seccomp_filter *prev;
>>>         unsigned short len;  /* Instruction count */
>>> -       struct sock_filter insns[];
>>> +       struct sock_filter_ext insns[];
>>>  };
>>>
>>>  /* Limit any path through the tree to 256KB worth of instructions. */
>>>  #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>>>
>>> -/**
>>> - * get_u32 - returns a u32 offset into data
>>> - * @data: a unsigned 64 bit value
>>> - * @index: 0 or 1 to return the first or second 32-bits
>>> - *
>>> - * This inline exists to hide the length of unsigned long.  If a 32-bit
>>> - * unsigned long is passed in, it will be extended and the top 32-bits will be
>>> - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
>>> - * properly returned.
>>> - *
>>> +/*
>>>   * Endianness is explicitly ignored and left for BPF program authors to manage
>>>   * as per the specific architecture.
>>>   */
>>> -static inline u32 get_u32(u64 data, int index)
>>> -{
>>> -       return ((u32 *)&data)[index];
>>> -}
>>> -
>>> -/* Helper for bpf_load below. */
>>> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
>>> -/**
>>> - * bpf_load: checks and returns a pointer to the requested offset
>>> - * @off: offset into struct seccomp_data to load from
>>> - *
>>> - * Returns the requested 32-bits of data.
>>> - * seccomp_check_filter() should assure that @off is 32-bit aligned
>>> - * and not out of bounds.  Failure to do so is a BUG.
>>> - */
>>> -u32 seccomp_bpf_load(int off)
>>> +static void populate_seccomp_data(struct seccomp_data *sd)
>>>  {
>>>         struct pt_regs *regs = task_pt_regs(current);
>>> -       if (off == BPF_DATA(nr))
>>> -               return syscall_get_nr(current, regs);
>>> -       if (off == BPF_DATA(arch))
>>> -               return syscall_get_arch(current, regs);
>>> -       if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
>>> -               unsigned long value;
>>> -               int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
>>> -               int index = !!(off % sizeof(u64));
>>> -               syscall_get_arguments(current, regs, arg, 1, &value);
>>> -               return get_u32(value, index);
>>> -       }
>>> -       if (off == BPF_DATA(instruction_pointer))
>>> -               return get_u32(KSTK_EIP(current), 0);
>>> -       if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
>>> -               return get_u32(KSTK_EIP(current), 1);
>>> -       /* seccomp_check_filter should make this impossible. */
>>> -       BUG();
>>> +       int i;
>>> +
>>> +       sd->nr = syscall_get_nr(current, regs);
>>> +       sd->arch = syscall_get_arch(current, regs);
>>> +       for (i = 0; i < 6; i++)
>>> +               syscall_get_arguments(current, regs, i, 1,
>>> +                                     (unsigned long *)&sd->args[i]);
>>> +       sd->instruction_pointer = KSTK_EIP(current);
>>>  }
>>>
>>>  /**
>>> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>>
>>>                 switch (code) {
>>>                 case BPF_S_LD_W_ABS:
>>> -                       ftest->code = BPF_S_ANC_SECCOMP_LD_W;
>>> +                       ftest->code = BPF_LDX | BPF_W | BPF_ABS;
>
> So, with this change, the removal of BPF_S_ANC_SECCOMP_LD_W, and the
> unconditional use of populate_seccomp_data(), I'm surprised there
> isn't a larger hit on small filters. Currently, seccomp only loads
> what it needs into the filter (via BPF_S_ANC_SECCOMP_LD_W + offset via
> seccomp_bpf_load). Your benchmarks seem to show that this isn't a
> problem, though. Is the ebpf gain just that much larger than the
> additional unconditional loads happening in populate_seccomp_data()?

on arm there is a small gain for short programs.

for x86_64 there is also small gain:

--x86_64-- short filter
old BPF: 2.7 sec
 39.12%  bench  libc-2.15.so       [.] syscall
  8.10%  bench  [kernel.kallsyms]  [k] sk_run_filter
  6.31%  bench  [kernel.kallsyms]  [k] system_call
  5.59%  bench  [kernel.kallsyms]  [k] trace_hardirqs_on_caller
  4.37%  bench  [kernel.kallsyms]  [k] trace_hardirqs_off_caller
  3.70%  bench  [kernel.kallsyms]  [k] __secure_computing
  3.67%  bench  [kernel.kallsyms]  [k] lock_is_held
  3.03%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
new BPF: 2.49 sec
 42.05%  bench  libc-2.15.so       [.] syscall
  6.91%  bench  [kernel.kallsyms]  [k] system_call
  6.25%  bench  [kernel.kallsyms]  [k] trace_hardirqs_on_caller
  6.07%  bench  [kernel.kallsyms]  [k] __secure_computing
  5.08%  bench  [kernel.kallsyms]  [k] sk_run_filter_ext

obviously most of the time is spent in 'syscal' instruction in
syscall() function
but having populate_seccomp_data() is actually beneficial,
since 'current' and 'pt_regs' are hot in cache anyway,
so copying them into seccomp_data is cheaper than extra 'if' conditions
and extra calls from within the filter.
faster interpreter helps too.

>>>                         /* 32-bit aligned and not out of bounds. */
>>>                         if (k >= sizeof(struct seccomp_data) || k & 3)
>>>                                 return -EINVAL;
>>>                         continue;
>>>                 case BPF_S_LD_W_LEN:
>>> -                       ftest->code = BPF_S_LD_IMM;
>>> +                       ftest->code = BPF_LD | BPF_IMM;
>>>                         ftest->k = sizeof(struct seccomp_data);
>>>                         continue;
>>>                 case BPF_S_LDX_W_LEN:
>>> -                       ftest->code = BPF_S_LDX_IMM;
>>> +                       ftest->code = BPF_LDX | BPF_IMM;
>>>                         ftest->k = sizeof(struct seccomp_data);
>>>                         continue;
>>>                 /* Explicitly include allowed calls. */
>>> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>>                 case BPF_S_JMP_JGT_X:
>>>                 case BPF_S_JMP_JSET_K:
>>>                 case BPF_S_JMP_JSET_X:
>>> +                       sk_decode_filter(ftest, ftest);
>>>                         continue;
>>>                 default:
>>>                         return -EINVAL;
>>> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>>  static u32 seccomp_run_filters(int syscall)
>>>  {
>>>         struct seccomp_filter *f;
>>> +       struct seccomp_data sd;
>>>         u32 ret = SECCOMP_RET_ALLOW;
>>>
>>>         /* Ensure unexpected behavior doesn't result in failing open. */
>>>         if (WARN_ON(current->seccomp.filter == NULL))
>>>                 return SECCOMP_RET_KILL;
>>>
>>> +       populate_seccomp_data(&sd);
>>> +
>>>         /*
>>>          * All filters in the list are evaluated and the lowest BPF return
>>>          * value always takes priority (ignoring the DATA).
>>>          */
>>>         for (f = current->seccomp.filter; f; f = f->prev) {
>>> -               u32 cur_ret = sk_run_filter(NULL, f->insns);
>>> +               u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
>>>                 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>>>                         ret = cur_ret;
>>>         }
>>> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>>>         struct seccomp_filter *filter;
>>>         unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
>>>         unsigned long total_insns = fprog->len;
>>> +       struct sock_filter *fp;
>>> +       int new_len;
>>>         long ret;
>>>
>>>         if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
>>> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>>>                                      CAP_SYS_ADMIN) != 0)
>>>                 return -EACCES;
>>>
>>> -       /* Allocate a new seccomp_filter */
>>> -       filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
>>> -                        GFP_KERNEL|__GFP_NOWARN);
>>> -       if (!filter)
>>> +       fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
>>> +       if (!fp)
>>>                 return -ENOMEM;
>>> -       atomic_set(&filter->usage, 1);
>>> -       filter->len = fprog->len;
>>>
>>>         /* Copy the instructions from fprog. */
>>>         ret = -EFAULT;
>>> -       if (copy_from_user(filter->insns, fprog->filter, fp_size))
>>> -               goto fail;
>>> +       if (copy_from_user(fp, fprog->filter, fp_size))
>>> +               goto free_prog;
>>>
>>>         /* Check and rewrite the fprog via the skb checker */
>>> -       ret = sk_chk_filter(filter->insns, filter->len);
>>> +       ret = sk_chk_filter(fp, fprog->len);
>>>         if (ret)
>>> -               goto fail;
>>> +               goto free_prog;
>>>
>>>         /* Check and rewrite the fprog for seccomp use */
>>> -       ret = seccomp_check_filter(filter->insns, filter->len);
>>> +       ret = seccomp_check_filter(fp, fprog->len);
>>>         if (ret)
>>> -               goto fail;
>>> +               goto free_prog;
>>> +
>>> +       /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
>>> +       ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
>>> +       if (ret)
>>> +               goto free_prog;
>>> +
>>> +       /* Allocate a new seccomp_filter */
>>> +       filter = kzalloc(sizeof(struct seccomp_filter) +
>>> +                        sizeof(struct sock_filter_ext) * new_len,
>>> +                        GFP_KERNEL|__GFP_NOWARN);
>>> +       if (!filter)
>>> +               goto free_prog;
>>> +
>>> +       ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);
>
> Seems like it'd be more readable to set filter->len = new_len first
> and use &filter->len as the last argument here. I would find that more
> readable, but if nothing else needs changing in the series, this is
> fine to leave as-is.

cannot do that, since filter->len is 'short', but new_len and
sk_convert_filter()
expects 'int'

>>> +       if (ret)
>>> +               goto free_filter;
>>> +       atomic_set(&filter->usage, 1);
>>> +       filter->len = new_len;
>>>
>>>         /*
>>>          * If there is an existing filter, make it the prev and don't drop its
>>> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>>>         filter->prev = current->seccomp.filter;
>>>         current->seccomp.filter = filter;
>>>         return 0;
>>> -fail:
>>> +
>>> +free_filter:
>>>         kfree(filter);
>>> +free_prog:
>>> +       kfree(fp);
>>>         return ret;
>>>  }
>>>
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index 1494421486b7..f144a6a7d939 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -388,11 +388,6 @@ load_b:
>>>                                 A = 0;
>>>                         continue;
>>>                 }
>>> -#ifdef CONFIG_SECCOMP_FILTER
>>> -               case BPF_S_ANC_SECCOMP_LD_W:
>>> -                       A = seccomp_bpf_load(fentry->k);
>>> -                       continue;
>>> -#endif
>>>                 default:
>>>                         WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
>>>                                        fentry->code, fentry->jt,
>>> --
>>> 1.7.9.5
>>>
>
> Cool!
>
> Reviewed-by: Kees Cook <keescook@chromium.org>

Thank you for review!

Will send V6 with unrolled loop,
fix empty new line at the end of the file caught by Daniel
and update commit log with arm seccomp data.

Thanks!
Alexei
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 6f19cfd1840e..4054b0994071 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -76,7 +76,6 @@  static inline int seccomp_mode(struct seccomp *s)
 #ifdef CONFIG_SECCOMP_FILTER
 extern void put_seccomp_filter(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
-extern u32 seccomp_bpf_load(int off);
 #else  /* CONFIG_SECCOMP_FILTER */
 static inline void put_seccomp_filter(struct task_struct *tsk)
 {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b7a10048a32c..346c597b44cc 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -55,60 +55,27 @@  struct seccomp_filter {
 	atomic_t usage;
 	struct seccomp_filter *prev;
 	unsigned short len;  /* Instruction count */
-	struct sock_filter insns[];
+	struct sock_filter_ext insns[];
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
 #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
 
-/**
- * get_u32 - returns a u32 offset into data
- * @data: a unsigned 64 bit value
- * @index: 0 or 1 to return the first or second 32-bits
- *
- * This inline exists to hide the length of unsigned long.  If a 32-bit
- * unsigned long is passed in, it will be extended and the top 32-bits will be
- * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
- * properly returned.
- *
+/*
  * Endianness is explicitly ignored and left for BPF program authors to manage
  * as per the specific architecture.
  */
-static inline u32 get_u32(u64 data, int index)
-{
-	return ((u32 *)&data)[index];
-}
-
-/* Helper for bpf_load below. */
-#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
-/**
- * bpf_load: checks and returns a pointer to the requested offset
- * @off: offset into struct seccomp_data to load from
- *
- * Returns the requested 32-bits of data.
- * seccomp_check_filter() should assure that @off is 32-bit aligned
- * and not out of bounds.  Failure to do so is a BUG.
- */
-u32 seccomp_bpf_load(int off)
+static void populate_seccomp_data(struct seccomp_data *sd)
 {
 	struct pt_regs *regs = task_pt_regs(current);
-	if (off == BPF_DATA(nr))
-		return syscall_get_nr(current, regs);
-	if (off == BPF_DATA(arch))
-		return syscall_get_arch(current, regs);
-	if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
-		unsigned long value;
-		int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
-		int index = !!(off % sizeof(u64));
-		syscall_get_arguments(current, regs, arg, 1, &value);
-		return get_u32(value, index);
-	}
-	if (off == BPF_DATA(instruction_pointer))
-		return get_u32(KSTK_EIP(current), 0);
-	if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
-		return get_u32(KSTK_EIP(current), 1);
-	/* seccomp_check_filter should make this impossible. */
-	BUG();
+	int i;
+
+	sd->nr = syscall_get_nr(current, regs);
+	sd->arch = syscall_get_arch(current, regs);
+	for (i = 0; i < 6; i++)
+		syscall_get_arguments(current, regs, i, 1,
+				      (unsigned long *)&sd->args[i]);
+	sd->instruction_pointer = KSTK_EIP(current);
 }
 
 /**
@@ -133,17 +100,17 @@  static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 
 		switch (code) {
 		case BPF_S_LD_W_ABS:
-			ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
 			/* 32-bit aligned and not out of bounds. */
 			if (k >= sizeof(struct seccomp_data) || k & 3)
 				return -EINVAL;
 			continue;
 		case BPF_S_LD_W_LEN:
-			ftest->code = BPF_S_LD_IMM;
+			ftest->code = BPF_LD | BPF_IMM;
 			ftest->k = sizeof(struct seccomp_data);
 			continue;
 		case BPF_S_LDX_W_LEN:
-			ftest->code = BPF_S_LDX_IMM;
+			ftest->code = BPF_LDX | BPF_IMM;
 			ftest->k = sizeof(struct seccomp_data);
 			continue;
 		/* Explicitly include allowed calls. */
@@ -185,6 +152,7 @@  static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 		case BPF_S_JMP_JGT_X:
 		case BPF_S_JMP_JSET_K:
 		case BPF_S_JMP_JSET_X:
+			sk_decode_filter(ftest, ftest);
 			continue;
 		default:
 			return -EINVAL;
@@ -202,18 +170,21 @@  static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 static u32 seccomp_run_filters(int syscall)
 {
 	struct seccomp_filter *f;
+	struct seccomp_data sd;
 	u32 ret = SECCOMP_RET_ALLOW;
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
 	if (WARN_ON(current->seccomp.filter == NULL))
 		return SECCOMP_RET_KILL;
 
+	populate_seccomp_data(&sd);
+
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (f = current->seccomp.filter; f; f = f->prev) {
-		u32 cur_ret = sk_run_filter(NULL, f->insns);
+		u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
 	}
@@ -231,6 +202,8 @@  static long seccomp_attach_filter(struct sock_fprog *fprog)
 	struct seccomp_filter *filter;
 	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
 	unsigned long total_insns = fprog->len;
+	struct sock_filter *fp;
+	int new_len;
 	long ret;
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
@@ -252,28 +225,42 @@  static long seccomp_attach_filter(struct sock_fprog *fprog)
 				     CAP_SYS_ADMIN) != 0)
 		return -EACCES;
 
-	/* Allocate a new seccomp_filter */
-	filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
-			 GFP_KERNEL|__GFP_NOWARN);
-	if (!filter)
+	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
+	if (!fp)
 		return -ENOMEM;
-	atomic_set(&filter->usage, 1);
-	filter->len = fprog->len;
 
 	/* Copy the instructions from fprog. */
 	ret = -EFAULT;
-	if (copy_from_user(filter->insns, fprog->filter, fp_size))
-		goto fail;
+	if (copy_from_user(fp, fprog->filter, fp_size))
+		goto free_prog;
 
 	/* Check and rewrite the fprog via the skb checker */
-	ret = sk_chk_filter(filter->insns, filter->len);
+	ret = sk_chk_filter(fp, fprog->len);
 	if (ret)
-		goto fail;
+		goto free_prog;
 
 	/* Check and rewrite the fprog for seccomp use */
-	ret = seccomp_check_filter(filter->insns, filter->len);
+	ret = seccomp_check_filter(fp, fprog->len);
 	if (ret)
-		goto fail;
+		goto free_prog;
+
+	/* convert 'sock_filter' insns to 'sock_filter_ext' insns */
+	ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
+	if (ret)
+		goto free_prog;
+
+	/* Allocate a new seccomp_filter */
+	filter = kzalloc(sizeof(struct seccomp_filter) +
+			 sizeof(struct sock_filter_ext) * new_len,
+			 GFP_KERNEL|__GFP_NOWARN);
+	if (!filter)
+		goto free_prog;
+
+	ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);
+	if (ret)
+		goto free_filter;
+	atomic_set(&filter->usage, 1);
+	filter->len = new_len;
 
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
@@ -282,8 +269,11 @@  static long seccomp_attach_filter(struct sock_fprog *fprog)
 	filter->prev = current->seccomp.filter;
 	current->seccomp.filter = filter;
 	return 0;
-fail:
+
+free_filter:
 	kfree(filter);
+free_prog:
+	kfree(fp);
 	return ret;
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1494421486b7..f144a6a7d939 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -388,11 +388,6 @@  load_b:
 				A = 0;
 			continue;
 		}
-#ifdef CONFIG_SECCOMP_FILTER
-		case BPF_S_ANC_SECCOMP_LD_W:
-			A = seccomp_bpf_load(fentry->k);
-			continue;
-#endif
 		default:
 			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
 				       fentry->code, fentry->jt,