[net-next,V2,3/3] tun: add eBPF based queue selection method

Message ID 1509445938-4345-4-git-send-email-jasowang@redhat.com
State Changes Requested
Delegated to: David Miller
Headers show
Series
  • support changing steering policies in tuntap
Related show

Commit Message

Jason Wang Oct. 31, 2017, 10:32 a.m.
This patch introduces an eBPF based queue selection method based on
the flow steering policy ops. Userspace could load an eBPF program
through TUNSETSTEERINGEBPF. This gives much more flexibility compare
to simple but hard coded policy in kernel.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c           | 79 ++++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/if_tun.h |  2 ++
 2 files changed, 80 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin Oct. 31, 2017, 4:45 p.m. | #1
On Tue, Oct 31, 2017 at 06:32:18PM +0800, Jason Wang wrote:
> This patch introduces an eBPF based queue selection method based on
> the flow steering policy ops. Userspace could load an eBPF program
> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
> to simple but hard coded policy in kernel.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c           | 79 ++++++++++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/if_tun.h |  2 ++
>  2 files changed, 80 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ab109ff..4bdde21 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -191,6 +191,20 @@ struct tun_steering_ops {
>  			 u32 data);
>  };
>  
> +void tun_steering_xmit_nop(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +}
> +
> +u32 tun_steering_pre_rx_nop(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +	return 0;
> +}
> +
> +void tun_steering_post_rx_nop(struct tun_struct *tun, struct tun_file *tfile,
> +			      u32 data)
> +{
> +}
> +
>  struct tun_flow_entry {
>  	struct hlist_node hash_link;
>  	struct rcu_head rcu;
> @@ -241,6 +255,7 @@ struct tun_struct {
>  	u32 rx_batched;
>  	struct tun_pcpu_stats __percpu *pcpu_stats;
>  	struct bpf_prog __rcu *xdp_prog;
> +	struct bpf_prog __rcu *steering_prog;
>  	struct tun_steering_ops *steering_ops;
>  };
>  
> @@ -576,6 +591,19 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>  	return txq;
>  }
>  
> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +	struct bpf_prog *prog;
> +	u16 ret = 0;
> +
> +	rcu_read_lock();
> +	prog = rcu_dereference(tun->steering_prog);
> +	if (prog)
> +		ret = bpf_prog_run_clear_cb(prog, skb);
> +	rcu_read_unlock();
> +
> +	return ret % tun->numqueues;
> +}
>  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>  			    void *accel_priv, select_queue_fallback_t fallback)
>  {
> @@ -2017,6 +2045,20 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	return ret;
>  }
>  
> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> +				    struct bpf_prog *new)
> +{
> +	struct bpf_prog *old;
> +
> +	old = rtnl_dereference(tun->steering_prog);
> +	rcu_assign_pointer(tun->steering_prog, new);
> +
> +	if (old) {
> +		synchronize_net();
> +		bpf_prog_destroy(old);
> +	}
> +}
> +

Is this really called under rtnl? If no then rtnl_dereference
is wrong. If yes I'm not sure you can call synchronize_net
under rtnl.

>  static void tun_free_netdev(struct net_device *dev)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
> @@ -2025,6 +2067,7 @@ static void tun_free_netdev(struct net_device *dev)
>  	free_percpu(tun->pcpu_stats);
>  	tun_flow_uninit(tun);
>  	security_tun_dev_free_security(tun->security);
> +	__tun_set_steering_ebpf(tun, NULL);
>  }
>  
>  static void tun_setup(struct net_device *dev)
> @@ -2159,6 +2202,13 @@ static struct tun_steering_ops tun_automq_ops = {
>  	.post_rx = tun_automq_post_rx,
>  };
>  
> +static struct tun_steering_ops tun_ebpf_ops = {
> +	.select_queue = tun_ebpf_select_queue,
> +	.xmit = tun_steering_xmit_nop,
> +	.pre_rx = tun_steering_pre_rx_nop,
> +	.post_rx = tun_steering_post_rx_nop,
> +};
> +
>  static int tun_flags(struct tun_struct *tun)
>  {
>  	return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
> @@ -2311,6 +2361,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  		tun->filter_attached = false;
>  		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
>  		tun->rx_batched = 0;
> +		RCU_INIT_POINTER(tun->steering_prog, NULL);
>  
>  		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
>  		if (!tun->pcpu_stats) {
> @@ -2503,6 +2554,23 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>  	return ret;
>  }
>  
> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
> +{
> +	struct bpf_prog *prog;
> +	u32 fd;
> +
> +	if (copy_from_user(&fd, data, sizeof(fd)))
> +		return -EFAULT;
> +
> +	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
> +	if (IS_ERR(prog))
> +		return PTR_ERR(prog);
> +
> +	__tun_set_steering_ebpf(tun, prog);
> +
> +	return 0;
> +}
> +
>  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			    unsigned long arg, int ifreq_len)
>  {
> @@ -2785,6 +2853,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		case TUN_STEERING_AUTOMQ:
>  			tun->steering_ops = &tun_automq_ops;
>  			break;
> +		case TUN_STEERING_EBPF:
> +			tun->steering_ops = &tun_ebpf_ops;
> +			break;
>  		default:
>  			ret = -EFAULT;
>  		}
> @@ -2794,6 +2865,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		ret = 0;
>  		if (tun->steering_ops == &tun_automq_ops)
>  			steering = TUN_STEERING_AUTOMQ;
> +		else if (tun->steering_ops == &tun_ebpf_ops)
> +			steering = TUN_STEERING_EBPF;
>  		else
>  			BUG();
>  		if (copy_to_user(argp, &steering, sizeof(steering)))
> @@ -2802,11 +2875,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  
>  	case TUNGETSTEERINGFEATURES:
>  		ret = 0;
> -		steering = TUN_STEERING_AUTOMQ;
> +		steering = TUN_STEERING_AUTOMQ | TUN_STEERING_EBPF;
>  		if (copy_to_user(argp, &steering, sizeof(steering)))
>  			ret = -EFAULT;
>  		break;
>  
> +	case TUNSETSTEERINGEBPF:
> +		ret = tun_set_steering_ebpf(tun, argp);
> +		break;
> +
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 109760e..927f7e4 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -59,6 +59,7 @@
>  #define TUNSETSTEERING _IOW('T', 224, unsigned int)
>  #define TUNGETSTEERING _IOR('T', 225, unsigned int)
>  #define TUNGETSTEERINGFEATURES _IOR('T', 226, unsigned int)
> +#define TUNSETSTEERINGEBPF _IOR('T', 227, int)
>  
>  /* TUNSETIFF ifr flags */
>  #define IFF_TUN		0x0001
> @@ -112,5 +113,6 @@ struct tun_filter {
>  };
>  
>  #define TUN_STEERING_AUTOMQ 0x01 /* Automatic flow steering */
> +#define TUN_STEERING_EBPF   0x02 /* eBPF based flow steering */
>  
>  #endif /* _UAPI__IF_TUN_H */
> -- 
> 2.7.4
Jason Wang Nov. 1, 2017, 1:02 p.m. | #2
On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
>> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
>> +				    struct bpf_prog *new)
>> +{
>> +	struct bpf_prog *old;
>> +
>> +	old = rtnl_dereference(tun->steering_prog);
>> +	rcu_assign_pointer(tun->steering_prog, new);
>> +
>> +	if (old) {
>> +		synchronize_net();
>> +		bpf_prog_destroy(old);
>> +	}
>> +}
>> +
> Is this really called under rtnl?

Yes it is __tun_chr_ioctl() will call rtnl_lock().

> If no then rtnl_dereference
> is wrong. If yes I'm not sure you can call synchronize_net
> under rtnl.
>

Are you worrying about the long wait? Looking at synchronize_net(), it does:

void synchronize_net(void)
{
     might_sleep();
     if (rtnl_is_locked())
         synchronize_rcu_expedited();
     else
         synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

Thanks
Michael S. Tsirkin Nov. 1, 2017, 1:59 p.m. | #3
On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
> 
> 
> On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
> > > +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> > > +				    struct bpf_prog *new)
> > > +{
> > > +	struct bpf_prog *old;
> > > +
> > > +	old = rtnl_dereference(tun->steering_prog);
> > > +	rcu_assign_pointer(tun->steering_prog, new);
> > > +
> > > +	if (old) {
> > > +		synchronize_net();
> > > +		bpf_prog_destroy(old);
> > > +	}
> > > +}
> > > +
> > Is this really called under rtnl?
> 
> Yes it is __tun_chr_ioctl() will call rtnl_lock().

Is the call from tun_free_netdev under rtnl too?

> > If no then rtnl_dereference
> > is wrong. If yes I'm not sure you can call synchronize_net
> > under rtnl.
> > 
> 
> Are you worrying about the long wait? Looking at synchronize_net(), it does:
> 
> void synchronize_net(void)
> {
>     might_sleep();
>     if (rtnl_is_locked())
>         synchronize_rcu_expedited();
>     else
>         synchronize_rcu();
> }
> EXPORT_SYMBOL(synchronize_net);
> 
> Thanks

Not the wait - expedited is not a good thing to allow unpriveledged
userspace to do, it interrupts all VMs running on the same box.

We could use a callback though the docs warn userspace can use that
to cause a DOS and needs to be limited.
Alexei Starovoitov Nov. 1, 2017, 7:12 p.m. | #4
On Wed, Nov 01, 2017 at 03:59:48PM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
> > 
> > 
> > On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
> > > > +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> > > > +				    struct bpf_prog *new)
> > > > +{
> > > > +	struct bpf_prog *old;
> > > > +
> > > > +	old = rtnl_dereference(tun->steering_prog);
> > > > +	rcu_assign_pointer(tun->steering_prog, new);
> > > > +
> > > > +	if (old) {
> > > > +		synchronize_net();
> > > > +		bpf_prog_destroy(old);
> > > > +	}
> > > > +}
> > > > +
> > > Is this really called under rtnl?
> > 
> > Yes it is __tun_chr_ioctl() will call rtnl_lock().
> 
> Is the call from tun_free_netdev under rtnl too?
> 
> > > If no then rtnl_dereference
> > > is wrong. If yes I'm not sure you can call synchronize_net
> > > under rtnl.
> > > 
> > 
> > Are you worrying about the long wait? Looking at synchronize_net(), it does:
> > 
> > void synchronize_net(void)
> > {
> >     might_sleep();
> >     if (rtnl_is_locked())
> >         synchronize_rcu_expedited();
> >     else
> >         synchronize_rcu();
> > }
> > EXPORT_SYMBOL(synchronize_net);
> > 
> > Thanks
> 
> Not the wait - expedited is not a good thing to allow unpriveledged
> userspace to do, it interrupts all VMs running on the same box.
> 
> We could use a callback though the docs warn userspace can use that
> to cause a DOS and needs to be limited.

the whole __tun_set_steering_ebpf() looks odd to me.
There is tun_attach_filter/tun_detach_filter pattern
that works for classic BPF. Why for eBPF this strange
synchronize_net() is there?
Jason Wang Nov. 2, 2017, 3:24 a.m. | #5
On 2017年11月02日 03:12, Alexei Starovoitov wrote:
> On Wed, Nov 01, 2017 at 03:59:48PM +0200, Michael S. Tsirkin wrote:
>> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
>>>
>>> On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
>>>>> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
>>>>> +				    struct bpf_prog *new)
>>>>> +{
>>>>> +	struct bpf_prog *old;
>>>>> +
>>>>> +	old = rtnl_dereference(tun->steering_prog);
>>>>> +	rcu_assign_pointer(tun->steering_prog, new);
>>>>> +
>>>>> +	if (old) {
>>>>> +		synchronize_net();
>>>>> +		bpf_prog_destroy(old);
>>>>> +	}
>>>>> +}
>>>>> +
>>>> Is this really called under rtnl?
>>> Yes it is __tun_chr_ioctl() will call rtnl_lock().
>> Is the call from tun_free_netdev under rtnl too?
>>
>>>> If no then rtnl_dereference
>>>> is wrong. If yes I'm not sure you can call synchronize_net
>>>> under rtnl.
>>>>
>>> Are you worrying about the long wait? Looking at synchronize_net(), it does:
>>>
>>> void synchronize_net(void)
>>> {
>>>      might_sleep();
>>>      if (rtnl_is_locked())
>>>          synchronize_rcu_expedited();
>>>      else
>>>          synchronize_rcu();
>>> }
>>> EXPORT_SYMBOL(synchronize_net);
>>>
>>> Thanks
>> Not the wait - expedited is not a good thing to allow unpriveledged
>> userspace to do, it interrupts all VMs running on the same box.
>>
>> We could use a callback though the docs warn userspace can use that
>> to cause a DOS and needs to be limited.
> the whole __tun_set_steering_ebpf() looks odd to me.
> There is tun_attach_filter/tun_detach_filter pattern
> that works for classic BPF. Why for eBPF this strange
> synchronize_net() is there?
>

I'm not sure I get the question. eBPF here is used to do queue 
selection, so we could not reuse socket filter (tun_detach_filter use 
call_rcu()). cBPF could be used here, but I'm not quite sure it's worth 
to support it. And I agree we should use call_rcu() here.

Hope this answer your question.

Thanks
Jason Wang Nov. 2, 2017, 3:45 a.m. | #6
On 2017年11月01日 21:59, Michael S. Tsirkin wrote:
> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
>>
>> On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
>>>> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
>>>> +				    struct bpf_prog *new)
>>>> +{
>>>> +	struct bpf_prog *old;
>>>> +
>>>> +	old = rtnl_dereference(tun->steering_prog);
>>>> +	rcu_assign_pointer(tun->steering_prog, new);
>>>> +
>>>> +	if (old) {
>>>> +		synchronize_net();
>>>> +		bpf_prog_destroy(old);
>>>> +	}
>>>> +}
>>>> +
>>> Is this really called under rtnl?
>> Yes it is __tun_chr_ioctl() will call rtnl_lock().
> Is the call from tun_free_netdev under rtnl too?

Looks not, need hold rtnl_lock() in tun_free_netdev in next version.

>
>>> If no then rtnl_dereference
>>> is wrong. If yes I'm not sure you can call synchronize_net
>>> under rtnl.
>>>
>> Are you worrying about the long wait? Looking at synchronize_net(), it does:
>>
>> void synchronize_net(void)
>> {
>>      might_sleep();
>>      if (rtnl_is_locked())
>>          synchronize_rcu_expedited();
>>      else
>>          synchronize_rcu();
>> }
>> EXPORT_SYMBOL(synchronize_net);
>>
>> Thanks
> Not the wait - expedited is not a good thing to allow unpriveledged
> userspace to do, it interrupts all VMs running on the same box.

Good point.

>
> We could use a callback though the docs warn userspace can use that
> to cause a DOS and needs to be limited.
>
>

Will do this in next version.

Thanks
Willem de Bruijn Nov. 3, 2017, 8:56 a.m. | #7
On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces an eBPF based queue selection method based on
> the flow steering policy ops. Userspace could load an eBPF program
> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
> to simple but hard coded policy in kernel.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
> +{
> +       struct bpf_prog *prog;
> +       u32 fd;
> +
> +       if (copy_from_user(&fd, data, sizeof(fd)))
> +               return -EFAULT;
> +
> +       prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);

If the idea is to allow guests to pass BPF programs down to the host,
you may want to define a new program type that is more restrictive than
socket filter.

The external functions allowed for socket filters (sk_filter_func_proto)
are relatively few (compared to, say, clsact), but may still leak host
information to a guest. More importantly, guest security considerations
limits how we can extend socket filters later.
Willem de Bruijn Nov. 3, 2017, 11:56 p.m. | #8
On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>> This patch introduces an eBPF based queue selection method based on
>> the flow steering policy ops. Userspace could load an eBPF program
>> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
>> to simple but hard coded policy in kernel.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
>> +{
>> +       struct bpf_prog *prog;
>> +       u32 fd;
>> +
>> +       if (copy_from_user(&fd, data, sizeof(fd)))
>> +               return -EFAULT;
>> +
>> +       prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>
> If the idea is to allow guests to pass BPF programs down to the host,
> you may want to define a new program type that is more restrictive than
> socket filter.
>
> The external functions allowed for socket filters (sk_filter_func_proto)
> are relatively few (compared to, say, clsact), but may still leak host
> information to a guest. More importantly, guest security considerations
> limits how we can extend socket filters later.

Unless the idea is for the hypervisor to prepared the BPF based on a
limited set of well defined modes that the guest can configure. Then
socket filters are fine, as the BPF is prepared by a regular host process.
Jason Wang Nov. 8, 2017, 5:28 a.m. | #9
On 2017年11月04日 08:56, Willem de Bruijn wrote:
> On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>>> This patch introduces an eBPF based queue selection method based on
>>> the flow steering policy ops. Userspace could load an eBPF program
>>> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
>>> to simple but hard coded policy in kernel.
>>>
>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> ---
>>> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
>>> +{
>>> +       struct bpf_prog *prog;
>>> +       u32 fd;
>>> +
>>> +       if (copy_from_user(&fd, data, sizeof(fd)))
>>> +               return -EFAULT;
>>> +
>>> +       prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>> If the idea is to allow guests to pass BPF programs down to the host,
>> you may want to define a new program type that is more restrictive than
>> socket filter.
>>
>> The external functions allowed for socket filters (sk_filter_func_proto)
>> are relatively few (compared to, say, clsact), but may still leak host
>> information to a guest. More importantly, guest security considerations
>> limits how we can extend socket filters later.
> Unless the idea is for the hypervisor to prepared the BPF based on a
> limited set of well defined modes that the guest can configure. Then
> socket filters are fine, as the BPF is prepared by a regular host process.

Yes, I think the idea is to let qemu to build a BPF program now.

Passing eBPF program from guest to host is interesting, but an obvious 
issue is how to deal with the accessing of map.

Thanks
Michael S. Tsirkin Nov. 8, 2017, 5:43 a.m. | #10
On Wed, Nov 08, 2017 at 02:28:53PM +0900, Jason Wang wrote:
> 
> 
> On 2017年11月04日 08:56, Willem de Bruijn wrote:
> > On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > > On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> > > > This patch introduces an eBPF based queue selection method based on
> > > > the flow steering policy ops. Userspace could load an eBPF program
> > > > through TUNSETSTEERINGEBPF. This gives much more flexibility compare
> > > > to simple but hard coded policy in kernel.
> > > > 
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > ---
> > > > +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
> > > > +{
> > > > +       struct bpf_prog *prog;
> > > > +       u32 fd;
> > > > +
> > > > +       if (copy_from_user(&fd, data, sizeof(fd)))
> > > > +               return -EFAULT;
> > > > +
> > > > +       prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
> > > If the idea is to allow guests to pass BPF programs down to the host,
> > > you may want to define a new program type that is more restrictive than
> > > socket filter.
> > > 
> > > The external functions allowed for socket filters (sk_filter_func_proto)
> > > are relatively few (compared to, say, clsact), but may still leak host
> > > information to a guest. More importantly, guest security considerations
> > > limits how we can extend socket filters later.
> > Unless the idea is for the hypervisor to prepared the BPF based on a
> > limited set of well defined modes that the guest can configure. Then
> > socket filters are fine, as the BPF is prepared by a regular host process.
> 
> Yes, I think the idea is to let qemu to build a BPF program now.
> 
> Passing eBPF program from guest to host is interesting, but an obvious issue
> is how to deal with the accessing of map.
> 
> Thanks

Fundamentally, I suspect the way to solve it is to allow
the program to specify "should be offloaded to host".

And then it would access the host map rather than the guest map.

Then add some control path API for guest to poke at the host map.

It's not that there's anything special about the host map -
it's just separate from the guest - so if we wanted to
do something that can work on bare-metal we could -
just do something like a namespace and put all host
maps there. But I'm not sure it's worth the complexity.

Cc Aaron who wanted to look at this.
Jason Wang Nov. 8, 2017, 11:13 a.m. | #11
On 2017年11月08日 14:43, Michael S. Tsirkin wrote:
> On Wed, Nov 08, 2017 at 02:28:53PM +0900, Jason Wang wrote:
>>
>> On 2017年11月04日 08:56, Willem de Bruijn wrote:
>>> On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>>>>> This patch introduces an eBPF based queue selection method based on
>>>>> the flow steering policy ops. Userspace could load an eBPF program
>>>>> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
>>>>> to simple but hard coded policy in kernel.
>>>>>
>>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>>> ---
>>>>> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
>>>>> +{
>>>>> +       struct bpf_prog *prog;
>>>>> +       u32 fd;
>>>>> +
>>>>> +       if (copy_from_user(&fd, data, sizeof(fd)))
>>>>> +               return -EFAULT;
>>>>> +
>>>>> +       prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>>>> If the idea is to allow guests to pass BPF programs down to the host,
>>>> you may want to define a new program type that is more restrictive than
>>>> socket filter.
>>>>
>>>> The external functions allowed for socket filters (sk_filter_func_proto)
>>>> are relatively few (compared to, say, clsact), but may still leak host
>>>> information to a guest. More importantly, guest security considerations
>>>> limits how we can extend socket filters later.
>>> Unless the idea is for the hypervisor to prepared the BPF based on a
>>> limited set of well defined modes that the guest can configure. Then
>>> socket filters are fine, as the BPF is prepared by a regular host process.
>> Yes, I think the idea is to let qemu to build a BPF program now.
>>
>> Passing eBPF program from guest to host is interesting, but an obvious issue
>> is how to deal with the accessing of map.
>>
>> Thanks
> Fundamentally, I suspect the way to solve it is to allow
> the program to specify "should be offloaded to host".
>
> And then it would access the host map rather than the guest map.

This looks a big extension.

>
> Then add some control path API for guest to poke at the host map.

Actually, as Willem said, we can even forbid using map through a type, 
but this will lose lots of flexibility.

>
> It's not that there's anything special about the host map -
> it's just separate from the guest - so if we wanted to
> do something that can work on bare-metal we could -
> just do something like a namespace and put all host
> maps there. But I'm not sure it's worth the complexity.
>
> Cc Aaron who wanted to look at this.
>

Maybe the first step is to let classic BPF to be passed from guest and 
consider eBPF on top.

Thanks

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ab109ff..4bdde21 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -191,6 +191,20 @@  struct tun_steering_ops {
 			 u32 data);
 };
 
+void tun_steering_xmit_nop(struct tun_struct *tun, struct sk_buff *skb)
+{
+}
+
+u32 tun_steering_pre_rx_nop(struct tun_struct *tun, struct sk_buff *skb)
+{
+	return 0;
+}
+
+void tun_steering_post_rx_nop(struct tun_struct *tun, struct tun_file *tfile,
+			      u32 data)
+{
+}
+
 struct tun_flow_entry {
 	struct hlist_node hash_link;
 	struct rcu_head rcu;
@@ -241,6 +255,7 @@  struct tun_struct {
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 	struct bpf_prog __rcu *xdp_prog;
+	struct bpf_prog __rcu *steering_prog;
 	struct tun_steering_ops *steering_ops;
 };
 
@@ -576,6 +591,19 @@  static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 	return txq;
 }
 
+static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
+{
+	struct bpf_prog *prog;
+	u16 ret = 0;
+
+	rcu_read_lock();
+	prog = rcu_dereference(tun->steering_prog);
+	if (prog)
+		ret = bpf_prog_run_clear_cb(prog, skb);
+	rcu_read_unlock();
+
+	return ret % tun->numqueues;
+}
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 			    void *accel_priv, select_queue_fallback_t fallback)
 {
@@ -2017,6 +2045,20 @@  static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
+static void __tun_set_steering_ebpf(struct tun_struct *tun,
+				    struct bpf_prog *new)
+{
+	struct bpf_prog *old;
+
+	old = rtnl_dereference(tun->steering_prog);
+	rcu_assign_pointer(tun->steering_prog, new);
+
+	if (old) {
+		synchronize_net();
+		bpf_prog_destroy(old);
+	}
+}
+
 static void tun_free_netdev(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -2025,6 +2067,7 @@  static void tun_free_netdev(struct net_device *dev)
 	free_percpu(tun->pcpu_stats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
+	__tun_set_steering_ebpf(tun, NULL);
 }
 
 static void tun_setup(struct net_device *dev)
@@ -2159,6 +2202,13 @@  static struct tun_steering_ops tun_automq_ops = {
 	.post_rx = tun_automq_post_rx,
 };
 
+static struct tun_steering_ops tun_ebpf_ops = {
+	.select_queue = tun_ebpf_select_queue,
+	.xmit = tun_steering_xmit_nop,
+	.pre_rx = tun_steering_pre_rx_nop,
+	.post_rx = tun_steering_post_rx_nop,
+};
+
 static int tun_flags(struct tun_struct *tun)
 {
 	return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
@@ -2311,6 +2361,7 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		tun->filter_attached = false;
 		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
 		tun->rx_batched = 0;
+		RCU_INIT_POINTER(tun->steering_prog, NULL);
 
 		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
 		if (!tun->pcpu_stats) {
@@ -2503,6 +2554,23 @@  static int tun_set_queue(struct file *file, struct ifreq *ifr)
 	return ret;
 }
 
+static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+{
+	struct bpf_prog *prog;
+	u32 fd;
+
+	if (copy_from_user(&fd, data, sizeof(fd)))
+		return -EFAULT;
+
+	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	__tun_set_steering_ebpf(tun, prog);
+
+	return 0;
+}
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg, int ifreq_len)
 {
@@ -2785,6 +2853,9 @@  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		case TUN_STEERING_AUTOMQ:
 			tun->steering_ops = &tun_automq_ops;
 			break;
+		case TUN_STEERING_EBPF:
+			tun->steering_ops = &tun_ebpf_ops;
+			break;
 		default:
 			ret = -EFAULT;
 		}
@@ -2794,6 +2865,8 @@  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = 0;
 		if (tun->steering_ops == &tun_automq_ops)
 			steering = TUN_STEERING_AUTOMQ;
+		else if (tun->steering_ops == &tun_ebpf_ops)
+			steering = TUN_STEERING_EBPF;
 		else
 			BUG();
 		if (copy_to_user(argp, &steering, sizeof(steering)))
@@ -2802,11 +2875,15 @@  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 
 	case TUNGETSTEERINGFEATURES:
 		ret = 0;
-		steering = TUN_STEERING_AUTOMQ;
+		steering = TUN_STEERING_AUTOMQ | TUN_STEERING_EBPF;
 		if (copy_to_user(argp, &steering, sizeof(steering)))
 			ret = -EFAULT;
 		break;
 
+	case TUNSETSTEERINGEBPF:
+		ret = tun_set_steering_ebpf(tun, argp);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 109760e..927f7e4 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -59,6 +59,7 @@ 
 #define TUNSETSTEERING _IOW('T', 224, unsigned int)
 #define TUNGETSTEERING _IOR('T', 225, unsigned int)
 #define TUNGETSTEERINGFEATURES _IOR('T', 226, unsigned int)
+#define TUNSETSTEERINGEBPF _IOR('T', 227, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
@@ -112,5 +113,6 @@  struct tun_filter {
 };
 
 #define TUN_STEERING_AUTOMQ 0x01 /* Automatic flow steering */
+#define TUN_STEERING_EBPF   0x02 /* eBPF based flow steering */
 
 #endif /* _UAPI__IF_TUN_H */