[bpf-next,v2,08/18] bpf: sk_msg program helper bpf_sk_msg_pull_data

Message ID 20180312192345.8039.27774.stgit@john-Precision-Tower-5810
State Changes Requested, archived
Delegated to: BPF Maintainers
Series bpf,sockmap: sendmsg/sendfile ULP

Commit Message

John Fastabend March 12, 2018, 7:23 p.m. UTC
Currently, when a bpf sk msg program is run, the program
can only parse data that the (start, end) pointers have
already consumed. For sendmsg hooks this is likely the first
scatterlist element. For sendpage this will be the range
(0, 0) because the data is shared with userspace and by
default we want to avoid allowing userspace to modify
data while (or after) the BPF verdict is being decided.

To support pulling in additional bytes for parsing, use
a new helper bpf_msg_pull_data(start, end, flags) which
works similarly to the cls tc logic. This helper will attempt
to point the data start pointer at 'start' bytes offset
into the msg and the data end pointer at 'end' bytes offset
into the message.
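
From the BPF program side the helper would be declared roughly
as follows (a sketch in the style of the selftests' bpf_helpers.h,
not part of this patch; BPF_FUNC_msg_pull_data is generated from
the uapi FN() list below):

static int (*bpf_msg_pull_data)(void *msg, int start, int end,
				int flags) =
	(void *) BPF_FUNC_msg_pull_data;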

After basic sanity checks to ensure 'start' <= 'end' and
'end' <= msg_length, there are a few cases we need to
handle.

First, the sendmsg hook has already copied the data from
userspace and has exclusive access to it, so a copy is not
necessary for safety. However, one may still be required for
linearity. After finding the scatterlist element containing
the 'start' offset there are two cases. One: the range
(start, end) is entirely contained in that sg element and is
already linear, so all that is needed is to update the data
pointers; no allocate/copy is required. The other: (start, end)
crosses sg element boundaries, in which case we allocate a
block of size 'end - start' and copy the data to linearize it;
see the sketch below.
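
As an illustration of that decision (a standalone sketch, not
kernel code; it ignores the shared-page sg_copy condition that
also forces a copy):

#include <stdbool.h>

/* Decide whether pull(start, end), with 'end' exclusive, can be
 * satisfied by pointer updates alone or needs a linearizing copy.
 * elem_len[] holds each sg element's length in ring order.
 */
static bool pull_needs_copy(const unsigned int *elem_len, int nelems,
			    unsigned int start, unsigned int end)
{
	unsigned int offset = 0;
	int i;

	for (i = 0; i < nelems; i++) {
		if (start < offset + elem_len[i])
			/* copy only if 'end' spills past this element */
			return end > offset + elem_len[i];
		offset += elem_len[i];
	}
	return true;	/* 'start' past the message: helper errors out */
}

With two elements of length 100 and 200, pull(20, 80) needs no
copy while pull(20, 150) does.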

Next, the sendpage hook has not copied any data in the initial
state, so the data pointers are (0, 0). We handle this similarly
to the sendmsg case above, except the allocation/copy must
always happen. Then, when sending the data, we have possibly
three memory regions that need to be sent: (0, start - 1),
(start, end), and (end + 1, msg_length). This is required to
ensure any writes by the BPF program are correctly transmitted.
A sketch of the split follows.
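
Assuming 'end' is exclusive, as 'bytes = end - start' in the
code suggests, the split can be sketched as (illustrative only,
not kernel code):

struct region { unsigned int off, len; };

/* Return the up to three byte ranges the sendpage path must
 * send: untouched prefix, the pulled bytes, untouched suffix.
 * Empty prefix/suffix regions are omitted.
 */
static int pull_tx_regions(unsigned int start, unsigned int end,
			   unsigned int msg_length, struct region out[3])
{
	int n = 0;

	if (start)
		out[n++] = (struct region){ 0, start };
	out[n++] = (struct region){ start, end - start };
	if (end < msg_length)
		out[n++] = (struct region){ end, msg_length - end };
	return n;
}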

Lastly, this operation invalidates any previous data checks,
so BPF programs will have to revalidate the data pointers
after making this call, as in the sketch below.
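
For example, a minimal sk_msg program following this pattern
might look like (a sketch assuming selftests-style helper
declarations, not part of this patch):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("sk_msg")
int msg_pull_example(struct sk_msg_md *msg)
{
	void *data, *data_end;

	/* make the first 10 bytes visible to the program */
	if (bpf_msg_pull_data(msg, 0, 10, 0))
		return SK_PASS;

	/* the pull invalidated any prior checks: reload and recheck */
	data = (void *)(long)msg->data;
	data_end = (void *)(long)msg->data_end;
	if (data + 10 > data_end)
		return SK_PASS;

	/* toy policy: drop messages whose first byte is zero */
	return *(char *)data ? SK_PASS : SK_DROP;
}

char _license[] SEC("license") = "GPL";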

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/uapi/linux/bpf.h |    3 +
 net/core/filter.c        |  133 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+), 2 deletions(-)

Comments

David Miller March 15, 2018, 6:42 p.m. UTC | #1
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 12 Mar 2018 12:23:45 -0700

> Currently, when a bpf sk msg program is run, the program
> can only parse data that the (start, end) pointers have
> already consumed. For sendmsg hooks this is likely the first
> scatterlist element. For sendpage this will be the range
> (0, 0) because the data is shared with userspace and by
> default we want to avoid allowing userspace to modify
> data while (or after) the BPF verdict is being decided.
> 
> To support pulling in additional bytes for parsing, use
> a new helper bpf_msg_pull_data(start, end, flags) which
> works similarly to the cls tc logic. This helper will attempt
> to point the data start pointer at 'start' bytes offset
> into the msg and the data end pointer at 'end' bytes offset
> into the message.
 ...
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>

Yeah this looks really nice.

Acked-by: David S. Miller <davem@davemloft.net>
Daniel Borkmann March 15, 2018, 8:25 p.m. UTC | #2
On 03/12/2018 08:23 PM, John Fastabend wrote:
[...]
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 2c73af0..7b9e63e 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1956,6 +1956,134 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	.arg2_type      = ARG_ANYTHING,
>  };
>  
> +BPF_CALL_4(bpf_msg_pull_data,
> +	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
> +{
> +	unsigned int len = 0, offset = 0, copy = 0;
> +	struct scatterlist *sg = msg->sg_data;
> +	int first_sg, last_sg, i, shift;
> +	unsigned char *p, *to, *from;
> +	int bytes = end - start;
> +	struct page *page;
> +
> +	if (unlikely(end < start))
> +		return -EINVAL;

Actually should be:

if (unlikely(flags || end <= start))
	return -EINVAL;

> +	/* First find the starting scatterlist element */
> +	i = msg->sg_start;
> +	do {
> +		len = sg[i].length;
> +		offset += len;
> +		if (start < offset + len)
> +			break;
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +	} while (i != msg->sg_end);
> +
> +	if (unlikely(start >= offset + len))
> +		return -EINVAL;
> +
> +	if (!msg->sg_copy[i] && bytes <= len)
> +		goto out;
> +
> +	first_sg = i;
> +
> +	/* At this point we need to linearize multiple scatterlist
> +	 * elements or a single shared page. Either way we need to
> +	 * copy into a linear buffer exclusively owned by BPF. Then
> +	 * place the buffer in the scatterlist and fixup the original
> +	 * entries by removing the entries now in the linear buffer
> +	 * and shifting the remaining entries. For now we do not try
> +	 * to copy partial entries to avoid complexity of running out
> +	 * of sg_entry slots. The downside is reading a single byte
> +	 * will copy the entire sg entry.
> +	 */
> +	do {
> +		copy += sg[i].length;
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +		if (bytes < copy)
> +			break;
> +	} while (i != msg->sg_end);
> +	last_sg = i;
> +
> +	if (unlikely(copy < end - start))
> +		return -EINVAL;
> +
> +	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));

alloc_pages() can fail, so this also needs:

if (unlikely(!page))
	return -ENOMEM;

> +	p = page_address(page);
> +	offset = 0;
> +
> +	i = first_sg;
> +	do {
> +		from = sg_virt(&sg[i]);
> +		len = sg[i].length;
> +		to = p + offset;
> +
> +		memcpy(to, from, len);
> +		offset += len;
> +		sg[i].length = 0;
> +		put_page(sg_page(&sg[i]));
> +
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +	} while (i != last_sg);
> +
> +	sg[first_sg].length = copy;
> +	sg_set_page(&sg[first_sg], page, copy, 0);
> +
> +	/* To repair sg ring we need to shift entries. If we only
> +	 * had a single entry though we can just replace it and
> +	 * be done. Otherwise walk the ring and shift the entries.
> +	 */
> +	shift = last_sg - first_sg - 1;
> +	if (!shift)
> +		goto out;
> +
> +	i = first_sg + 1;
> +	do {
> +		int move_from;
> +
> +		if (i + shift >= MAX_SKB_FRAGS)
> +			move_from = i + shift - MAX_SKB_FRAGS;
> +		else
> +			move_from = i + shift;
> +
> +		if (move_from == msg->sg_end)
> +			break;
> +
> +		sg[i] = sg[move_from];
> +		sg[move_from].length = 0;
> +		sg[move_from].page_link = 0;
> +		sg[move_from].offset = 0;
> +
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +	} while (1);
> +	msg->sg_end -= shift;
> +	if (msg->sg_end < 0)
> +		msg->sg_end += MAX_SKB_FRAGS;
> +out:
> +	msg->data = sg_virt(&sg[i]) + start - offset;
> +	msg->data_end = msg->data + bytes;
> +
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_msg_pull_data_proto = {
> +	.func		= bpf_msg_pull_data,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_ANYTHING,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
>  {
>  	return task_get_classid(skb);
> @@ -2897,7 +3025,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>  	    func == bpf_l3_csum_replace ||
>  	    func == bpf_l4_csum_replace ||
>  	    func == bpf_xdp_adjust_head ||
> -	    func == bpf_xdp_adjust_meta)
> +	    func == bpf_xdp_adjust_meta ||
> +	    func == bpf_msg_pull_data)
>  		return true;
>  
>  	return false;
> @@ -3666,6 +3795,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
>  		return &bpf_msg_apply_bytes_proto;
>  	case BPF_FUNC_msg_cork_bytes:
>  		return &bpf_msg_cork_bytes_proto;
> +	case BPF_FUNC_msg_pull_data:
> +		return &bpf_msg_pull_data_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
>

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cfcc002..c9c4a0b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -771,7 +771,8 @@ enum bpf_attach_type {
 	FN(sock_ops_cb_flags_set),	\
 	FN(msg_redirect_map),		\
 	FN(msg_apply_bytes),		\
-	FN(msg_cork_bytes),
+	FN(msg_cork_bytes),		\
+	FN(msg_pull_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 2c73af0..7b9e63e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1956,6 +1956,134 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_pull_data,
+	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+	unsigned int len = 0, offset = 0, copy = 0;
+	struct scatterlist *sg = msg->sg_data;
+	int first_sg, last_sg, i, shift;
+	unsigned char *p, *to, *from;
+	int bytes = end - start;
+	struct page *page;
+
+	if (unlikely(end < start))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg_start;
+	do {
+		len = sg[i].length;
+		offset += len;
+		if (start < offset + len)
+			break;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != msg->sg_end);
+
+	if (unlikely(start >= offset + len))
+		return -EINVAL;
+
+	if (!msg->sg_copy[i] && bytes <= len)
+		goto out;
+
+	first_sg = i;
+
+	/* At this point we need to linearize multiple scatterlist
+	 * elements or a single shared page. Either way we need to
+	 * copy into a linear buffer exclusively owned by BPF. Then
+	 * place the buffer in the scatterlist and fixup the original
+	 * entries by removing the entries now in the linear buffer
+	 * and shifting the remaining entries. For now we do not try
+	 * to copy partial entries to avoid complexity of running out
+	 * of sg_entry slots. The downside is reading a single byte
+	 * will copy the entire sg entry.
+	 */
+	do {
+		copy += sg[i].length;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+		if (bytes < copy)
+			break;
+	} while (i != msg->sg_end);
+	last_sg = i;
+
+	if (unlikely(copy < end - start))
+		return -EINVAL;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+	p = page_address(page);
+	offset = 0;
+
+	i = first_sg;
+	do {
+		from = sg_virt(&sg[i]);
+		len = sg[i].length;
+		to = p + offset;
+
+		memcpy(to, from, len);
+		offset += len;
+		sg[i].length = 0;
+		put_page(sg_page(&sg[i]));
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != last_sg);
+
+	sg[first_sg].length = copy;
+	sg_set_page(&sg[first_sg], page, copy, 0);
+
+	/* To repair sg ring we need to shift entries. If we only
+	 * had a single entry though we can just replace it and
+	 * be done. Otherwise walk the ring and shift the entries.
+	 */
+	shift = last_sg - first_sg - 1;
+	if (!shift)
+		goto out;
+
+	i = first_sg + 1;
+	do {
+		int move_from;
+
+		if (i + shift >= MAX_SKB_FRAGS)
+			move_from = i + shift - MAX_SKB_FRAGS;
+		else
+			move_from = i + shift;
+
+		if (move_from == msg->sg_end)
+			break;
+
+		sg[i] = sg[move_from];
+		sg[move_from].length = 0;
+		sg[move_from].page_link = 0;
+		sg[move_from].offset = 0;
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (1);
+	msg->sg_end -= shift;
+	if (msg->sg_end < 0)
+		msg->sg_end += MAX_SKB_FRAGS;
+out:
+	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data_end = msg->data + bytes;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+	.func		= bpf_msg_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -2897,7 +3025,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta)
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data)
 		return true;
 
 	return false;
@@ -3666,6 +3795,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
 		return &bpf_msg_cork_bytes_proto;
+	case BPF_FUNC_msg_pull_data:
+		return &bpf_msg_pull_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}