diff mbox series

[RFC,v3,net-next,3/5] ebpf: fix bpf_msg_pull_data

Message ID 1534547305-25140-4-git-send-email-tushar.n.dave@oracle.com
State RFC, archived
Delegated to: BPF Maintainers
Headers show
Series eBPF and struct scatterlist | expand

Commit Message

Tushar Dave Aug. 17, 2018, 11:08 p.m. UTC
Like sockmap (sk_msg), socksg also deals with struct scatterlist
therefore socksg programs can use existing bpf helper bpf_msg_pull_data
to access packet data contained in struct scatterlist. While doing some
preliminary testing, a couple of issues were found with
bpf_msg_pull_data; they are fixed in this patch.

Also, there cannot be more than MAX_SKB_FRAGS entries in sg_data,
therefore the checks for sg entries beyond MAX_SKB_FRAGS in
bpf_msg_pull_data() are removed.

Besides that, I also ran into issues while put_page() is invoked.
e.g.
[ 450.568723] BUG: Bad page state in process swapper/10 pfn:2021540
[ 450.575632] page:ffffea0080855000 count:0 mapcount:0
mapping:ffff88103d006840 index:0xffff882021540000 compound_mapcount: 0
[ 450.588069] flags: 0x6fffff80008100(slab|head)
[ 450.593033] raw: 006fffff80008100 dead000000000100 dead000000000200
ffff88103d006840
[ 450.601683] raw: ffff882021540000 0000000080080007 00000000ffffffff
0000000000000000
[ 450.610337] page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
[ 450.617530] bad because of flags: 0x100(slab)

To avoid the above issue, put_page() is temporarily disabled in this
patch. I am working on alternatives so that a page allocated via
slab (in this case) can be freed without any issue.

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 net/core/filter.c | 61 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

Comments

John Fastabend Aug. 25, 2018, 1:02 a.m. UTC | #1
On 08/17/2018 04:08 PM, Tushar Dave wrote:
> Like sockmap (sk_msg), socksg also deals with struct scatterlist
> therefore socksg programs can use existing bpf helper bpf_msg_pull_data
> to access packet data contained in struct scatterlist. While doing some
> prelimnary testing, there are couple of issues found with
> bpf_msg_pull_data that are fixed in this patch.
> 
> Also, there cannot be more than MAX_SKB_FRAGS entries in sg_data
> therefore any checks for sg entry more than MAX_SKB_FRAGS in
> bpf_msg_pull_data() is removed.

In sockmap the scatterlist is used as a ring so the MAX_SKB_FRAGS
check is needed to keep searching through the ring when sg_start
is non-zero.

> 
> Besides that, I also ran into issues while put_page() is invoked.
> e.g.
> [ 450.568723] BUG: Bad page state in process swapper/10 pfn:2021540
> [ 450.575632] page:ffffea0080855000 count:0 mapcount:0
> mapping:ffff88103d006840 index:0xffff882021540000 compound_mapcount: 0
> [ 450.588069] flags: 0x6fffff80008100(slab|head)
> [ 450.593033] raw: 006fffff80008100 dead000000000100 dead000000000200
> ffff88103d006840
> [ 450.601683] raw: ffff882021540000 0000000080080007 00000000ffffffff
> 0000000000000000
> [ 450.610337] page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
> [ 450.617530] bad because of flags: 0x100(slab)
> 
> To avoid above issue, currently put_page() is disabled in this patch
> temporarily. I am working on alternatives so that page allocated via
> slab (in this case) can be freed without any issue.
> 
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
>  net/core/filter.c | 61 +++++++++++++++++++++++++++++--------------------------
>  1 file changed, 32 insertions(+), 29 deletions(-)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e427c8e..cc52baa 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2316,7 +2316,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  BPF_CALL_4(bpf_msg_pull_data,
>  	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
>  {
> -	unsigned int len = 0, offset = 0, copy = 0;
> +	unsigned int len = 0, offset = 0, copy = 0, off = 0;
>  	struct scatterlist *sg = msg->sg_data;
>  	int first_sg, last_sg, i, shift;
>  	unsigned char *p, *to, *from;
> @@ -2330,22 +2330,28 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	i = msg->sg_start;
>  	do {
>  		len = sg[i].length;
> -		offset += len;
>  		if (start < offset + len)
>  			break;
> +		offset += len;

This looks like a generic fix unrelated to this series.
Can you send that as a bugfix?

>  		i++;
> -		if (i == MAX_SKB_FRAGS)
> -			i = 0;
> -	} while (i != msg->sg_end);
> +	} while (i <= msg->sg_end);
>  

As noted above the MAX_SKB_FRAGS check is needed because
sg_start can be non-zero and sg_end < sg_start. In these
cases we need to search the entries at the start of the
array (being used as a ring).

> +	/* return error if start is out of range */
>  	if (unlikely(start >= offset + len))
>  		return -EINVAL;
>  
> -	if (!msg->sg_copy[i] && bytes <= len)
> -		goto out;
> +	/* return error if i is last entry in sglist and end is out of range */
> +	if (msg->sg_copy[i] && end > offset + len)
> +		return -EINVAL;
>  
>  	first_sg = i;
>  
> +	/* if i is not last entry in sg list and end (i.e start + bytes) is
> +	 * within this sg[i] then goto out and calculate data and data_end
> +	 */
> +	if (!msg->sg_copy[i] && end <= offset + len)
> +		goto out;
> +
>  	/* At this point we need to linearize multiple scatterlist
>  	 * elements or a single shared page. Either way we need to
>  	 * copy into a linear buffer exclusively owned by BPF. Then
> @@ -2359,11 +2365,14 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	do {
>  		copy += sg[i].length;
>  		i++;
> -		if (i == MAX_SKB_FRAGS)
> -			i = 0;

same as above, need to keep.

> -		if (bytes < copy)
> +		if (end < copy)
>  			break;
> -	} while (i != msg->sg_end);
> +	} while (i <= msg->sg_end);
> +
> +	/* return error if i is last entry in sglist and end is out of range */
> +	if (i > msg->sg_end && end > offset + copy)
> +		return -EINVAL;
> +
>  	last_sg = i;
>  
>  	if (unlikely(copy < end - start))
> @@ -2373,23 +2382,25 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	if (unlikely(!page))
>  		return -ENOMEM;
>  	p = page_address(page);
> -	offset = 0;
>  
>  	i = first_sg;
>  	do {
>  		from = sg_virt(&sg[i]);
>  		len = sg[i].length;
> -		to = p + offset;
> +		to = p + off;

Not really sure if the change from offset->off is needed. Looks
like it just makes a bigger diff.

>  
>  		memcpy(to, from, len);
> -		offset += len;
> +		off += len;
>  		sg[i].length = 0;
> -		put_page(sg_page(&sg[i]));
> +		/* if original page is allocated via slab then put_page
> +		 * causes error BUG: Bad page state in process. So temporarily
> +		 * disabled put_page.
> +		 * Todo: fix it
> +		 */
> +		//put_page(sg_page(&sg[i]));
>  
>  		i++;
> -		if (i == MAX_SKB_FRAGS)
> -			i = 0;
> -	} while (i != last_sg);
> +	} while (i < last_sg);
>  
>  	sg[first_sg].length = copy;
>  	sg_set_page(&sg[first_sg], page, copy, 0);
> @@ -2406,12 +2417,8 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	do {
>  		int move_from;
>  
> -		if (i + shift >= MAX_SKB_FRAGS)
> -			move_from = i + shift - MAX_SKB_FRAGS;
> -		else
> -			move_from = i + shift;
> -

Need to keep same as above.

> -		if (move_from == msg->sg_end)
> +		move_from = i + shift;
> +		if (move_from > msg->sg_end)
>  			break;
>  
>  		sg[i] = sg[move_from];
> @@ -2420,14 +2427,10 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  		sg[move_from].offset = 0;
>  
>  		i++;
> -		if (i == MAX_SKB_FRAGS)
> -			i = 0;
>  	} while (1);
>  	msg->sg_end -= shift;
> -	if (msg->sg_end < 0)
> -		msg->sg_end += MAX_SKB_FRAGS;
>  out:
> -	msg->data = sg_virt(&sg[i]) + start - offset;
> +	msg->data = sg_virt(&sg[first_sg]) + start - offset;
>  	msg->data_end = msg->data + bytes;
>  
>  	return 0;
> 

Thanks,
John
Tushar Dave Aug. 27, 2018, 4:45 a.m. UTC | #2
On 08/24/2018 06:02 PM, John Fastabend wrote:
> On 08/17/2018 04:08 PM, Tushar Dave wrote:
>> Like sockmap (sk_msg), socksg also deals with struct scatterlist
>> therefore socksg programs can use existing bpf helper bpf_msg_pull_data
>> to access packet data contained in struct scatterlist. While doing some
>> prelimnary testing, there are couple of issues found with
>> bpf_msg_pull_data that are fixed in this patch.
>>
>> Also, there cannot be more than MAX_SKB_FRAGS entries in sg_data
>> therefore any checks for sg entry more than MAX_SKB_FRAGS in
>> bpf_msg_pull_data() is removed.
> 
> In sockmap the scatterlist is used as a ring so the MAX_SKB_FRAGS
> check is needed to keep searching through the ring when sg_start
> is non-zero.

Okay.

> 
>>
>> Besides that, I also ran into issues while put_page() is invoked.
>> e.g.
>> [ 450.568723] BUG: Bad page state in process swapper/10 pfn:2021540
>> [ 450.575632] page:ffffea0080855000 count:0 mapcount:0
>> mapping:ffff88103d006840 index:0xffff882021540000 compound_mapcount: 0
>> [ 450.588069] flags: 0x6fffff80008100(slab|head)
>> [ 450.593033] raw: 006fffff80008100 dead000000000100 dead000000000200
>> ffff88103d006840
>> [ 450.601683] raw: ffff882021540000 0000000080080007 00000000ffffffff
>> 0000000000000000
>> [ 450.610337] page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
>> [ 450.617530] bad because of flags: 0x100(slab)
>>
>> To avoid above issue, currently put_page() is disabled in this patch
>> temporarily. I am working on alternatives so that page allocated via
>> slab (in this case) can be freed without any issue.
>>
>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>> Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
>> ---
>>   net/core/filter.c | 61 +++++++++++++++++++++++++++++--------------------------
>>   1 file changed, 32 insertions(+), 29 deletions(-)
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index e427c8e..cc52baa 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2316,7 +2316,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>   BPF_CALL_4(bpf_msg_pull_data,
>>   	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
>>   {
>> -	unsigned int len = 0, offset = 0, copy = 0;
>> +	unsigned int len = 0, offset = 0, copy = 0, off = 0;
>>   	struct scatterlist *sg = msg->sg_data;
>>   	int first_sg, last_sg, i, shift;
>>   	unsigned char *p, *to, *from;
>> @@ -2330,22 +2330,28 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>   	i = msg->sg_start;
>>   	do {
>>   		len = sg[i].length;
>> -		offset += len;
>>   		if (start < offset + len)
>>   			break;
>> +		offset += len;
> 
> This looks like a generic fix unrelated to this series.
> Can you send that as a bugfix?

Okay.

> 
>>   		i++;
>> -		if (i == MAX_SKB_FRAGS)
>> -			i = 0;
>> -	} while (i != msg->sg_end);
>> +	} while (i <= msg->sg_end);
>>   
> 
> As noted above the MAX_SKB_FRAGS check is needed because
> sg_start can be non-zero and sg_end < sg_start. In these
> cases we need to search the entries at the start of the
> array (being used as a ring).

Yup!

> 
>> +	/* return error if start is out of range */
>>   	if (unlikely(start >= offset + len))
>>   		return -EINVAL;
>>   
>> -	if (!msg->sg_copy[i] && bytes <= len)
>> -		goto out;
>> +	/* return error if i is last entry in sglist and end is out of range */
>> +	if (msg->sg_copy[i] && end > offset + len)
>> +		return -EINVAL;
>>
>>   	first_sg = i;
>>   
>> +	/* if i is not last entry in sg list and end (i.e start + bytes) is
>> +	 * within this sg[i] then goto out and calculate data and data_end
>> +	 */
>> +	if (!msg->sg_copy[i] && end <= offset + len)
>> +		goto out;
>> +
>>   	/* At this point we need to linearize multiple scatterlist
>>   	 * elements or a single shared page. Either way we need to
>>   	 * copy into a linear buffer exclusively owned by BPF. Then
>> @@ -2359,11 +2365,14 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>   	do {
>>   		copy += sg[i].length;
>>   		i++;
>> -		if (i == MAX_SKB_FRAGS)
>> -			i = 0;
> 
> same as above, need to keep.

Yup!

> 
>> -		if (bytes < copy)
>> +		if (end < copy)
>>   			break;
>> -	} while (i != msg->sg_end);
>> +	} while (i <= msg->sg_end);
>> +
>> +	/* return error if i is last entry in sglist and end is out of range */
>> +	if (i > msg->sg_end && end > offset + copy)
>> +		return -EINVAL;
>> +
>>   	last_sg = i;
>>   
>>   	if (unlikely(copy < end - start))
>> @@ -2373,23 +2382,25 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>   	if (unlikely(!page))
>>   		return -ENOMEM;
>>   	p = page_address(page);
>> -	offset = 0;
>>   
>>   	i = first_sg;
>>   	do {
>>   		from = sg_virt(&sg[i]);
>>   		len = sg[i].length;
>> -		to = p + offset;
>> +		to = p + off;
> 
> Not really sure if the change from offset->off is needed. Looks
> like it just makes a bigger diff.

We need both offset and off because they are used for different
calculations!

'offset' is used to calculate 'msg->data',
i.e. "msg->data = sg_virt(&sg[first_sg]) + start - offset".

'off', on the other hand, is used when we linearize the sg.

> 
>>   
>>   		memcpy(to, from, len);
>> -		offset += len;
>> +		off += len;
>>   		sg[i].length = 0;
>> -		put_page(sg_page(&sg[i]));
>> +		/* if original page is allocated via slab then put_page
>> +		 * causes error BUG: Bad page state in process. So temporarily
>> +		 * disabled put_page.
>> +		 * Todo: fix it
>> +		 */
>> +		//put_page(sg_page(&sg[i]));

As I said in the commit message, put_page() causes the error "BUG: Bad
page state in process ..." when used for RDS.
Any clue? Have you seen something like this with sockmap?


>>   
>>   		i++;
>> -		if (i == MAX_SKB_FRAGS)
>> -			i = 0;
>> -	} while (i != last_sg);
>> +	} while (i < last_sg);
>>   
>>   	sg[first_sg].length = copy;
>>   	sg_set_page(&sg[first_sg], page, copy, 0);
>> @@ -2406,12 +2417,8 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>   	do {
>>   		int move_from;
>>   
>> -		if (i + shift >= MAX_SKB_FRAGS)
>> -			move_from = i + shift - MAX_SKB_FRAGS;
>> -		else
>> -			move_from = i + shift;
>> -
> 
> Need to keep same as above.
yup!

> 
>> -		if (move_from == msg->sg_end)
>> +		move_from = i + shift;
>> +		if (move_from > msg->sg_end)
>>   			break;
>>   
>>   		sg[i] = sg[move_from];
>> @@ -2420,14 +2427,10 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>   		sg[move_from].offset = 0;
>>   
>>   		i++;
>> -		if (i == MAX_SKB_FRAGS)
>> -			i = 0;
>>   	} while (1);
>>   	msg->sg_end -= shift;
>> -	if (msg->sg_end < 0)
>> -		msg->sg_end += MAX_SKB_FRAGS;
>>   out:
>> -	msg->data = sg_virt(&sg[i]) + start - offset;
>> +	msg->data = sg_virt(&sg[first_sg]) + start - offset;
>>   	msg->data_end = msg->data + bytes;
>>   
>>   	return 0;
>>
> 
> Thanks,
> John
> 

Thanks.
-Tushar
diff mbox series

Patch

diff --git a/net/core/filter.c b/net/core/filter.c
index e427c8e..cc52baa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2316,7 +2316,7 @@  struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 BPF_CALL_4(bpf_msg_pull_data,
 	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
 {
-	unsigned int len = 0, offset = 0, copy = 0;
+	unsigned int len = 0, offset = 0, copy = 0, off = 0;
 	struct scatterlist *sg = msg->sg_data;
 	int first_sg, last_sg, i, shift;
 	unsigned char *p, *to, *from;
@@ -2330,22 +2330,28 @@  struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 	i = msg->sg_start;
 	do {
 		len = sg[i].length;
-		offset += len;
 		if (start < offset + len)
 			break;
+		offset += len;
 		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	} while (i != msg->sg_end);
+	} while (i <= msg->sg_end);
 
+	/* return error if start is out of range */
 	if (unlikely(start >= offset + len))
 		return -EINVAL;
 
-	if (!msg->sg_copy[i] && bytes <= len)
-		goto out;
+	/* return error if i is last entry in sglist and end is out of range */
+	if (msg->sg_copy[i] && end > offset + len)
+		return -EINVAL;
 
 	first_sg = i;
 
+	/* if i is not last entry in sg list and end (i.e start + bytes) is
+	 * within this sg[i] then goto out and calculate data and data_end
+	 */
+	if (!msg->sg_copy[i] && end <= offset + len)
+		goto out;
+
 	/* At this point we need to linearize multiple scatterlist
 	 * elements or a single shared page. Either way we need to
 	 * copy into a linear buffer exclusively owned by BPF. Then
@@ -2359,11 +2365,14 @@  struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 	do {
 		copy += sg[i].length;
 		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-		if (bytes < copy)
+		if (end < copy)
 			break;
-	} while (i != msg->sg_end);
+	} while (i <= msg->sg_end);
+
+	/* return error if i is last entry in sglist and end is out of range */
+	if (i > msg->sg_end && end > offset + copy)
+		return -EINVAL;
+
 	last_sg = i;
 
 	if (unlikely(copy < end - start))
@@ -2373,23 +2382,25 @@  struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 	if (unlikely(!page))
 		return -ENOMEM;
 	p = page_address(page);
-	offset = 0;
 
 	i = first_sg;
 	do {
 		from = sg_virt(&sg[i]);
 		len = sg[i].length;
-		to = p + offset;
+		to = p + off;
 
 		memcpy(to, from, len);
-		offset += len;
+		off += len;
 		sg[i].length = 0;
-		put_page(sg_page(&sg[i]));
+		/* if original page is allocated via slab then put_page
+		 * causes error BUG: Bad page state in process. So temporarily
+		 * disabled put_page.
+		 * Todo: fix it
+		 */
+		//put_page(sg_page(&sg[i]));
 
 		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	} while (i != last_sg);
+	} while (i < last_sg);
 
 	sg[first_sg].length = copy;
 	sg_set_page(&sg[first_sg], page, copy, 0);
@@ -2406,12 +2417,8 @@  struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 	do {
 		int move_from;
 
-		if (i + shift >= MAX_SKB_FRAGS)
-			move_from = i + shift - MAX_SKB_FRAGS;
-		else
-			move_from = i + shift;
-
-		if (move_from == msg->sg_end)
+		move_from = i + shift;
+		if (move_from > msg->sg_end)
 			break;
 
 		sg[i] = sg[move_from];
@@ -2420,14 +2427,10 @@  struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 		sg[move_from].offset = 0;
 
 		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
 	} while (1);
 	msg->sg_end -= shift;
-	if (msg->sg_end < 0)
-		msg->sg_end += MAX_SKB_FRAGS;
 out:
-	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data = sg_virt(&sg[first_sg]) + start - offset;
 	msg->data_end = msg->data + bytes;
 
 	return 0;