[net-next] net: enable RPS on vlan devices

Message ID 1539132062-4348-1-git-send-email-shannon.nelson@oracle.com
State Under Review
Delegated to: David Miller
Headers show
Series
  • [net-next] net: enable RPS on vlan devices
Related show

Commit Message

Shannon Nelson Oct. 10, 2018, 12:41 a.m.
From: Silviu Smarandache <silviu.smarandache@oracle.com>

This patch modifies the RPS processing code so that it searches
for a matching vlan interface on the packet and then uses the
RPS settings of the vlan interface.  If no vlan interface
is found or the vlan interface does not have RPS enabled,
it will fall back to the RPS settings of the underlying device.

In supporting VMs where we can't control the OS being used,
we'd like to separate the VM cpu processing from the host's
cpus as a way to help mitigate the impact of the L1TF issue.
When running the VM's traffic on a vlan we can stick the Rx
processing on one set of cpus separate from the VM's cpus.
Yes, choosing to use this may cause a bit of throughput pain
when the packets are actually passed into the VM and have to
move from one cache to another.

Orabug: 28645929

Signed-off-by: Silviu Smarandache <silviu.smarandache@oracle.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 net/core/dev.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

Comments

Eric Dumazet Oct. 10, 2018, 1:04 a.m. | #1
On 10/09/2018 05:41 PM, Shannon Nelson wrote:
> From: Silviu Smarandache <silviu.smarandache@oracle.com>
> 
> This patch modifies the RPS processing code so that it searches
> for a matching vlan interface on the packet and then uses the
> RPS settings of the vlan interface.  If no vlan interface
> is found or the vlan interface does not have RPS enabled,
> it will fall back to the RPS settings of the underlying device.
> 
> In supporting VMs where we can't control the OS being used,
> we'd like to separate the VM cpu processing from the host's
> cpus as a way to help mitigate the impact of the L1TF issue.
> When running the VM's traffic on a vlan we can stick the Rx
> processing on one set of cpus separate from the VM's cpus.
> Yes, choosing to use this may cause a bit of throughput pain
> when the packets are actually passed into the VM and have to
> move from one cache to another.
> 
> Orabug: 28645929
> 
> Signed-off-by: Silviu Smarandache <silviu.smarandache@oracle.com>
> Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
> ---
>  net/core/dev.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 56 insertions(+), 3 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 0b2d777..1da3f63 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3971,8 +3971,8 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>   * CPU from the RPS map of the receiving queue for a given skb.
>   * rcu_read_lock must be held on entry.
>   */
> -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> -		       struct rps_dev_flow **rflowp)
> +static int __get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> +			 struct rps_dev_flow **rflowp)
>  {
>  	const struct rps_sock_flow_table *sock_flow_table;
>  	struct netdev_rx_queue *rxqueue = dev->_rx;
> @@ -4066,6 +4066,35 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>  	return cpu;
>  }
>  
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> +		       struct rps_dev_flow **rflowp)
> +{
> +	/* Check for a vlan device with RPS settings */
> +	if (skb_vlan_tag_present(skb)) {
> +		struct net_device *vdev;
> +		u16 vid;
> +
> +		vid = skb_vlan_tag_get_id(skb);
> +		vdev = __vlan_find_dev_deep_rcu(dev, skb->vlan_proto, vid);
> +		if (vdev) {
> +			/* recorded queue is not referring to the vlan device.
> +			 * Save and restore it
> +			 */
> +			int cpu;
> +			u16 queue_mapping = skb_get_queue_mapping(skb);
> +
> +			skb_set_queue_mapping(skb, 0);
> +			cpu = __get_rps_cpu(vdev, skb, rflowp);
> +			skb_set_queue_mapping(skb, queue_mapping);

This is really ugly :/

Also what makes vlan so special compared to say macvlan ?
Shannon Nelson Oct. 10, 2018, 2:11 a.m. | #2
On 10/9/2018 6:04 PM, Eric Dumazet wrote:
> 
> On 10/09/2018 05:41 PM, Shannon Nelson wrote:
>> From: Silviu Smarandache <silviu.smarandache@oracle.com>
>>
>> This patch modifies the RPS processing code so that it searches
>> for a matching vlan interface on the packet and then uses the
>> RPS settings of the vlan interface.  If no vlan interface
>> is found or the vlan interface does not have RPS enabled,
>> it will fall back to the RPS settings of the underlying device.
>>
>> In supporting VMs where we can't control the OS being used,
>> we'd like to separate the VM cpu processing from the host's
>> cpus as a way to help mitigate the impact of the L1TF issue.
>> When running the VM's traffic on a vlan we can stick the Rx
>> processing on one set of cpus separate from the VM's cpus.
>> Yes, choosing to use this may cause a bit of throughput pain
>> when the packets are actually passed into the VM and have to
>> move from one cache to another.
>>
>> Orabug: 28645929
>>
>> Signed-off-by: Silviu Smarandache <silviu.smarandache@oracle.com>
>> Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
>> ---
>>   net/core/dev.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 56 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 0b2d777..1da3f63 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3971,8 +3971,8 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>>    * CPU from the RPS map of the receiving queue for a given skb.
>>    * rcu_read_lock must be held on entry.
>>    */
>> -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>> -		       struct rps_dev_flow **rflowp)
>> +static int __get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>> +			 struct rps_dev_flow **rflowp)
>>   {
>>   	const struct rps_sock_flow_table *sock_flow_table;
>>   	struct netdev_rx_queue *rxqueue = dev->_rx;
>> @@ -4066,6 +4066,35 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>>   	return cpu;
>>   }
>>   
>> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>> +		       struct rps_dev_flow **rflowp)
>> +{
>> +	/* Check for a vlan device with RPS settings */
>> +	if (skb_vlan_tag_present(skb)) {
>> +		struct net_device *vdev;
>> +		u16 vid;
>> +
>> +		vid = skb_vlan_tag_get_id(skb);
>> +		vdev = __vlan_find_dev_deep_rcu(dev, skb->vlan_proto, vid);
>> +		if (vdev) {
>> +			/* recorded queue is not referring to the vlan device.
>> +			 * Save and restore it
>> +			 */
>> +			int cpu;
>> +			u16 queue_mapping = skb_get_queue_mapping(skb);
>> +
>> +			skb_set_queue_mapping(skb, 0);
>> +			cpu = __get_rps_cpu(vdev, skb, rflowp);
>> +			skb_set_queue_mapping(skb, queue_mapping);
> 
> This is really ugly :/

Hence the reason we sent this as an RFC a couple of weeks ago.  We got 
no response, so followed up with this patch in order to get some input. 
Do you have any suggestions for how we might accomplish this in a less 
ugly way?

> 
> Also what makes vlan so special compared to say macvlan ?

Only that vlan was the itch that Silviu needed to scratch.  If we can 
solve this for vlan, then perhaps we'll have a template to follow for 
other upper devices.

sln
Eric Dumazet Oct. 10, 2018, 2:17 a.m. | #3
On 10/09/2018 07:11 PM, Shannon Nelson wrote:
> 
> Hence the reason we sent this as an RFC a couple of weeks ago.  We got no response, so followed up with this patch in order to get some input. Do you have any suggestions for how we might accomplish this in a less ugly way?

I dunno, maybe a modern way for all these very specific needs would be to use an eBPF
hook to implement whatever combination of RPS/RFS/what_have_you

Then, we no longer have to review what various strategies are used by users.
Shannon Nelson Oct. 10, 2018, 4:18 p.m. | #4
On 10/9/2018 7:17 PM, Eric Dumazet wrote:
> 
> 
> On 10/09/2018 07:11 PM, Shannon Nelson wrote:
>>
>> Hence the reason we sent this as an RFC a couple of weeks ago.  We got no response, so followed up with this patch in order to get some input. Do you have any suggestions for how we might accomplish this in a less ugly way?
> 
> I dunno, maybe a modern way for all these very specific needs would be to use an eBPF
> hook to implement whatever combination of RPS/RFS/what_have_you
> 
> Then, we no longer have to review what various strategies are used by users.

We're trying to make use of an existing useful feature that was designed 
for exactly this kind of problem.  It is already there and no new user 
training is needed.  We're actually fixing what could arguably be called 
a bug since the /sys/class/net/<dev>/queues/rx-0/rps_cpus entry exists 
for vlan devices but currently doesn't do anything.  We're also 
addressing a security concern related to the recent L1TF excitement.

For this case, we want to target the network stack processing to happen 
on a certain subset of CPUs.  With admittedly only a cursory look 
through eBPF, I don't see an obvious way to target the packet processing 
to an alternative CPU, unless we add yet another field to the skb that 
eBPF/XDP could fill and then query that field in the same time as we 
currently check get_rps_cpu().  But adding to the skb is usually frowned 
upon unless absolutely necessary, and this seems like a duplication of 
what we already have with RPS, so why add a competing feature?

Back to my earlier question: are there any suggestions for how we might 
accomplish this in a less ugly way?

sln
Eric Dumazet Oct. 10, 2018, 5:14 p.m. | #5
On 10/10/2018 09:18 AM, Shannon Nelson wrote:
> On 10/9/2018 7:17 PM, Eric Dumazet wrote:
>>
>>
>> On 10/09/2018 07:11 PM, Shannon Nelson wrote:
>>>
>>> Hence the reason we sent this as an RFC a couple of weeks ago.  We got no response, so followed up with this patch in order to get some input. Do you have any suggestions for how we might accomplish this in a less ugly way?
>>
>> I dunno, maybe a modern way for all these very specific needs would be to use an eBPF
>> hook to implement whatever combination of RPS/RFS/what_have_you
>>
>> Then, we no longer have to review what various strategies are used by users.
> 
> We're trying to make use of an existing useful feature that was designed for exactly this kind of problem.  It is already there and no new user training is needed.  We're actually fixing what could arguably be called a bug since the /sys/class/net/<dev>/queues/rx-0/rps_cpus entry exists for vlan devices but currently doesn't do anything.  We're also addressing a security concern related to the recent L1TF excitement.
> 
> For this case, we want to target the network stack processing to happen on a certain subset of CPUs.  With admittedly only a cursory look through eBPF, I don't see an obvious way to target the packet processing to an alternative CPU, unless we add yet another field to the skb that eBPF/XDP could fill and then query that field in the same time as we currently check get_rps_cpu().  But adding to the skb is usually frowned upon unless absolutely necessary, and this seems like a duplication of what we already have with RPS, so why add a competing feature?
> 
> Back to my earlier question: are there any suggestions for how we might accomplish this in a less ugly way?


What if you want to have efficient multi queue processing ?
The Vlan device could have multiple RX queues, but you forced queue_mapping=0

Honestly, RPS & RFS show their age and complexity (look at net/core/net-sysfs.c ...)

We should not expand it, we should put in place a new infrastructure, fully expandable.
With socket lookups, we even can avoid having a hashtable for flow information, removing
one cache miss, and removing flow collisions.

eBPF seems perfect to me.

It is time that we stop adding core infra that most users do not need/use.
(RPS and RFS are default off)
John Fastabend Oct. 10, 2018, 5:37 p.m. | #6
On 10/10/2018 10:14 AM, Eric Dumazet wrote:
> 
> 
> On 10/10/2018 09:18 AM, Shannon Nelson wrote:
>> On 10/9/2018 7:17 PM, Eric Dumazet wrote:
>>>
>>>
>>> On 10/09/2018 07:11 PM, Shannon Nelson wrote:
>>>>
>>>> Hence the reason we sent this as an RFC a couple of weeks ago.  We got no response, so followed up with this patch in order to get some input. Do you have any suggestions for how we might accomplish this in a less ugly way?
>>>
>>> I dunno, maybe a modern way for all these very specific needs would be to use an eBPF
>>> hook to implement whatever combination of RPS/RFS/what_have_you
>>>
>>> Then, we no longer have to review what various strategies are used by users.
>>
>> We're trying to make use of an existing useful feature that was designed for exactly this kind of problem.  It is already there and no new user training is needed.  We're actually fixing what could arguably be called a bug since the /sys/class/net/<dev>/queues/rx-0/rps_cpus entry exists for vlan devices but currently doesn't do anything.  We're also addressing a security concern related to the recent L1TF excitement.
>>
>> For this case, we want to target the network stack processing to happen on a certain subset of CPUs.  With admittedly only a cursory look through eBPF, I don't see an obvious way to target the packet processing to an alternative CPU, unless we add yet another field to the skb that eBPF/XDP could fill and then query that field in the same time as we currently check get_rps_cpu().  But adding to the skb is usually frowned upon unless absolutely necessary, and this seems like a duplication of what we already have with RPS, so why add a competing feature?
>>
>> Back to my earlier question: are there any suggestions for how we might accomplish this in a less ugly way?
> 
> 
> What if you want to have efficient multi queue processing ?
> The Vlan device could have multiple RX queues, but you forced queue_mapping=0
> 
> Honestly, RPS & RFS show their age and complexity (look at net/core/net-sysfs.c ...)
> 
> We should not expand it, we should put in place a new infrastructure, fully expandable.
> With socket lookups, we even can avoid having a hashtable for flow information, removing
> one cache miss, and removing flow collisions.
> 
> eBPF seems perfect to me.
> 

Latest tree has a sk_lookup() helper supported in 'tc' layer now
to lookup the socket. And XDP has support for a "cpumap" object
that allows redirect to remote CPUs. Neither was specifically
designed for this but I suspect with some extra work these might
be what is needed.

I would start by looking at bpf_sk_lookup() in filter.c and the
cpumap type in ./kernel/bpf/cpumap.c, also in general sk_lookup
from XDP layer will likely be needed shortly anyways.

> It is time that we stop adding core infra that most users do not need/use.
> (RPS and RFS are default off)
>
Shannon Nelson Oct. 10, 2018, 6:23 p.m. | #7
On 10/10/2018 10:14 AM, Eric Dumazet wrote:
> 
> On 10/10/2018 09:18 AM, Shannon Nelson wrote:
>> On 10/9/2018 7:17 PM, Eric Dumazet wrote:
>>>
>>>
>>> On 10/09/2018 07:11 PM, Shannon Nelson wrote:
>>>>
>>>> Hence the reason we sent this as an RFC a couple of weeks ago.  We got no response, so followed up with this patch in order to get some input. Do you have any suggestions for how we might accomplish this in a less ugly way?
>>>
>>> I dunno, maybe a modern way for all these very specific needs would be to use an eBPF
>>> hook to implement whatever combination of RPS/RFS/what_have_you
>>>
>>> Then, we no longer have to review what various strategies are used by users.
>>
>> We're trying to make use of an existing useful feature that was designed for exactly this kind of problem.  It is already there and no new user training is needed.  We're actually fixing what could arguably be called a bug since the /sys/class/net/<dev>/queues/rx-0/rps_cpus entry exists for vlan devices but currently doesn't do anything.  We're also addressing a security concern related to the recent L1TF excitement.
>>
>> For this case, we want to target the network stack processing to happen on a certain subset of CPUs.  With admittedly only a cursory look through eBPF, I don't see an obvious way to target the packet processing to an alternative CPU, unless we add yet another field to the skb that eBPF/XDP could fill and then query that field in the same time as we currently check get_rps_cpu().  But adding to the skb is usually frowned upon unless absolutely necessary, and this seems like a duplication of what we already have with RPS, so why add a competing feature?
>>
>> Back to my earlier question: are there any suggestions for how we might accomplish this in a less ugly way?
> 
> 
> What if you want to have efficient multi queue processing ?
> The Vlan device could have multiple RX queues, but you forced queue_mapping=0

This would be easy enough to change to a simple modulus of a recorded 
queue mapping with the number of queues in the vlan device.  We can fix 
that.

> 
> Honestly, RPS & RFS show their age and complexity (look at net/core/net-sysfs.c ...)
> 
> We should not expand it, we should put in place a new infrastructure, fully expandable.
> With socket lookups, we even can avoid having a hashtable for flow information, removing
> one cache miss, and removing flow collisions.
> 
> eBPF seems perfect to me.

And yet specifying CPUs for processing is exactly what RPS was designed for.

> 
> It is time that we stop adding core infra that most users do not need/use.
> (RPS and RFS are default off)

For that matter, lots of features are "default off" until someone 
configures them, and there are lots of features that are only used by a 
subset of users.  In this case, we're trying to use something that 
already exists and arguably is broken for the vlan case.  Are there some 
technical reasons why this is not a good solution?

sln
Shannon Nelson Oct. 10, 2018, 6:25 p.m. | #8
On 10/10/2018 10:37 AM, John Fastabend wrote:
> Latest tree has a sk_lookup() helper supported in 'tc' layer now
> to lookup the socket. And XDP has support for a "cpumap" object
> that allows redirect to remote CPUs. Neither was specifically
> designed for this but I suspect with some extra work these might
> be what is needed.
> 
> I would start by looking at bpf_sk_lookup() in filter.c and the
> cpumap type in ./kernel/bpf/cpumap.c, also in general sk_lookup
> from XDP layer will likely be needed shortly anyways.

Thanks, John, for pointing to something I can start looking at.  My 
customer still wants to use the RPS that they already know, but I'll 
start looking into how this also might work for us.

sln
Eric Dumazet Oct. 10, 2018, 6:32 p.m. | #9
On 10/10/2018 11:23 AM, Shannon Nelson wrote:
> For that matter, lots of features are "default off" until someone configures them, and there are lots of features that are only used by a subset of users.  In this case, we're trying to use something that already exists and arguably is broken for the vlan case.  Are there some technical reasons why this is not a good solution?

Simply code maintenance, and added cost for non-vlan users.

Adding extra tests might help your use-case, but is slowing down other cases.

Since the need for new stuff will never end, eBPF really allows for arbitrary usage,
without the need to review kernel patches and maintain them for the next 10 years.

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index 0b2d777..1da3f63 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3971,8 +3971,8 @@  set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
  * CPU from the RPS map of the receiving queue for a given skb.
  * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-		       struct rps_dev_flow **rflowp)
+static int __get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+			 struct rps_dev_flow **rflowp)
 {
 	const struct rps_sock_flow_table *sock_flow_table;
 	struct netdev_rx_queue *rxqueue = dev->_rx;
@@ -4066,6 +4066,35 @@  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	return cpu;
 }
 
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	/* Check for a vlan device with RPS settings */
+	if (skb_vlan_tag_present(skb)) {
+		struct net_device *vdev;
+		u16 vid;
+
+		vid = skb_vlan_tag_get_id(skb);
+		vdev = __vlan_find_dev_deep_rcu(dev, skb->vlan_proto, vid);
+		if (vdev) {
+			/* recorded queue is not referring to the vlan device.
+			 * Save and restore it
+			 */
+			int cpu;
+			u16 queue_mapping = skb_get_queue_mapping(skb);
+
+			skb_set_queue_mapping(skb, 0);
+			cpu = __get_rps_cpu(vdev, skb, rflowp);
+			skb_set_queue_mapping(skb, queue_mapping);
+			if (cpu != -1)
+				return cpu;
+		}
+	}
+
+	/* Fall back to RPS settings of original device */
+	return __get_rps_cpu(dev, skb, rflowp);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 /**
@@ -4437,12 +4466,23 @@  static int netif_rx_internal(struct sk_buff *skb)
 		preempt_disable();
 		rcu_read_lock();
 
+		/* strip any vlan tag before calling get_rps_cpu() */
+		if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+		    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+			skb = skb_vlan_untag(skb);
+			if (unlikely(!skb)) {
+				ret = NET_RX_DROP;
+				goto unlock;
+			}
+		}
+
 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 		if (cpu < 0)
 			cpu = smp_processor_id();
 
 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
+unlock:
 		rcu_read_unlock();
 		preempt_enable();
 	} else
@@ -5095,8 +5135,19 @@  static int netif_receive_skb_internal(struct sk_buff *skb)
 #ifdef CONFIG_RPS
 	if (static_key_false(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
-		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		int cpu;
+
+		/* strip any vlan tag before calling get_rps_cpu() */
+		if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+		    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+			skb = skb_vlan_untag(skb);
+			if (unlikely(!skb)) {
+				ret = NET_RX_DROP;
+				goto out;
+			}
+		}
 
+		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 		if (cpu >= 0) {
 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 			rcu_read_unlock();
@@ -5105,6 +5156,8 @@  static int netif_receive_skb_internal(struct sk_buff *skb)
 	}
 #endif
 	ret = __netif_receive_skb(skb);
+
+out:
 	rcu_read_unlock();
 	return ret;
 }