
[ovs-dev,net-next,v1,1/1] net: openvswitch: ovs_packet_cmd_execute put sw_flow mainbody in stack

Message ID OS3P286MB229509B8AD0264CC84F57845F5A69@OS3P286MB2295.JPNP286.PROD.OUTLOOK.COM
State Handled Elsewhere
Series [ovs-dev,net-next,v1,1/1] net: openvswitch: ovs_packet_cmd_execute put sw_flow mainbody in stack

Commit Message

缘 陶 Feb. 18, 2023, 6:53 a.m. UTC
Add two performance revisions to ovs_packet_cmd_execute:

1. Store the main body of the sw_flow (600+ bytes) on the stack.
   Benefit: avoids the kmem cache alloc/free done by ovs_flow_alloc/free.

2. Define sw_flow_without_stats_init to initialize the main body of
   struct sw_flow without providing memory for sw_flow_stats.
   Reason: ovs_execute_actions does not touch sw_flow_stats.
   Benefit: less memory zeroing. Each 'sw_flow_stats *' pointer takes
   4/8 bytes, so on systems with 20 to 128 logical CPUs this is a
   worthwhile saving.

Signed-off-by: Eddy Tao <taoyuan_eddy@hotmail.com>
---
 net/openvswitch/datapath.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

Comments

Simon Horman Feb. 19, 2023, 1:54 p.m. UTC | #1
On Sat, Feb 18, 2023 at 02:53:29PM +0800, Eddy Tao wrote:
> Add two performance revisions to ovs_packet_cmd_execute:

I think that in general it's nicer to do one change per patch:
i.e. split this into two patches.

> 1. Store the main body of the sw_flow (600+ bytes) on the stack.
>    Benefit: avoids the kmem cache alloc/free done by ovs_flow_alloc/free.

Perhaps I am wrong, but 600 bytes seems like a lot of stack memory to
consume, and thus probably needs strong justification.
Do you have some performance numbers showing a benefit of this change?

> 2. Define sw_flow_without_stats_init to initialize the main body of
>    struct sw_flow without providing memory for sw_flow_stats.
>    Reason: ovs_execute_actions does not touch sw_flow_stats.

Are there other code paths that would also benefit from this change?

>    Benefit: less memory zeroing. Each 'sw_flow_stats *' pointer takes
>    4/8 bytes, so on systems with 20 to 128 logical CPUs this is a
>    worthwhile saving.

Less is more :)
Do you have some performance numbers showing a benefit of this change?

> Signed-off-by: Eddy Tao <taoyuan_eddy@hotmail.com>
> ---
>  net/openvswitch/datapath.c | 22 ++++++++++++----------
>  1 file changed, 12 insertions(+), 10 deletions(-)
> 
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index fcee6012293b..337947d34355 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -589,6 +589,12 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>  	return err;
>  }
>  
> +static void sw_flow_without_stats_init(struct sw_flow *flow)
> +{
> +	memset(flow, 0, sizeof(*flow));
> +	flow->stats_last_writer = -1;
> +}
> +
>  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>  {
>  	struct ovs_header *ovs_header = info->userhdr;
> @@ -596,7 +602,8 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>  	struct nlattr **a = info->attrs;
>  	struct sw_flow_actions *acts;
>  	struct sk_buff *packet;
> -	struct sw_flow *flow;
> +	struct sw_flow f;
> +	struct sw_flow *flow = &f;

I'm not sure it's really useful to have both f and flow.
Could we just have the following?

	struct sw_flow *flow;

Also, it would be nice to move towards rather than away from
reverse xmas tree - longest line to shortest line - arrangement of local
variables in OVS code.
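
For illustration only (a partial sketch, showing just some of the locals
already present in this function), reverse xmas tree ordering would look
roughly like:

        struct sw_flow_actions *sf_acts;
        struct sw_flow_actions *acts;
        struct vport *input_vport;
        struct sk_buff *packet;
        struct sw_flow *flow;
        struct datapath *dp;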

>  	struct sw_flow_actions *sf_acts;
>  	struct datapath *dp;
>  	struct vport *input_vport;
> @@ -636,20 +643,18 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>  	}
>  
>  	/* Build an sw_flow for sending this packet. */
> -	flow = ovs_flow_alloc();
> -	err = PTR_ERR(flow);
> -	if (IS_ERR(flow))
> -		goto err_kfree_skb;
> +	/* This flow has no sw_flow_stats */
> +	sw_flow_without_stats_init(flow);
>  
>  	err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
>  					     packet, &flow->key, log);
>  	if (err)
> -		goto err_flow_free;
> +		goto err_kfree_skb;
>  
>  	err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
>  				   &flow->key, &acts, log);
>  	if (err)
> -		goto err_flow_free;
> +		goto err_kfree_skb;
>  
>  	rcu_assign_pointer(flow->sf_acts, acts);
>  	packet->priority = flow->key.phy.priority;
> @@ -677,13 +682,10 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>  	local_bh_enable();
>  	rcu_read_unlock();
>  
> -	ovs_flow_free(flow, false);
>  	return err;
>  
>  err_unlock:
>  	rcu_read_unlock();
> -err_flow_free:
> -	ovs_flow_free(flow, false);
>  err_kfree_skb:
>  	kfree_skb(packet);
>  err:
> -- 
> 2.27.0
>
缘 陶 Feb. 19, 2023, 2:46 p.m. UTC | #2
Hi, Simon:

     Thanks for looking into this.

The revisions I proposed are complementary, serve the same purpose, and
reside in the same code segment.

I listed them as two items to clarify the details. Maybe it would be
better to call them two steps of the same revision to avoid confusion.


And yes, I do have some performance results, shown below.

Testing topology

             |-----|
       nic1--|     |--nic1
       nic2--|     |--nic2
VM1(16cpus) | ovs |   VM2(16 cpus)
       nic3--|     |--nic3
       nic4--|     |--nic4
             |-----|
2 netperf client threads on each vnic

netperf -H $peer -p $((port+$i)) -t UDP_RR  -l 60 -- -R 1 -r 8K,8K
netperf -H $peer -p $((port+$i)) -t TCP_RR  -l 60 -- -R 1 -r 120,240
netperf -H $peer -p $((port+$i)) -t TCP_CRR -l 60 -- -R 1 -r 120,240

   Mode Iterations   Variance    Average

UDP_RR         10      %1.33      48472 ==> before the change
UDP_RR         10      %2.13      49130 ==> after  the change

TCP_RR         10      %4.56      79686 ==> before the change
TCP_RR         10      %3.42      79833 ==> after  the change

TCP_CRR        10      %0.16      20596 ==> before the change
TCP_CRR        10      %0.11      21179 ==> after  the change

Thanks

eddy
缘 陶 Feb. 19, 2023, 2:51 p.m. UTC | #3
>> Are there other code-paths that would also benefit from this change.

The change is focused on packets going from user space to the datapath;
I do not see other code paths that could benefit from this change.

eddy
缘 陶 Feb. 20, 2023, 3:04 a.m. UTC | #4
Hi, Simon:

     To get better visibility into the effect of the patch, I did another
test, described below.

I disabled datapath flow installation to steer all traffic onto the slow
path, so that I could observe performance on the slow path, where
ovs_packet_cmd_execute is used extensively.


Testing topology

             |-----|
       nic1--|     |--nic1
       nic2--|     |--nic2
VM1(16cpus) | ovs |   VM2(16 cpus)
       nic3--|     |--nic3
       nic4--|     |--nic4
             |-----|
2 netperf client threads on each vnic

netperf -H $peer -p $((port+$i)) -t TCP_STREAM  -l 60
netperf -H $peer -p $((port+$i)) -t TCP_RR  -l 60 -- -R 1 -r 120,240
netperf -H $peer -p $((port+$i)) -t TCP_CRR -l 60 -- -R 1 -r 120,240

   Mode Iterations   Variance    Average

TCP_STREAM     10      %3.83       1433 ==> before the change
TCP_STREAM     10      %3.39       1504 ==> after  the change

TCP_RR         10      %2.35      45145 ==> before the change
TCP_RR         10      %1.06      47250 ==> after  the change

TCP_CRR        10      %0.54      11310 ==> before the change
TCP_CRR        10      %2.64      12741 ==> after  the change


Considering the size and simplicity of the patch, I would say the
performance benefit is decent.

Thanks

eddy
缘 陶 Feb. 20, 2023, 3:44 a.m. UTC | #5
More explanation of the meaning of the performance data:

 >> Mode Iterations   Variance    Average

Iterations: the number of executions of the same test case (each
iteration yields a performance value perf[i]).

Average: the sum over all iterations divided by the number of
iterations. Pseudo code:

                for (sum = 0, i = 0; i < iterations; i++)
                        sum += perf[i];
                average = sum / iterations;

Variance: a percentage value describing the overall deviation of the
performance results from the average. Pseudo code:

                for (deviation = 0, i = 0; i < iterations; i++)
                        deviation += square(perf[i] - average);
                variance = (square_root(deviation / iterations) * 100) / average;
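
For reference, here is a minimal standalone C sketch of the computation
described above (not code from the actual test harness):

#include <math.h>

/* Returns the variance as a percentage of the average, as defined above.
 * 'perf' holds one performance value per iteration.
 */
static double variance_percent(const double *perf, int iterations,
                               double *average)
{
        double sum = 0.0, deviation = 0.0;
        int i;

        for (i = 0; i < iterations; i++)
                sum += perf[i];
        *average = sum / iterations;

        for (i = 0; i < iterations; i++)
                deviation += (perf[i] - *average) * (perf[i] - *average);

        return sqrt(deviation / iterations) * 100.0 / *average;
}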


缘 陶 Feb. 20, 2023, 7:11 a.m. UTC | #6
Hi, Simon:


     Regarding your concern about the stack size: it points to more room
for improvement.

I will send a new version with a smaller stack footprint and better
performance.


The new revision is inspired by existing examples of placing structs on
the stack in the same file, net/openvswitch/datapath.c:

struct sw_flow_actions *get_flow_actions(..)
{
     struct sw_flow_key masked_key;   ==> sizeof sw_flow_key is 464 bytes

static noinline_for_stack int
ovs_nla_init_match_and_action(..)
{
     struct sw_flow_mask mask;        ==> sizeof sw_flow_mask is 496 bytes


The first example reminded me: revisiting the code in
ovs_packet_cmd_execute, the sw_flow basically serves as a container for
sw_flow_actions and sw_flow_key only.

We do not need the bulk of the tunnel info memory in sw_flow, which saves
a further 200+ bytes -- less is more.
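
As a very rough sketch of the direction (a hypothetical illustration only,
not the actual v2 patch; the exact layout still needs to be worked out),
ovs_packet_cmd_execute would keep just the key and an actions pointer on
the stack instead of a full sw_flow:

        struct sw_flow_key key;       /* on the stack, no sw_flow wrapper */
        struct sw_flow_actions *acts;

        memset(&key, 0, sizeof(key));
        err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
                                             packet, &key, log);
        if (err)
                goto err_kfree_skb;

        err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
                                   &key, &acts, log);
        if (err)
                goto err_kfree_skb;

        /* ... */
        err = ovs_execute_actions(dp, packet, acts, &key);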


The new revision will be presented shortly, after some sanity testing
and benchmarking.

eddy
Simon Horman Feb. 20, 2023, 10:37 a.m. UTC | #7
Thanks for addressing my review.
It is the stack size issue that is my greatest concern.

Please consider including performance results
in the patch description of v2.

Kind regards,
Simon

Patch

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index fcee6012293b..337947d34355 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -589,6 +589,12 @@  static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	return err;
 }
 
+static void sw_flow_without_stats_init(struct sw_flow *flow)
+{
+	memset(flow, 0, sizeof(*flow));
+	flow->stats_last_writer = -1;
+}
+
 static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 {
 	struct ovs_header *ovs_header = info->userhdr;
@@ -596,7 +602,8 @@  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	struct nlattr **a = info->attrs;
 	struct sw_flow_actions *acts;
 	struct sk_buff *packet;
-	struct sw_flow *flow;
+	struct sw_flow f;
+	struct sw_flow *flow = &f;
 	struct sw_flow_actions *sf_acts;
 	struct datapath *dp;
 	struct vport *input_vport;
@@ -636,20 +643,18 @@  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	/* Build an sw_flow for sending this packet. */
-	flow = ovs_flow_alloc();
-	err = PTR_ERR(flow);
-	if (IS_ERR(flow))
-		goto err_kfree_skb;
+	/* This flow has no sw_flow_stats */
+	sw_flow_without_stats_init(flow);
 
 	err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
 					     packet, &flow->key, log);
 	if (err)
-		goto err_flow_free;
+		goto err_kfree_skb;
 
 	err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
 				   &flow->key, &acts, log);
 	if (err)
-		goto err_flow_free;
+		goto err_kfree_skb;
 
 	rcu_assign_pointer(flow->sf_acts, acts);
 	packet->priority = flow->key.phy.priority;
@@ -677,13 +682,10 @@  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	local_bh_enable();
 	rcu_read_unlock();
 
-	ovs_flow_free(flow, false);
 	return err;
 
 err_unlock:
 	rcu_read_unlock();
-err_flow_free:
-	ovs_flow_free(flow, false);
 err_kfree_skb:
 	kfree_skb(packet);
 err: