[ovs-dev,ovn] ovn-controller.c: Avoid adding neighbour flows for non-local datapaths.
diff mbox series

Message ID 1582068734-70719-1-git-send-email-hzhou@ovn.org
State New
Headers show
Series
  • [ovs-dev,ovn] ovn-controller.c: Avoid adding neighbour flows for non-local datapaths.
Related show

Commit Message

Han Zhou Feb. 18, 2020, 11:32 p.m. UTC
This is usefule when external_ids:ovn-monitor-all is set to true.

Signed-off-by: Han Zhou <hzhou@ovn.org>
---
 controller/lflow.c          | 15 +++++++++++----
 controller/lflow.h          |  1 +
 controller/ovn-controller.c |  6 +++++-
 3 files changed, 17 insertions(+), 5 deletions(-)

Comments

Dumitru Ceara Feb. 19, 2020, 12:50 p.m. UTC | #1
On 2/19/20 12:32 AM, Han Zhou wrote:
> This is usefule when external_ids:ovn-monitor-all is set to true.
> 
> Signed-off-by: Han Zhou <hzhou@ovn.org>

Hi Han,

Looks good to me.

Acked-by: Dumitru Ceara <dceara@redhat.com>

I also tested this (together with your previous patch) on a scaled setup
with 150 ovn-fake-multinode nodes and ovn-monitor-all enabled.

With OVN master I see high CPU usage on ovn-controllers from time to time:

ovn-netlab1-64/ovn-controller.log:2020-02-19T12:14:11.896Z|00017|timeval|WARN|Unreasonably
long 1087ms poll interval (224ms user, 14ms system)
ovn-netlab1-140/ovn-controller.log:2020-02-19T12:14:12.030Z|00017|timeval|WARN|Unreasonably
long 1055ms poll interval (241ms user, 11ms system)
ovn-netlab1-69/ovn-controller.log:2020-02-19T12:14:11.856Z|00017|timeval|WARN|Unreasonably
long 1019ms poll interval (221ms user, 1ms system)
ovn-netlab1-25/ovn-controller.log:2020-02-19T12:14:11.857Z|00017|timeval|WARN|Unreasonably
long 1053ms poll interval (230ms user, 9ms system)
ovn-netlab1-48/ovn-controller.log:2020-02-19T12:14:11.827Z|00017|timeval|WARN|Unreasonably
long 1005ms poll interval (245ms user, 22ms system)
ovn-netlab1-80/ovn-controller.log:2020-02-19T12:14:11.936Z|00017|timeval|WARN|Unreasonably
long 1127ms poll interval (218ms user, 2ms system)
ovn-netlab1-56/ovn-controller.log:2020-02-19T12:14:01.202Z|00017|timeval|WARN|Unreasonably
long 1016ms poll interval (224ms user, 0ms system)
ovn-netlab1-24/ovn-controller.log:2020-02-19T12:14:22.623Z|00017|timeval|WARN|Unreasonably
long 1022ms poll interval (227ms user, 1ms system)
ovn-netlab1-65/ovn-controller.log:2020-02-19T12:13:19.585Z|00017|timeval|WARN|Unreasonably
long 1012ms poll interval (213ms user, 1ms system)
ovn-netlab1-46/ovn-controller.log:2020-02-19T12:14:11.893Z|00017|timeval|WARN|Unreasonably
long 1086ms poll interval (225ms user, 0ms system)
ovn-netlab1-21/ovn-controller.log:2020-02-19T12:13:19.586Z|00017|timeval|WARN|Unreasonably
long 1031ms poll interval (222ms user, 0ms system)

With your changes this happens less often:
./localhost/ovn-netlab1-63/ovn-controller.log:2020-02-19T12:46:10.204Z|00017|timeval|WARN|Unreasonably
long 1038ms poll interval (223ms user, 1ms system)
./localhost/ovn-netlab1-67/ovn-controller.log:2020-02-19T12:45:59.677Z|00017|timeval|WARN|Unreasonably
long 1033ms poll interval (215ms user, 0ms system)
./localhost/ovn-netlab1-96/ovn-controller.log:2020-02-19T12:46:10.261Z|00017|timeval|WARN|Unreasonably
long 1009ms poll interval (219ms user, 1ms system)
./localhost/ovn-netlab1-43/ovn-controller.log:2020-02-19T12:46:10.194Z|00017|timeval|WARN|Unreasonably
long 1044ms poll interval (222ms user, 0ms system)
./localhost/ovn-netlab1-58/ovn-controller.log:2020-02-19T12:46:10.253Z|00017|timeval|WARN|Unreasonably
long 1091ms poll interval (225ms user, 12ms system)
./localhost/ovn-netlab1-95/ovn-controller.log:2020-02-19T12:46:10.246Z|00017|timeval|WARN|Unreasonably
long 1031ms poll interval (216ms user, 16ms system)


Regards,
Dumitru

> ---
>  controller/lflow.c          | 15 +++++++++++----
>  controller/lflow.h          |  1 +
>  controller/ovn-controller.c |  6 +++++-
>  3 files changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/controller/lflow.c b/controller/lflow.c
> index 79d8913..3c10a16 100644
> --- a/controller/lflow.c
> +++ b/controller/lflow.c
> @@ -792,12 +792,14 @@ put_load(const uint8_t *data, size_t len,
>  
>  static void
>  consider_neighbor_flow(struct ovsdb_idl_index *sbrec_port_binding_by_name,
> +                       const struct hmap *local_datapaths,
>                         const struct sbrec_mac_binding *b,
>                         struct ovn_desired_flow_table *flow_table)
>  {
>      const struct sbrec_port_binding *pb
>          = lport_lookup_by_name(sbrec_port_binding_by_name, b->logical_port);
> -    if (!pb) {
> +    if (!pb || !get_local_datapath(local_datapaths,
> +                                   pb->datapath->tunnel_key)) {
>          return;
>      }
>  
> @@ -869,11 +871,13 @@ consider_neighbor_flow(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>  static void
>  add_neighbor_flows(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>                     const struct sbrec_mac_binding_table *mac_binding_table,
> +                   const struct hmap *local_datapaths,
>                     struct ovn_desired_flow_table *flow_table)
>  {
>      const struct sbrec_mac_binding *b;
>      SBREC_MAC_BINDING_TABLE_FOR_EACH (b, mac_binding_table) {
> -        consider_neighbor_flow(sbrec_port_binding_by_name, b, flow_table);
> +        consider_neighbor_flow(sbrec_port_binding_by_name, local_datapaths,
> +                               b, flow_table);
>      }
>  }
>  
> @@ -882,6 +886,7 @@ void
>  lflow_handle_changed_neighbors(
>      struct ovsdb_idl_index *sbrec_port_binding_by_name,
>      const struct sbrec_mac_binding_table *mac_binding_table,
> +    const struct hmap *local_datapaths,
>      struct ovn_desired_flow_table *flow_table)
>  {
>      const struct sbrec_mac_binding *mb;
> @@ -904,7 +909,8 @@ lflow_handle_changed_neighbors(
>              }
>              VLOG_DBG("handle new mac_binding "UUID_FMT,
>                       UUID_ARGS(&mb->header_.uuid));
> -            consider_neighbor_flow(sbrec_port_binding_by_name, mb, flow_table);
> +            consider_neighbor_flow(sbrec_port_binding_by_name, local_datapaths,
> +                                   mb, flow_table);
>          }
>      }
>  }
> @@ -934,7 +940,8 @@ lflow_run(struct lflow_ctx_in *l_ctx_in, struct lflow_ctx_out *l_ctx_out)
>  
>      add_logical_flows(l_ctx_in, l_ctx_out);
>      add_neighbor_flows(l_ctx_in->sbrec_port_binding_by_name,
> -                       l_ctx_in->mac_binding_table, l_ctx_out->flow_table);
> +                       l_ctx_in->mac_binding_table, l_ctx_in->local_datapaths,
> +                       l_ctx_out->flow_table);
>  }
>  
>  void
> diff --git a/controller/lflow.h b/controller/lflow.h
> index 8433cc0..f02f709 100644
> --- a/controller/lflow.h
> +++ b/controller/lflow.h
> @@ -150,6 +150,7 @@ bool lflow_handle_changed_ref(enum ref_type, const char *ref_name,
>  void lflow_handle_changed_neighbors(
>      struct ovsdb_idl_index *sbrec_port_binding_by_name,
>      const struct sbrec_mac_binding_table *,
> +    const struct hmap *local_datapaths,
>      struct ovn_desired_flow_table *);
>  
>  void lflow_destroy(void);
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 4d245ca..e7773b8 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -1504,11 +1504,15 @@ flow_output_sb_mac_binding_handler(struct engine_node *node, void *data)
>          (struct sbrec_mac_binding_table *)EN_OVSDB_GET(
>              engine_get_input("SB_mac_binding", node));
>  
> +    struct ed_type_runtime_data *rt_data =
> +        engine_get_input_data("runtime_data", node);
> +    const struct hmap *local_datapaths = &rt_data->local_datapaths;
> +
>      struct ed_type_flow_output *fo = data;
>      struct ovn_desired_flow_table *flow_table = &fo->flow_table;
>  
>      lflow_handle_changed_neighbors(sbrec_port_binding_by_name,
> -            mac_binding_table, flow_table);
> +            mac_binding_table, local_datapaths, flow_table);
>  
>      engine_set_node_state(node, EN_UPDATED);
>      return true;
>
Han Zhou Feb. 19, 2020, 5:36 p.m. UTC | #2
On Wed, Feb 19, 2020 at 4:50 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 2/19/20 12:32 AM, Han Zhou wrote:
> > This is usefule when external_ids:ovn-monitor-all is set to true.
> >
> > Signed-off-by: Han Zhou <hzhou@ovn.org>
>
> Hi Han,
>
> Looks good to me.
>
> Acked-by: Dumitru Ceara <dceara@redhat.com>
>
> I also tested this (together with your previous patch) on a scaled setup
> with 150 ovn-fake-multinode nodes and ovn-monitor-all enabled.
>
> With OVN master I see high CPU usage on ovn-controllers from time to time:
>
>
ovn-netlab1-64/ovn-controller.log:2020-02-19T12:14:11.896Z|00017|timeval|WARN|Unreasonably
> long 1087ms poll interval (224ms user, 14ms system)
>
ovn-netlab1-140/ovn-controller.log:2020-02-19T12:14:12.030Z|00017|timeval|WARN|Unreasonably
> long 1055ms poll interval (241ms user, 11ms system)
>
ovn-netlab1-69/ovn-controller.log:2020-02-19T12:14:11.856Z|00017|timeval|WARN|Unreasonably
> long 1019ms poll interval (221ms user, 1ms system)
>
ovn-netlab1-25/ovn-controller.log:2020-02-19T12:14:11.857Z|00017|timeval|WARN|Unreasonably
> long 1053ms poll interval (230ms user, 9ms system)
>
ovn-netlab1-48/ovn-controller.log:2020-02-19T12:14:11.827Z|00017|timeval|WARN|Unreasonably
> long 1005ms poll interval (245ms user, 22ms system)
>
ovn-netlab1-80/ovn-controller.log:2020-02-19T12:14:11.936Z|00017|timeval|WARN|Unreasonably
> long 1127ms poll interval (218ms user, 2ms system)
>
ovn-netlab1-56/ovn-controller.log:2020-02-19T12:14:01.202Z|00017|timeval|WARN|Unreasonably
> long 1016ms poll interval (224ms user, 0ms system)
>
ovn-netlab1-24/ovn-controller.log:2020-02-19T12:14:22.623Z|00017|timeval|WARN|Unreasonably
> long 1022ms poll interval (227ms user, 1ms system)
>
ovn-netlab1-65/ovn-controller.log:2020-02-19T12:13:19.585Z|00017|timeval|WARN|Unreasonably
> long 1012ms poll interval (213ms user, 1ms system)
>
ovn-netlab1-46/ovn-controller.log:2020-02-19T12:14:11.893Z|00017|timeval|WARN|Unreasonably
> long 1086ms poll interval (225ms user, 0ms system)
>
ovn-netlab1-21/ovn-controller.log:2020-02-19T12:13:19.586Z|00017|timeval|WARN|Unreasonably
> long 1031ms poll interval (222ms user, 0ms system)
>
> With your changes this happens less often:
>
./localhost/ovn-netlab1-63/ovn-controller.log:2020-02-19T12:46:10.204Z|00017|timeval|WARN|Unreasonably
> long 1038ms poll interval (223ms user, 1ms system)
>
./localhost/ovn-netlab1-67/ovn-controller.log:2020-02-19T12:45:59.677Z|00017|timeval|WARN|Unreasonably
> long 1033ms poll interval (215ms user, 0ms system)
>
./localhost/ovn-netlab1-96/ovn-controller.log:2020-02-19T12:46:10.261Z|00017|timeval|WARN|Unreasonably
> long 1009ms poll interval (219ms user, 1ms system)
>
./localhost/ovn-netlab1-43/ovn-controller.log:2020-02-19T12:46:10.194Z|00017|timeval|WARN|Unreasonably
> long 1044ms poll interval (222ms user, 0ms system)
>
./localhost/ovn-netlab1-58/ovn-controller.log:2020-02-19T12:46:10.253Z|00017|timeval|WARN|Unreasonably
> long 1091ms poll interval (225ms user, 12ms system)
>
./localhost/ovn-netlab1-95/ovn-controller.log:2020-02-19T12:46:10.246Z|00017|timeval|WARN|Unreasonably
> long 1031ms poll interval (216ms user, 16ms system)
>
>
> Regards,
> Dumitru
>
Thanks Dumitru for reviewing and testing it out.
Are you seeing high CPU only after applying this patch? In theory I think
this patch should not contribute to CPU spike.
Enabling ovn-monitor-all can result in higher CPU in ovn-controller in
circumstances when not all datapaths are local. In your test case, is the
topology ideal for ovn-monitor-all? I.e. does each node cares about all
datapaths? If the answer is yes, then could you try enabling
ovn-monitor-all only on half of the nodes, and see if the nodes with
ovn-monitor-all enabled are with higher CPU than others?

In addition, did you see any difference of CPU usage on SB DB?

Thanks,
Han
Dumitru Ceara Feb. 20, 2020, 9:58 a.m. UTC | #3
On 2/19/20 6:36 PM, Han Zhou wrote:
> 
> 
> On Wed, Feb 19, 2020 at 4:50 AM Dumitru Ceara <dceara@redhat.com
> <mailto:dceara@redhat.com>> wrote:
>>
>> On 2/19/20 12:32 AM, Han Zhou wrote:
>> > This is usefule when external_ids:ovn-monitor-all is set to true.
>> >
>> > Signed-off-by: Han Zhou <hzhou@ovn.org <mailto:hzhou@ovn.org>>
>>
>> Hi Han,
>>
>> Looks good to me.
>>
>> Acked-by: Dumitru Ceara <dceara@redhat.com <mailto:dceara@redhat.com>>
>>
>> I also tested this (together with your previous patch) on a scaled setup
>> with 150 ovn-fake-multinode nodes and ovn-monitor-all enabled.
>>
>> With OVN master I see high CPU usage on ovn-controllers from time to time:
>>
>>
> ovn-netlab1-64/ovn-controller.log:2020-02-19T12:14:11.896Z|00017|timeval|WARN|Unreasonably
>> long 1087ms poll interval (224ms user, 14ms system)
>>
> ovn-netlab1-140/ovn-controller.log:2020-02-19T12:14:12.030Z|00017|timeval|WARN|Unreasonably
>> long 1055ms poll interval (241ms user, 11ms system)
>>
> ovn-netlab1-69/ovn-controller.log:2020-02-19T12:14:11.856Z|00017|timeval|WARN|Unreasonably
>> long 1019ms poll interval (221ms user, 1ms system)
>>
> ovn-netlab1-25/ovn-controller.log:2020-02-19T12:14:11.857Z|00017|timeval|WARN|Unreasonably
>> long 1053ms poll interval (230ms user, 9ms system)
>>
> ovn-netlab1-48/ovn-controller.log:2020-02-19T12:14:11.827Z|00017|timeval|WARN|Unreasonably
>> long 1005ms poll interval (245ms user, 22ms system)
>>
> ovn-netlab1-80/ovn-controller.log:2020-02-19T12:14:11.936Z|00017|timeval|WARN|Unreasonably
>> long 1127ms poll interval (218ms user, 2ms system)
>>
> ovn-netlab1-56/ovn-controller.log:2020-02-19T12:14:01.202Z|00017|timeval|WARN|Unreasonably
>> long 1016ms poll interval (224ms user, 0ms system)
>>
> ovn-netlab1-24/ovn-controller.log:2020-02-19T12:14:22.623Z|00017|timeval|WARN|Unreasonably
>> long 1022ms poll interval (227ms user, 1ms system)
>>
> ovn-netlab1-65/ovn-controller.log:2020-02-19T12:13:19.585Z|00017|timeval|WARN|Unreasonably
>> long 1012ms poll interval (213ms user, 1ms system)
>>
> ovn-netlab1-46/ovn-controller.log:2020-02-19T12:14:11.893Z|00017|timeval|WARN|Unreasonably
>> long 1086ms poll interval (225ms user, 0ms system)
>>
> ovn-netlab1-21/ovn-controller.log:2020-02-19T12:13:19.586Z|00017|timeval|WARN|Unreasonably
>> long 1031ms poll interval (222ms user, 0ms system)
>>
>> With your changes this happens less often:
>>
> ./localhost/ovn-netlab1-63/ovn-controller.log:2020-02-19T12:46:10.204Z|00017|timeval|WARN|Unreasonably
>> long 1038ms poll interval (223ms user, 1ms system)
>>
> ./localhost/ovn-netlab1-67/ovn-controller.log:2020-02-19T12:45:59.677Z|00017|timeval|WARN|Unreasonably
>> long 1033ms poll interval (215ms user, 0ms system)
>>
> ./localhost/ovn-netlab1-96/ovn-controller.log:2020-02-19T12:46:10.261Z|00017|timeval|WARN|Unreasonably
>> long 1009ms poll interval (219ms user, 1ms system)
>>
> ./localhost/ovn-netlab1-43/ovn-controller.log:2020-02-19T12:46:10.194Z|00017|timeval|WARN|Unreasonably
>> long 1044ms poll interval (222ms user, 0ms system)
>>
> ./localhost/ovn-netlab1-58/ovn-controller.log:2020-02-19T12:46:10.253Z|00017|timeval|WARN|Unreasonably
>> long 1091ms poll interval (225ms user, 12ms system)
>>
> ./localhost/ovn-netlab1-95/ovn-controller.log:2020-02-19T12:46:10.246Z|00017|timeval|WARN|Unreasonably
>> long 1031ms poll interval (216ms user, 16ms system)
>>
>>
>> Regards,
>> Dumitru
>>
> Thanks Dumitru for reviewing and testing it out.
> Are you seeing high CPU only after applying this patch? In theory I
> think this patch should not contribute to CPU spike.
> Enabling ovn-monitor-all can result in higher CPU in ovn-controller in
> circumstances when not all datapaths are local. In your test case, is
> the topology ideal for ovn-monitor-all? I.e. does each node cares about
> all datapaths? If the answer is yes, then could you try enabling
> ovn-monitor-all only on half of the nodes, and see if the nodes with
> ovn-monitor-all enabled are with higher CPU than others?
> 

Hi Han,

In my test topology all datapaths are local (i.e., all logical switches
are connected to a single cluster logical router).

The test machine I used initially was oversubscribed so I ran the tests
again on a setup with more physical machines:

1. With OVN master, ovn-monitor-all=false, bringing up 300 nodes (300
logical switches + one VIF per switch):
- SB DB CPU usage is high after a certain number of nodes come up.
Running perf on the setup points to ovsdb_monitor_get_update that takes
up to 70% CPU time (including children). This due to each ovn-controller
subscribing to OVSDB updates for all datapaths individually.
- ovn-controller CPU usage is normal, i.e., no visible CPU spikes.

2. With OVN master, ovn-monitor-all=true, bringing up 300 nodes:
- SB DB CPU usage is low, no visible CPU spikes.
- ovn-controller CPU usage is normal as well.

3. With OVN master + your patches, ovn-monitor-all=true, bringing up 300
nodes:
- SB DB CPU usage is low, no visible CPU spikes.
- ovn-controller CPU usage is normal as well.

In conclusion all seems fine to me and even in the worst case scenario,
when all datapaths are local, ovn-controller cpu usage is not affected
by the extra datapath lookups introduced by your changes.

Thanks,
Dumitru

> In addition, did you see any difference of CPU usage on SB DB?
> 
> Thanks,
> Han
Han Zhou Feb. 20, 2020, 7:50 p.m. UTC | #4
On Thu, Feb 20, 2020 at 1:58 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 2/19/20 6:36 PM, Han Zhou wrote:
> >
> >
> > On Wed, Feb 19, 2020 at 4:50 AM Dumitru Ceara <dceara@redhat.com
> > <mailto:dceara@redhat.com>> wrote:
> >>
> >> On 2/19/20 12:32 AM, Han Zhou wrote:
> >> > This is usefule when external_ids:ovn-monitor-all is set to true.
> >> >
> >> > Signed-off-by: Han Zhou <hzhou@ovn.org <mailto:hzhou@ovn.org>>
> >>
> >> Hi Han,
> >>
> >> Looks good to me.
> >>
> >> Acked-by: Dumitru Ceara <dceara@redhat.com <mailto:dceara@redhat.com>>
> >>
> >> I also tested this (together with your previous patch) on a scaled
setup
> >> with 150 ovn-fake-multinode nodes and ovn-monitor-all enabled.
> >>
> >> With OVN master I see high CPU usage on ovn-controllers from time to
time:
> >>
> >>
> >
ovn-netlab1-64/ovn-controller.log:2020-02-19T12:14:11.896Z|00017|timeval|WARN|Unreasonably
> >> long 1087ms poll interval (224ms user, 14ms system)
> >>
> >
ovn-netlab1-140/ovn-controller.log:2020-02-19T12:14:12.030Z|00017|timeval|WARN|Unreasonably
> >> long 1055ms poll interval (241ms user, 11ms system)
> >>
> >
ovn-netlab1-69/ovn-controller.log:2020-02-19T12:14:11.856Z|00017|timeval|WARN|Unreasonably
> >> long 1019ms poll interval (221ms user, 1ms system)
> >>
> >
ovn-netlab1-25/ovn-controller.log:2020-02-19T12:14:11.857Z|00017|timeval|WARN|Unreasonably
> >> long 1053ms poll interval (230ms user, 9ms system)
> >>
> >
ovn-netlab1-48/ovn-controller.log:2020-02-19T12:14:11.827Z|00017|timeval|WARN|Unreasonably
> >> long 1005ms poll interval (245ms user, 22ms system)
> >>
> >
ovn-netlab1-80/ovn-controller.log:2020-02-19T12:14:11.936Z|00017|timeval|WARN|Unreasonably
> >> long 1127ms poll interval (218ms user, 2ms system)
> >>
> >
ovn-netlab1-56/ovn-controller.log:2020-02-19T12:14:01.202Z|00017|timeval|WARN|Unreasonably
> >> long 1016ms poll interval (224ms user, 0ms system)
> >>
> >
ovn-netlab1-24/ovn-controller.log:2020-02-19T12:14:22.623Z|00017|timeval|WARN|Unreasonably
> >> long 1022ms poll interval (227ms user, 1ms system)
> >>
> >
ovn-netlab1-65/ovn-controller.log:2020-02-19T12:13:19.585Z|00017|timeval|WARN|Unreasonably
> >> long 1012ms poll interval (213ms user, 1ms system)
> >>
> >
ovn-netlab1-46/ovn-controller.log:2020-02-19T12:14:11.893Z|00017|timeval|WARN|Unreasonably
> >> long 1086ms poll interval (225ms user, 0ms system)
> >>
> >
ovn-netlab1-21/ovn-controller.log:2020-02-19T12:13:19.586Z|00017|timeval|WARN|Unreasonably
> >> long 1031ms poll interval (222ms user, 0ms system)
> >>
> >> With your changes this happens less often:
> >>
> >
./localhost/ovn-netlab1-63/ovn-controller.log:2020-02-19T12:46:10.204Z|00017|timeval|WARN|Unreasonably
> >> long 1038ms poll interval (223ms user, 1ms system)
> >>
> >
./localhost/ovn-netlab1-67/ovn-controller.log:2020-02-19T12:45:59.677Z|00017|timeval|WARN|Unreasonably
> >> long 1033ms poll interval (215ms user, 0ms system)
> >>
> >
./localhost/ovn-netlab1-96/ovn-controller.log:2020-02-19T12:46:10.261Z|00017|timeval|WARN|Unreasonably
> >> long 1009ms poll interval (219ms user, 1ms system)
> >>
> >
./localhost/ovn-netlab1-43/ovn-controller.log:2020-02-19T12:46:10.194Z|00017|timeval|WARN|Unreasonably
> >> long 1044ms poll interval (222ms user, 0ms system)
> >>
> >
./localhost/ovn-netlab1-58/ovn-controller.log:2020-02-19T12:46:10.253Z|00017|timeval|WARN|Unreasonably
> >> long 1091ms poll interval (225ms user, 12ms system)
> >>
> >
./localhost/ovn-netlab1-95/ovn-controller.log:2020-02-19T12:46:10.246Z|00017|timeval|WARN|Unreasonably
> >> long 1031ms poll interval (216ms user, 16ms system)
> >>
> >>
> >> Regards,
> >> Dumitru
> >>
> > Thanks Dumitru for reviewing and testing it out.
> > Are you seeing high CPU only after applying this patch? In theory I
> > think this patch should not contribute to CPU spike.
> > Enabling ovn-monitor-all can result in higher CPU in ovn-controller in
> > circumstances when not all datapaths are local. In your test case, is
> > the topology ideal for ovn-monitor-all? I.e. does each node cares about
> > all datapaths? If the answer is yes, then could you try enabling
> > ovn-monitor-all only on half of the nodes, and see if the nodes with
> > ovn-monitor-all enabled are with higher CPU than others?
> >
>
> Hi Han,
>
> In my test topology all datapaths are local (i.e., all logical switches
> are connected to a single cluster logical router).
>
> The test machine I used initially was oversubscribed so I ran the tests
> again on a setup with more physical machines:
>
> 1. With OVN master, ovn-monitor-all=false, bringing up 300 nodes (300
> logical switches + one VIF per switch):
> - SB DB CPU usage is high after a certain number of nodes come up.
> Running perf on the setup points to ovsdb_monitor_get_update that takes
> up to 70% CPU time (including children). This due to each ovn-controller
> subscribing to OVSDB updates for all datapaths individually.
> - ovn-controller CPU usage is normal, i.e., no visible CPU spikes.
>
> 2. With OVN master, ovn-monitor-all=true, bringing up 300 nodes:
> - SB DB CPU usage is low, no visible CPU spikes.
> - ovn-controller CPU usage is normal as well.
>
> 3. With OVN master + your patches, ovn-monitor-all=true, bringing up 300
> nodes:
> - SB DB CPU usage is low, no visible CPU spikes.
> - ovn-controller CPU usage is normal as well.
>
> In conclusion all seems fine to me and even in the worst case scenario,
> when all datapaths are local, ovn-controller cpu usage is not affected
> by the extra datapath lookups introduced by your changes.
>
> Thanks,
> Dumitru
>
> > In addition, did you see any difference of CPU usage on SB DB?
> >
> > Thanks,
> > Han
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Hi Dumitru, thanks for the testing and sharing! I applied the patch to
master.

Patch
diff mbox series

diff --git a/controller/lflow.c b/controller/lflow.c
index 79d8913..3c10a16 100644
--- a/controller/lflow.c
+++ b/controller/lflow.c
@@ -792,12 +792,14 @@  put_load(const uint8_t *data, size_t len,
 
 static void
 consider_neighbor_flow(struct ovsdb_idl_index *sbrec_port_binding_by_name,
+                       const struct hmap *local_datapaths,
                        const struct sbrec_mac_binding *b,
                        struct ovn_desired_flow_table *flow_table)
 {
     const struct sbrec_port_binding *pb
         = lport_lookup_by_name(sbrec_port_binding_by_name, b->logical_port);
-    if (!pb) {
+    if (!pb || !get_local_datapath(local_datapaths,
+                                   pb->datapath->tunnel_key)) {
         return;
     }
 
@@ -869,11 +871,13 @@  consider_neighbor_flow(struct ovsdb_idl_index *sbrec_port_binding_by_name,
 static void
 add_neighbor_flows(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                    const struct sbrec_mac_binding_table *mac_binding_table,
+                   const struct hmap *local_datapaths,
                    struct ovn_desired_flow_table *flow_table)
 {
     const struct sbrec_mac_binding *b;
     SBREC_MAC_BINDING_TABLE_FOR_EACH (b, mac_binding_table) {
-        consider_neighbor_flow(sbrec_port_binding_by_name, b, flow_table);
+        consider_neighbor_flow(sbrec_port_binding_by_name, local_datapaths,
+                               b, flow_table);
     }
 }
 
@@ -882,6 +886,7 @@  void
 lflow_handle_changed_neighbors(
     struct ovsdb_idl_index *sbrec_port_binding_by_name,
     const struct sbrec_mac_binding_table *mac_binding_table,
+    const struct hmap *local_datapaths,
     struct ovn_desired_flow_table *flow_table)
 {
     const struct sbrec_mac_binding *mb;
@@ -904,7 +909,8 @@  lflow_handle_changed_neighbors(
             }
             VLOG_DBG("handle new mac_binding "UUID_FMT,
                      UUID_ARGS(&mb->header_.uuid));
-            consider_neighbor_flow(sbrec_port_binding_by_name, mb, flow_table);
+            consider_neighbor_flow(sbrec_port_binding_by_name, local_datapaths,
+                                   mb, flow_table);
         }
     }
 }
@@ -934,7 +940,8 @@  lflow_run(struct lflow_ctx_in *l_ctx_in, struct lflow_ctx_out *l_ctx_out)
 
     add_logical_flows(l_ctx_in, l_ctx_out);
     add_neighbor_flows(l_ctx_in->sbrec_port_binding_by_name,
-                       l_ctx_in->mac_binding_table, l_ctx_out->flow_table);
+                       l_ctx_in->mac_binding_table, l_ctx_in->local_datapaths,
+                       l_ctx_out->flow_table);
 }
 
 void
diff --git a/controller/lflow.h b/controller/lflow.h
index 8433cc0..f02f709 100644
--- a/controller/lflow.h
+++ b/controller/lflow.h
@@ -150,6 +150,7 @@  bool lflow_handle_changed_ref(enum ref_type, const char *ref_name,
 void lflow_handle_changed_neighbors(
     struct ovsdb_idl_index *sbrec_port_binding_by_name,
     const struct sbrec_mac_binding_table *,
+    const struct hmap *local_datapaths,
     struct ovn_desired_flow_table *);
 
 void lflow_destroy(void);
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 4d245ca..e7773b8 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -1504,11 +1504,15 @@  flow_output_sb_mac_binding_handler(struct engine_node *node, void *data)
         (struct sbrec_mac_binding_table *)EN_OVSDB_GET(
             engine_get_input("SB_mac_binding", node));
 
+    struct ed_type_runtime_data *rt_data =
+        engine_get_input_data("runtime_data", node);
+    const struct hmap *local_datapaths = &rt_data->local_datapaths;
+
     struct ed_type_flow_output *fo = data;
     struct ovn_desired_flow_table *flow_table = &fo->flow_table;
 
     lflow_handle_changed_neighbors(sbrec_port_binding_by_name,
-            mac_binding_table, flow_table);
+            mac_binding_table, local_datapaths, flow_table);
 
     engine_set_node_state(node, EN_UPDATED);
     return true;