Message ID | 1554338953-75998-1-git-send-email-hzhou8@ebay.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,v3,1/2] ovn-controller: Fix busy loop when sb disconnected. | expand |
For this and patch 2: Acked-by: Mark Michelson <mmichels@redhat.com> On 4/3/19 8:49 PM, Han Zhou wrote: > From: Han Zhou <hzhou8@ebay.com> > > In the main loop, if the SB DB is disconnected when there is a pending > transaction, there can be busy loop causing 100% CPU of ovn-controller, > until SB DB is connected again. > > The root cause is that when a transaction is pending, ovsdb_idl_loop_run() > will return NULL for ovnsb_idl_txn, and chassis_run() returns NULL when > ovnsb_idl_txn is NULL, so the condition if (br_int && chassis) is not > satisfied and so ofctrl_run() is not executed in the main loop. If there > is any message pending from br-int.mgmt, such as OFPTYPE_BARRIER_REPLY or > OFPTYPE_ECHO_REQUEST, the main loop will be woken up again and again > because those messages are not processed because ofctrl_run() is not > invoked. > > This patch fixes the problem by moving ofctrl_run() above and run it > whenever br_int is not NULL, and not care about chassis because this > function doesn't depend on it. > > It also moves out sbrec_chassis_set_nb_cfg() from the "if (ovs_idl_txn)" > just to avoid adding more indentation of the whole block to avoid >79 > line length. > > Note: the changes of this patch is better to be shown with "-w" because > most of them are indent changes. > > Acked-by: Numan Siddique <nusiddiq@redhat.com> > Signed-off-by: Han Zhou <hzhou8@ebay.com> > --- > > Notes: > v2->v3: fix >79 line found by 0-day robot > > ovn/controller/ovn-controller.c | 97 ++++++++++++++++++++++------------------- > 1 file changed, 51 insertions(+), 46 deletions(-) > > diff --git a/ovn/controller/ovn-controller.c b/ovn/controller/ovn-controller.c > index 882cc19..be1e635 100644 > --- a/ovn/controller/ovn-controller.c > +++ b/ovn/controller/ovn-controller.c > @@ -721,38 +721,40 @@ main(int argc, char *argv[]) > &active_tunnels, &local_datapaths, > &local_lports, &local_lport_ids); > } > - if (br_int && chassis) { > - struct shash addr_sets = SHASH_INITIALIZER(&addr_sets); > - addr_sets_init(sbrec_address_set_table_get(ovnsb_idl_loop.idl), > - &addr_sets); > - struct shash port_groups = SHASH_INITIALIZER(&port_groups); > - port_groups_init( > - sbrec_port_group_table_get(ovnsb_idl_loop.idl), > - &port_groups); > - > - patch_run(ovs_idl_txn, > - ovsrec_bridge_table_get(ovs_idl_loop.idl), > - ovsrec_open_vswitch_table_get(ovs_idl_loop.idl), > - ovsrec_port_table_get(ovs_idl_loop.idl), > - sbrec_port_binding_table_get(ovnsb_idl_loop.idl), > - br_int, chassis); > > + if (br_int) { > enum mf_field_id mff_ovn_geneve = ofctrl_run( > br_int, &pending_ct_zones); > > - pinctrl_run(ovnsb_idl_txn, sbrec_chassis_by_name, > - sbrec_datapath_binding_by_key, > - sbrec_port_binding_by_datapath, > - sbrec_port_binding_by_key, > - sbrec_port_binding_by_name, > - sbrec_mac_binding_by_lport_ip, > - sbrec_dns_table_get(ovnsb_idl_loop.idl), > - br_int, chassis, > - &local_datapaths, &active_tunnels); > - update_ct_zones(&local_lports, &local_datapaths, &ct_zones, > - ct_zone_bitmap, &pending_ct_zones); > - if (ovs_idl_txn) { > - if (ofctrl_can_put()) { > + if (chassis) { > + struct shash addr_sets = SHASH_INITIALIZER(&addr_sets); > + addr_sets_init( > + sbrec_address_set_table_get(ovnsb_idl_loop.idl), > + &addr_sets); > + struct shash port_groups = SHASH_INITIALIZER(&port_groups); > + port_groups_init( > + sbrec_port_group_table_get(ovnsb_idl_loop.idl), > + &port_groups); > + > + patch_run(ovs_idl_txn, > + ovsrec_bridge_table_get(ovs_idl_loop.idl), > + ovsrec_open_vswitch_table_get(ovs_idl_loop.idl), > + ovsrec_port_table_get(ovs_idl_loop.idl), > + sbrec_port_binding_table_get(ovnsb_idl_loop.idl), > + br_int, chassis); > + > + pinctrl_run(ovnsb_idl_txn, sbrec_chassis_by_name, > + sbrec_datapath_binding_by_key, > + sbrec_port_binding_by_datapath, > + sbrec_port_binding_by_key, > + sbrec_port_binding_by_name, > + sbrec_mac_binding_by_lport_ip, > + sbrec_dns_table_get(ovnsb_idl_loop.idl), > + br_int, chassis, > + &local_datapaths, &active_tunnels); > + update_ct_zones(&local_lports, &local_datapaths, &ct_zones, > + ct_zone_bitmap, &pending_ct_zones); > + if (ovs_idl_txn && ofctrl_can_put()) { > stopwatch_start(CONTROLLER_LOOP_STOPWATCH_NAME, > time_msec()); > > @@ -809,28 +811,31 @@ main(int argc, char *argv[]) > sbrec_chassis_set_nb_cfg(chassis, cur_cfg); > } > } > - } > > - if (pending_pkt.conn) { > - char *error = ofctrl_inject_pkt(br_int, pending_pkt.flow_s, > - &addr_sets, &port_groups); > - if (error) { > - unixctl_command_reply_error(pending_pkt.conn, error); > - free(error); > - } else { > - unixctl_command_reply(pending_pkt.conn, NULL); > + if (pending_pkt.conn) { > + char *error = ofctrl_inject_pkt(br_int, > + pending_pkt.flow_s, > + &addr_sets, > + &port_groups); > + if (error) { > + unixctl_command_reply_error(pending_pkt.conn, > + error); > + free(error); > + } else { > + unixctl_command_reply(pending_pkt.conn, NULL); > + } > + pending_pkt.conn = NULL; > + free(pending_pkt.flow_s); > } > - pending_pkt.conn = NULL; > - free(pending_pkt.flow_s); > - } > > - update_sb_monitors(ovnsb_idl_loop.idl, chassis, > - &local_lports, &local_datapaths); > + update_sb_monitors(ovnsb_idl_loop.idl, chassis, > + &local_lports, &local_datapaths); > > - expr_const_sets_destroy(&addr_sets); > - shash_destroy(&addr_sets); > - expr_const_sets_destroy(&port_groups); > - shash_destroy(&port_groups); > + expr_const_sets_destroy(&addr_sets); > + shash_destroy(&addr_sets); > + expr_const_sets_destroy(&port_groups); > + shash_destroy(&port_groups); > + } > } > > /* If we haven't handled the pending packet insertion >
On Wed, Apr 03, 2019 at 05:49:12PM -0700, Han Zhou wrote: > From: Han Zhou <hzhou8@ebay.com> > > In the main loop, if the SB DB is disconnected when there is a pending > transaction, there can be busy loop causing 100% CPU of ovn-controller, > until SB DB is connected again. > > The root cause is that when a transaction is pending, ovsdb_idl_loop_run() > will return NULL for ovnsb_idl_txn, and chassis_run() returns NULL when > ovnsb_idl_txn is NULL, so the condition if (br_int && chassis) is not > satisfied and so ofctrl_run() is not executed in the main loop. If there > is any message pending from br-int.mgmt, such as OFPTYPE_BARRIER_REPLY or > OFPTYPE_ECHO_REQUEST, the main loop will be woken up again and again > because those messages are not processed because ofctrl_run() is not > invoked. > > This patch fixes the problem by moving ofctrl_run() above and run it > whenever br_int is not NULL, and not care about chassis because this > function doesn't depend on it. > > It also moves out sbrec_chassis_set_nb_cfg() from the "if (ovs_idl_txn)" > just to avoid adding more indentation of the whole block to avoid >79 > line length. > > Note: the changes of this patch is better to be shown with "-w" because > most of them are indent changes. > > Acked-by: Numan Siddique <nusiddiq@redhat.com> > Signed-off-by: Han Zhou <hzhou8@ebay.com> > --- > > Notes: > v2->v3: fix >79 line found by 0-day robot This has a patch reject against current master, so if it's still relevant would you mind rebasing and reposting? Thanks, Ben.
On Tue, Apr 16, 2019 at 10:43 AM Ben Pfaff <blp@ovn.org> wrote: > > On Wed, Apr 03, 2019 at 05:49:12PM -0700, Han Zhou wrote: > > From: Han Zhou <hzhou8@ebay.com> > > > > In the main loop, if the SB DB is disconnected when there is a pending > > transaction, there can be busy loop causing 100% CPU of ovn-controller, > > until SB DB is connected again. > > > > The root cause is that when a transaction is pending, ovsdb_idl_loop_run() > > will return NULL for ovnsb_idl_txn, and chassis_run() returns NULL when > > ovnsb_idl_txn is NULL, so the condition if (br_int && chassis) is not > > satisfied and so ofctrl_run() is not executed in the main loop. If there > > is any message pending from br-int.mgmt, such as OFPTYPE_BARRIER_REPLY or > > OFPTYPE_ECHO_REQUEST, the main loop will be woken up again and again > > because those messages are not processed because ofctrl_run() is not > > invoked. > > > > This patch fixes the problem by moving ofctrl_run() above and run it > > whenever br_int is not NULL, and not care about chassis because this > > function doesn't depend on it. > > > > It also moves out sbrec_chassis_set_nb_cfg() from the "if (ovs_idl_txn)" > > just to avoid adding more indentation of the whole block to avoid >79 > > line length. > > > > Note: the changes of this patch is better to be shown with "-w" because > > most of them are indent changes. > > > > Acked-by: Numan Siddique <nusiddiq@redhat.com> > > Signed-off-by: Han Zhou <hzhou8@ebay.com> > > --- > > > > Notes: > > v2->v3: fix >79 line found by 0-day robot > > This has a patch reject against current master, so if it's still > relevant would you mind rebasing and reposting? > Hi Ben, I just rebased and sent out v4. Thanks, Han
diff --git a/ovn/controller/ovn-controller.c b/ovn/controller/ovn-controller.c index 882cc19..be1e635 100644 --- a/ovn/controller/ovn-controller.c +++ b/ovn/controller/ovn-controller.c @@ -721,38 +721,40 @@ main(int argc, char *argv[]) &active_tunnels, &local_datapaths, &local_lports, &local_lport_ids); } - if (br_int && chassis) { - struct shash addr_sets = SHASH_INITIALIZER(&addr_sets); - addr_sets_init(sbrec_address_set_table_get(ovnsb_idl_loop.idl), - &addr_sets); - struct shash port_groups = SHASH_INITIALIZER(&port_groups); - port_groups_init( - sbrec_port_group_table_get(ovnsb_idl_loop.idl), - &port_groups); - - patch_run(ovs_idl_txn, - ovsrec_bridge_table_get(ovs_idl_loop.idl), - ovsrec_open_vswitch_table_get(ovs_idl_loop.idl), - ovsrec_port_table_get(ovs_idl_loop.idl), - sbrec_port_binding_table_get(ovnsb_idl_loop.idl), - br_int, chassis); + if (br_int) { enum mf_field_id mff_ovn_geneve = ofctrl_run( br_int, &pending_ct_zones); - pinctrl_run(ovnsb_idl_txn, sbrec_chassis_by_name, - sbrec_datapath_binding_by_key, - sbrec_port_binding_by_datapath, - sbrec_port_binding_by_key, - sbrec_port_binding_by_name, - sbrec_mac_binding_by_lport_ip, - sbrec_dns_table_get(ovnsb_idl_loop.idl), - br_int, chassis, - &local_datapaths, &active_tunnels); - update_ct_zones(&local_lports, &local_datapaths, &ct_zones, - ct_zone_bitmap, &pending_ct_zones); - if (ovs_idl_txn) { - if (ofctrl_can_put()) { + if (chassis) { + struct shash addr_sets = SHASH_INITIALIZER(&addr_sets); + addr_sets_init( + sbrec_address_set_table_get(ovnsb_idl_loop.idl), + &addr_sets); + struct shash port_groups = SHASH_INITIALIZER(&port_groups); + port_groups_init( + sbrec_port_group_table_get(ovnsb_idl_loop.idl), + &port_groups); + + patch_run(ovs_idl_txn, + ovsrec_bridge_table_get(ovs_idl_loop.idl), + ovsrec_open_vswitch_table_get(ovs_idl_loop.idl), + ovsrec_port_table_get(ovs_idl_loop.idl), + sbrec_port_binding_table_get(ovnsb_idl_loop.idl), + br_int, chassis); + + pinctrl_run(ovnsb_idl_txn, sbrec_chassis_by_name, + sbrec_datapath_binding_by_key, + sbrec_port_binding_by_datapath, + sbrec_port_binding_by_key, + sbrec_port_binding_by_name, + sbrec_mac_binding_by_lport_ip, + sbrec_dns_table_get(ovnsb_idl_loop.idl), + br_int, chassis, + &local_datapaths, &active_tunnels); + update_ct_zones(&local_lports, &local_datapaths, &ct_zones, + ct_zone_bitmap, &pending_ct_zones); + if (ovs_idl_txn && ofctrl_can_put()) { stopwatch_start(CONTROLLER_LOOP_STOPWATCH_NAME, time_msec()); @@ -809,28 +811,31 @@ main(int argc, char *argv[]) sbrec_chassis_set_nb_cfg(chassis, cur_cfg); } } - } - if (pending_pkt.conn) { - char *error = ofctrl_inject_pkt(br_int, pending_pkt.flow_s, - &addr_sets, &port_groups); - if (error) { - unixctl_command_reply_error(pending_pkt.conn, error); - free(error); - } else { - unixctl_command_reply(pending_pkt.conn, NULL); + if (pending_pkt.conn) { + char *error = ofctrl_inject_pkt(br_int, + pending_pkt.flow_s, + &addr_sets, + &port_groups); + if (error) { + unixctl_command_reply_error(pending_pkt.conn, + error); + free(error); + } else { + unixctl_command_reply(pending_pkt.conn, NULL); + } + pending_pkt.conn = NULL; + free(pending_pkt.flow_s); } - pending_pkt.conn = NULL; - free(pending_pkt.flow_s); - } - update_sb_monitors(ovnsb_idl_loop.idl, chassis, - &local_lports, &local_datapaths); + update_sb_monitors(ovnsb_idl_loop.idl, chassis, + &local_lports, &local_datapaths); - expr_const_sets_destroy(&addr_sets); - shash_destroy(&addr_sets); - expr_const_sets_destroy(&port_groups); - shash_destroy(&port_groups); + expr_const_sets_destroy(&addr_sets); + shash_destroy(&addr_sets); + expr_const_sets_destroy(&port_groups); + shash_destroy(&port_groups); + } } /* If we haven't handled the pending packet insertion