Message ID | 1475037023-8619-2-git-send-email-bschanmu@redhat.com
---|---
State | Superseded
Babu, Thank you for working on this. At a high level, it is not clear to me the boundary between ocf scripts and the ovn-ctl script -- i.e. which aspect is managed by which entity. For example, 1) which scripts are responsible for starting the ovsdb servers. 2) Which script should manage the fail-over -- I tried to shut down a cluster node using the "pcs" command, and fail-over did not happen. Maybe adding a higher-level description of the intended usage would help, at least to someone who is not that familiar with Linux HA, like me.

On Tue, Sep 27, 2016 at 9:30 PM, <bschanmu@redhat.com> wrote: > From: Babu Shanmugam <bschanmu@redhat.com> > > This patch adds support to start_ovsdb() function in ovn-ctl to start the > ovn db servers in backup mode. This can be done in the following ways > 1. Use parameters --ovn-nb-sync-from-addr and --ovn-sb-sync-from-addr to > set the addresses of the active server. > 2. Create files $etcdir/ovnnb-active.conf and $etcdir/ovnsb-active.conf > with the tcp url of the active servers. > > Additional functions to promote a backup server to active and demote > active server to backup mode are also added in this patch > > One can optionally set the DB parameters for northd in > $etcdir/ovn-northd-db-params.conf. For example, > > --ovnnb-db=tcp:172.16.247.230:6641 --ovnsb-db=tcp:172.16.247.230:6642 > > The parameters will be used as is, by start_northd(). If this file exists, > start_northd() will not start the DB servers even if $OVN_MANAGE_OVSDB is > 'yes'. > Would you please expand on the above sentence? Why does it behave this way? > > Signed-off-by: Babu Shanmugam <bschanmu@redhat.com> > --- > ovn/utilities/ovn-ctl | 173 ++++++++++++++++++++++++++++++ > ++++++++++++-------- > 1 file changed, 148 insertions(+), 25 deletions(-) > > diff --git a/ovn/utilities/ovn-ctl b/ovn/utilities/ovn-ctl > index 07bff8a..1c1687f 100755 > --- a/ovn/utilities/ovn-ctl > +++ b/ovn/utilities/ovn-ctl > @@ -26,6 +26,9 @@ for dir in "$sbindir" "$bindir" /sbin /bin /usr/sbin > /usr/bin; do > done > > > +ovnnb_active_conf_file="$etcdir/ovnnb-active.conf" > +ovnsb_active_conf_file="$etcdir/ovnsb-active.conf" > +ovn_northd_db_conf_file="$etcdir/ovn-northd-db-params.conf" > ## ----- ## > ## start ## > ## ----- ## > @@ -45,6 +48,44 @@ stop_ovsdb () { > fi > } > > +demote_ovnnb() { > + if test ! -z "$DB_NB_SYNC_FROM_ADDR"; then > + echo "tcp:$DB_NB_SYNC_FROM_ADDR:$DB_NB_SYNC_FROM_PORT" > > $ovnnb_active_conf_file > + fi > + > + if test -e $ovnnb_active_conf_file; then > + ovs-appctl -t $rundir/ovnnb_db.ctl ovsdb-server/set-active-ovsdb-server > `cat $ovnnb_active_conf_file` > + ovs-appctl -t $rundir/ovnnb_db.ctl ovsdb-server/connect-active- > ovsdb-server > + else > + echo >&2 "$0: active server details not set" > + exit 1 > + fi > +} > + > +demote_ovnsb() { > + if test ! 
-z "$DB_SB_SYNC_FROM_ADDR"; then > + echo "tcp:$DB_SB_SYNC_FROM_ADDR:$DB_SB_SYNC_FROM_PORT" > > $ovnsb_active_conf_file > + fi > + > + if test -e $ovnsb_active_conf_file; then > + ovs-appctl -t $rundir/ovnsb_db.ctl ovsdb-server/set-active-ovsdb-server > `cat $ovnsb_active_conf_file` > + ovs-appctl -t $rundir/ovnsb_db.ctl ovsdb-server/connect-active- > ovsdb-server > + else > + echo >&2 "$0: active server details not set" > + exit 1 > + fi > +} > + > +promote_ovnnb() { > + rm -f $ovnnb_active_conf_file > + ovs-appctl -t $rundir/ovnnb_db.ctl ovsdb-server/disconnect- > active-ovsdb-server > +} > + > +promote_ovnsb() { > + rm -f $ovnsb_active_conf_file > + ovs-appctl -t $rundir/ovnsb_db.ctl ovsdb-server/disconnect- > active-ovsdb-server > +} > + > start_ovsdb () { > # Check and eventually start ovsdb-server for Northbound DB > if ! pidfile_is_running $DB_NB_PID; then > @@ -52,7 +93,20 @@ start_ovsdb () { > > set ovsdb-server > > - set "$@" --detach --monitor $OVN_NB_LOG > --log-file=$OVN_NB_LOGFILE --remote=punix:$DB_NB_SOCK > --remote=ptcp:$DB_NB_PORT:$DB_NB_ADDR --pidfile=$DB_NB_PID > --unixctl=ovnnb_db.ctl > + set "$@" --detach --monitor $OVN_NB_LOG \ > + --log-file=$OVN_NB_LOGFILE \ > + --remote=punix:$DB_NB_SOCK \ > + --remote=ptcp:$DB_NB_PORT:$DB_NB_ADDR \ > + --pidfile=$DB_NB_PID \ > + --unixctl=ovnnb_db.ctl > + > + if test ! -z "$DB_NB_SYNC_FROM_ADDR"; then > + echo "tcp:$DB_NB_SYNC_FROM_ADDR:$DB_NB_SYNC_FROM_PORT" > > $ovnnb_active_conf_file > + fi > + > + if test -e $ovnnb_active_conf_file; then > + set "$@" --sync-from=`cat $ovnnb_active_conf_file` > + fi > > $@ $DB_NB_FILE > fi > @@ -63,11 +117,45 @@ start_ovsdb () { > > set ovsdb-server > > - set "$@" --detach --monitor $OVN_SB_LOG > --log-file=$OVN_SB_LOGFILE --remote=punix:$DB_SB_SOCK > --remote=ptcp:$DB_SB_PORT:$DB_SB_ADDR --pidfile=$DB_SB_PID > --unixctl=ovnsb_db.ctl > + set "$@" --detach --monitor $OVN_SB_LOG \ > + --log-file=$OVN_SB_LOGFILE \ > + --remote=punix:$DB_SB_SOCK \ > + --remote=ptcp:$DB_SB_PORT:$DB_SB_ADDR \ > + --pidfile=$DB_SB_PID \ > + --unixctl=ovnsb_db.ctl > + > + if test ! -z "$DB_SB_SYNC_FROM_ADDR"; then > + echo "tcp:$DB_SB_SYNC_FROM_ADDR:$DB_SB_SYNC_FROM_PORT" > > $ovnsb_active_conf_file > + fi > + > + if test -e $ovnsb_active_conf_file; then > + set "$@" --sync-from=`cat $ovnsb_active_conf_file` > + fi > + > $@ $DB_SB_FILE > fi > } > > +sync_status() { > + ovs-appctl -t $rundir/ovn${1}_db.ctl ovsdb-server/sync-status | awk > '{if(NR==1) print $2}' > +} > + > +status_ovnnb() { > + if ! pidfile_is_running $DB_NB_PID; then > + echo "not-running" > + else > + echo "running/$(sync_status nb)" > + fi > +} > + > +status_ovnsb() { > + if ! pidfile_is_running $DB_SB_PID; then > + echo "not-running" > + else > + echo "running/$(sync_status sb)" > + fi > +} > + > status_ovsdb () { > if ! pidfile_is_running $DB_NB_PID; then > log_success_msg "OVN Northbound DB is not running" > @@ -83,29 +171,36 @@ status_ovsdb () { > } > > start_northd () { > - if test X"$OVN_MANAGE_OVSDB" = Xyes; then > - start_ovsdb > - fi > + if [ ! -e $ovn_northd_db_conf_file ]; then > + if test X"$OVN_MANAGE_OVSDB" = Xyes; then > + start_ovsdb > + fi > + > + if ! pidfile_is_running $DB_NB_PID; then > + log_failure_msg "OVN Northbound DB is not running" > + exit > + fi > + if ! pidfile_is_running $DB_SB_PID; then > + log_failure_msg "OVN Southbound DB is not running" > + exit > + fi > + ovn_northd_params="--ovnnb-db=unix:$DB_NB_SOCK > --ovnsb-db=unix:$DB_SB_SOCK" > + else > + ovn_northd_params="`cat $ovn_northd_db_conf_file`" > + fi > > - if ! 
pidfile_is_running $DB_NB_PID; then > - log_failure_msg "OVN Northbound DB is not running" > - exit > - fi > - if ! pidfile_is_running $DB_SB_PID; then > - log_failure_msg "OVN Southbound DB is not running" > - exit > - fi > + if daemon_is_running ovn-northd; then > + log_success_msg "ovn-northd is already running" > + else > + set ovn-northd > + if test X"$OVN_NORTHD_LOGFILE" != X; then > + set "$@" --log-file=$OVN_NORTHD_LOGFILE > + fi > > - if daemon_is_running ovn-northd; then > - log_success_msg "ovn-northd is already running" > - else > - set ovn-northd > - if test X"$OVN_NORTHD_LOGFILE" != X; then > - set "$@" --log-file=$OVN_NORTHD_LOGFILE > - fi > - set "$@" $OVN_NORTHD_LOG --ovnnb-db=unix:$DB_NB_SOCK > --ovnsb-db=unix:$DB_SB_SOCK > - OVS_RUNDIR=${OVN_RUNDIR} start_daemon "$OVN_NORTHD_PRIORITY" > "$OVN_NORTHD_WRAPPER" "$@" > - fi > + set "$@" $OVN_NORTHD_LOG $ovn_northd_params > + > + OVS_RUNDIR=${OVN_RUNDIR} start_daemon "$OVN_NORTHD_PRIORITY" > "$OVN_NORTHD_WRAPPER" "$@" > + fi > } > > start_controller () { > @@ -127,8 +222,10 @@ start_controller_vtep () { > stop_northd () { > OVS_RUNDIR=${OVN_RUNDIR} stop_daemon ovn-northd > > - if test X"$OVN_MANAGE_OVSDB" = Xyes; then > - stop_ovsdb > + if [ ! -e $ovn_northd_db_conf_file ]; then > + if test X"$OVN_MANAGE_OVSDB" = Xyes; then > + stop_ovsdb > + fi > fi > } > > @@ -176,12 +273,16 @@ set_defaults () { > DB_NB_FILE=$dbdir/ovnnb_db.db > DB_NB_ADDR=0.0.0.0 > DB_NB_PORT=6641 > + DB_NB_SYNC_FROM_ADDR= > + DB_NB_SYNC_FROM_PORT=6641 > > DB_SB_SOCK=$rundir/ovnsb_db.sock > DB_SB_PID=$rundir/ovnsb_db.pid > DB_SB_FILE=$dbdir/ovnsb_db.db > DB_SB_ADDR=0.0.0.0 > DB_SB_PORT=6642 > + DB_SB_SYNC_FROM_ADDR= > + DB_SB_SYNC_FROM_PORT=6642 > > DB_NB_SCHEMA=$datadir/ovn-nb.ovsschema > DB_SB_SCHEMA=$datadir/ovn-sb.ovsschema > @@ -272,6 +373,10 @@ File location options: > --db-sb-port=PORT OVN Southbound db ptcp port (default: $DB_SB_PORT) > --ovn-nb-logfile=FILE OVN Northbound log file (default: $OVN_NB_LOGFILE) > --ovn-sb-logfile=FILE OVN Southbound log file (default: $OVN_SB_LOGFILE) > + --db-nb-sync-from-addr=ADDR OVN Northbound active db tcp address > (default: $DB_NB_SYNC_FROM_ADDR) > + --db-nb-sync-from-port=PORT OVN Northdbound active db tcp port > (default: $DB_NB_SYNC_FROM_PORT) > + --db-sb-sync-from-addr=ADDR OVN Southbound active db tcp address > (default: $DB_SB_SYNC_FROM_ADDR) > + --db-sb-sync-from-port=ADDR OVN Southbound active db tcp port (default: > $DB_SB_SYNC_FROM_PORT) > > Default directories with "configure" option and environment variable > override: > logs: /usr/local/var/log/openvswitch (--with-logdir, OVS_LOGDIR) > @@ -377,6 +482,24 @@ case $command in > status_controller_vtep) > daemon_status ovn-controller-vtep || exit 1 > ;; > + promote_ovnnb) > + promote_ovnnb > + ;; > + promote_ovnsb) > + promote_ovnsb > + ;; > + demote_ovnnb) > + demote_ovnnb > + ;; > + demote_ovnsb) > + demote_ovnsb > + ;; > + status_ovnnb) > + status_ovnnb > + ;; > + status_ovnsb) > + status_ovnsb > + ;; > help) > usage > ;; > The newly added commands are not documented in the help, nor are they documented in the man page. > -- > 1.9.1 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev >
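To make the intended usage concrete before the discussion that follows, here is a minimal sketch of driving the new backup mode, using only the options, commands, and file paths this patch adds (192.0.2.10 is a placeholder for the active server's address; etcdir is assumed to be /etc/openvswitch, as it is elsewhere in this thread):

    # On a backup node: start both DBs replicating from the active server
    # (the sync-from ports default to 6641 and 6642).
    /usr/share/openvswitch/scripts/ovn-ctl \
        --db-nb-sync-from-addr=192.0.2.10 --db-sb-sync-from-addr=192.0.2.10 start_ovsdb

    # Equivalent alternative: record the active server in the conf files the script reads.
    echo "tcp:192.0.2.10:6641" > /etc/openvswitch/ovnnb-active.conf
    echo "tcp:192.0.2.10:6642" > /etc/openvswitch/ovnsb-active.conf

    # The new status commands report "running/active" or "running/backup".
    /usr/share/openvswitch/scripts/ovn-ctl status_ovnnb
    /usr/share/openvswitch/scripts/ovn-ctl status_ovnsb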
On Friday 07 October 2016 05:33 AM, Andy Zhou wrote: > Babu, Thank you for working on this. At a high level, it is not > clear to me the boundary between ocf scripts and the ovn-ctl script -- > i.e. which aspect is managed by which entity. For example, > 1) which scripts are responsible for starting the ovsdb servers.

The ovsdb servers are started by pacemaker. It uses the OCF script, and the OCF script uses ovn-ctl.

> 2) Which script should manage the fail-over -- I tried to shut down a > cluster node using the "pcs" command, and fail-over did not happen.

The OCF script for the OVN DB servers understands the promote and demote calls, so pacemaker uses this script to run the ovsdb servers on all the nodes and to promote one node as the master (active server). If the node on which the master instance is running fails, pacemaker automatically promotes another node as the master. The OCF script acts as the pacemaker agent for the OVN DB resource. The above behavior depends on the way you configure the resource that uses this OCF script. I am attaching a simple set of commands to configure the ovsdb servers. You can create the resources after creating the cluster with the following command:

crm configure < ovndb.pcmk

Please note, you have to replace the macros VM1_NAME, VM2_NAME, VM3_NAME and MASTER_IP with the respective values before using ovndb.pcmk. This script works with a 3-node cluster; I am assuming node ids 101, 102, and 103. Please replace them as well to match your cluster.

--
Babu

node $id="101" VM1_NAME
node $id="102" VM2_NAME
node $id="103" VM3_NAME
primitive ovndb_servers ocf:ovn:ovndb-servers params master_ip="MASTER_IP" op start interval="0s" timeout="30s" op stop interval="0s" timeout="20s" op promote interval="0s" timeout="50s" op demote interval="0s" timeout="50s" op monitor interval="10s" timeout="20s"
primitive ovnip ocf:heartbeat:IPaddr2 params ip="MASTER_IP" cidr_netmask="24" op start interval="0s" timeout="20s" op stop interval="0s" timeout="20s" op monitor interval="10s" timeout="20s"
ms ovndb_servers-master ovndb_servers meta notify="true"
colocation colocation-ovndb_servers-master-ovnip-INFINITY inf: ovndb_servers-master:Started ovnip:Master
order order-ovnip-ovndb_servers-master-mandatory inf: ovnip:start ovndb_servers-master:start
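For clusters where crmsh is not available, a rough pcs translation of the same resources might look like the following. This is an untested sketch: the exact constraint syntax differs between pcs versions, it colocates the master role with the IP (the usual intent), and MASTER_IP is the same placeholder as above.

    pcs resource create ovnip ocf:heartbeat:IPaddr2 ip=MASTER_IP cidr_netmask=24 op monitor interval=10s
    pcs resource create ovndb_servers ocf:ovn:ovndb-servers master_ip=MASTER_IP op monitor interval=10s
    pcs resource master ovndb_servers-master ovndb_servers meta notify=true
    pcs constraint order start ovnip then start ovndb_servers-master
    pcs constraint colocation add master ovndb_servers-master with ovnip INFINITY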
On Sun, Oct 9, 2016 at 12:02 AM, Babu Shanmugam <bschanmu@redhat.com> wrote: > > > On Friday 07 October 2016 05:33 AM, Andy Zhou wrote: > >> Babu, Thank you for working on this. At a high level, it is not clear >> to me the boundary between ocf scripts and the ovn-ctl script -- i.e. which >> aspect is managed by which entity. For example, 1) which scripts are >> responsible for starting the ovsdb servers. >> > ovsdb servers are started by the pacemaker. It uses the OCF script and the > OCF script uses ovn-ctl. > > 2) Which script should manage the fail-over -- I tried to shut down a >> cluster node using the "pcs" command, and fail-over did not happen. >> > The OCF script for OVN DB servers is capable of understanding the promote > and demote calls. So, pacemaker will use this script to run ovsdb server in > all the nodes and promote one node as the master(active server). If the > node in which the master instance is running fails, pacemaker automatically > promotes another node as the master. OCF script is an agent for the > pacemaker for the OVN db resource. > The above behavior depends on the way you are configuring the resource > that uses this OCF script. I am attaching a simple set of commands to > configure the ovsdb server. You can create the resources after creating the > cluster with the following command > > crm configure < ovndb.pcmk > > Please note, you have to replace the macros VM1_NAME, VM2_NAME, VM3_NAME > and MASTER_IP with the respective values before using ovndb.pcmk. This > script works with a 3 node cluster. I am assuming the node ids as 101, 102, > and 103. Please replace them as well to work with your cluster. > > > -- > Babu >

Unfortunately, CRM is not distributed with pacemaker on CentOS anymore. It took me some time to get it installed. I think others may run into similar issues, so it may be worthwhile to document this, or change the script to use "pcs", which is part of the distribution.

I adapted the script to my setup. I have two nodes, "h1" (10.33.74.77) and "h2" (10.33.75.158). For Master_IP, I used 10.33.75.220.

This is the output of crm configure show:

------
[root@h2 azhou]# crm configure show
node 1: h1 \
        attributes
node 2: h2
primitive ClusterIP IPaddr2 \
        params ip=10.33.75.200 cidr_netmask=32 \
        op start interval=0s timeout=20s \
        op stop interval=0s timeout=20s \
        op monitor interval=30s
primitive WebSite apache \
        params configfile="/etc/httpd/conf/httpd.conf" statusurl="http://127.0.0.1/server-status" \
        op start interval=0s timeout=40s \
        op stop interval=0s timeout=60s \
        op monitor interval=1min \
        meta
primitive ovndb ocf:ovn:ovndb-servers \
        op start interval=0s timeout=30s \
        op stop interval=0s timeout=20s \
        op promote interval=0s timeout=50s \
        op demote interval=0s timeout=50s \
        op monitor interval=1min \
        meta
colocation colocation-WebSite-ClusterIP-INFINITY inf: WebSite ClusterIP
order order-ClusterIP-WebSite-mandatory ClusterIP:start WebSite:start
property cib-bootstrap-options: \
        have-watchdog=false \
        dc-version=1.1.13-10.el7_2.4-44eb2dd \
        cluster-infrastructure=corosync \
        cluster-name=mycluster \
        stonith-enabled=false
--------
I have also added firewall rules to allow access to TCP port 6642 and port 6641.
At this stage, crm_mon shows:

Last updated: Wed Oct 12 14:49:07 2016    Last change: Wed Oct 12 13:58:55 2016 by root via crm_attribute on h2
Stack: corosync
Current DC: h2 (version 1.1.13-10.el7_2.4-44eb2dd) - partition with quorum
2 nodes and 3 resources configured

Online: [ h1 h2 ]

ClusterIP  (ocf::heartbeat:IPaddr2):       Started h2
WebSite    (ocf::heartbeat:apache):        Started h2
ovndb      (ocf::ovn:ovndb-servers):       Started h1

Failed Actions:
* ovndb_start_0 on h2 'unknown error' (1): call=39, status=Timed Out, exitreason='none',
    last-rc-change='Wed Oct 12 14:43:03 2016', queued=0ms, exec=30003ms

---

Not sure what the error message on h2 is about. Notice that the ovndb service is now running on h1, while the cluster IP is on h2.

Also, both servers are running as backup servers:

[root@h1 azhou]# ovs-appctl -t /run/openvswitch/ovnsb_db.ctl ovsdb-server/sync-status
state: backup
connecting: tcp:192.0.2.254:6642    // I specified the IP at /etc/openvswitch/ovnsb-active.conf, but the file was overwritten with 192.0.2.254

[root@h2 ovs]# ovs-appctl -t /run/openvswitch/ovnsb_db.ctl ovsdb-server/sync-status
state: backup
replicating: tcp:10.33.74.77:6642   // The IP address was retained on h2
database: OVN_Southbound

---

Any suggestions on what I did wrong?
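For reference, the active/backup state seen above can also be inspected and flipped by hand with the commands this patch adds, independently of pacemaker; a short sketch using the same script path as elsewhere in this thread (192.0.2.10 is a placeholder for the address of whichever node should be active):

    # On the node that should become active:
    /usr/share/openvswitch/scripts/ovn-ctl promote_ovnnb
    /usr/share/openvswitch/scripts/ovn-ctl promote_ovnsb

    # On a node that should stay a backup, point it at the active server and demote:
    /usr/share/openvswitch/scripts/ovn-ctl --db-nb-sync-from-addr=192.0.2.10 demote_ovnnb
    /usr/share/openvswitch/scripts/ovn-ctl --db-sb-sync-from-addr=192.0.2.10 demote_ovnsb

    # Either way, each ovsdb-server reports its own view of the replication state:
    ovs-appctl -t /run/openvswitch/ovnsb_db.ctl ovsdb-server/sync-status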
On Thursday 13 October 2016 07:26 AM, Andy Zhou wrote: > > > On Sun, Oct 9, 2016 at 12:02 AM, Babu Shanmugam <bschanmu@redhat.com > <mailto:bschanmu@redhat.com>> wrote: > > > > On Friday 07 October 2016 05:33 AM, Andy Zhou wrote: > > Babu, Thank you for working on this. At a high level, it is > not clear to me the boundary between ocf scripts and the > ovn-ctl script -- i.e. which aspect is managed by which > entity. For example, 1) which scripts are responsible for > starting the ovsdb servers. > > ovsdb servers are started by the pacemaker. It uses the OCF script > and the OCF script uses ovn-ctl. > > 2) Which script should manage the fail-over -- I tried to shut > down a cluster node using the "pcs" command, and fail-over did > not happen. > > The OCF script for OVN DB servers is capable of understanding the > promote and demote calls. So, pacemaker will use this script to > run ovsdb server in all the nodes and promote one node as the > master(active server). If the node in which the master instance is > running fails, pacemaker automatically promotes another node as > the master. OCF script is an agent for the pacemaker for the OVN > db resource. > The above behavior depends on the way you are configuring the > resource that uses this OCF script. I am attaching a simple set of > commands to configure the ovsdb server. You can create the > resources after creating the cluster with the following command > > crm configure < ovndb.pcmk > > Please note, you have to replace the macros VM1_NAME, VM2_NAME, > VM3_NAME and MASTER_IP with the respective values before using > ovndb.pcmk. This script works with a 3 node cluster. I am assuming > the node ids as 101, 102, and 103. Please replace them as well to > work with your cluster. > > > -- > Babu > > > Unfortunately, CRM is not distributed with pacemaker on centos > anymore. It took me some time to get it installed. I think other may > ran into similar issues, so > it may be worth while do document this, or change the script to use > "pcs" which is part of the distribution. > I agree. Is INSTALL*.md good enough? In openstack, we are managing the resource through puppet manifests. > > I adapted the script with my setup. I have two nodes, > "h1"(10.33.74.77) and "h2"(10.33.75.158), For Master_IP, I used > 10.33.75.220. > > This is the output of crm configure show: > > ------ > > [root@h2 azhou]# crm configure show > > node1: h1 \ > > attributes > > node2: h2 > > primitiveClusterIP IPaddr2 \ > > paramsip=10.33.75.200cidr_netmask=32\ > > opstart interval=0stimeout=20s\ > > opstop interval=0stimeout=20s\ > > opmonitor interval=30s > > primitiveWebSite apache \ > > paramsconfigfile="/etc/httpd/conf/httpd.conf"statusurl="http://127.0.0.1/server-status"\ > > opstart interval=0stimeout=40s\ > > opstop interval=0stimeout=60s\ > > opmonitor interval=1min\ > > meta > > primitiveovndb ocf:ovn:ovndb-servers \ > > opstart interval=0stimeout=30s\ > > opstop interval=0stimeout=20s\ > > oppromote interval=0stimeout=50s\ > > opdemote interval=0stimeout=50s\ > > opmonitor interval=1min\ > > meta > > colocationcolocation-WebSite-ClusterIP-INFINITY inf: WebSiteClusterIP > > orderorder-ClusterIP-WebSite-mandatory ClusterIP:start WebSite:start > > propertycib-bootstrap-options: \ > > have-watchdog=false\ > > dc-version=1.1.13-10.el7_2.4-44eb2dd\ > > cluster-infrastructure=corosync\ > > cluster-name=mycluster\ > > stonith-enabled=false > > You seem to have configured ovndb just as a primitive resource and not as a master slave resource. 
And there is no colocation constraint configured for the ovndb with ClusterIP. Only with the colocation constraint will the ovndb server be co-located with the ClusterIP resource. You will have to include the following lines for crm configure (you can configure the same with pcs as well):

ms ovndb-master ovndb meta notify="true"
colocation colocation-ovndb-master-ClusterIP-INFINITY inf: ovndb-master:Started ClusterIP:Master
order order-ClusterIP-ovndb-master-mandatory inf: ClusterIP:start ovndb-master:start

> -------- > > I have also added firewall rules to allow access to TCP port 6642 and > port 6641. > > > At this stage, crm_mon shows: > > Last updated: Wed Oct 12 14:49:07 2016 Last change: Wed Oct > 12 13:58:55 > > 2016 by root via crm_attributeon h2 > > Stack: corosync > > Current DC: h2 (version 1.1.13-10.el7_2.4-44eb2dd) - partition with quorum > > 2 nodes and 3 resources configured > > > Online: [ h1 h2 ] > > > ClusterIP(ocf::heartbeat:IPaddr2):Started h2 > > WebSite (ocf::heartbeat:apache): Started h2 > > ovndb (ocf::ovn:ovndb-servers):Started h1 > > > Failed Actions: > > * ovndb_start_0 on h2 'unknown error' (1): call=39, status=Timed Out, > exitreason > > ='none', > > last-rc-change='Wed Oct 12 14:43:03 2016', queued=0ms, exec=30003ms > > > --- > > Not sure what the error message on h2 is about, Notice ovndb service > is now running on h1, while the cluster IP is on h2. >

It looks like the OCF script is not able to start the ovsdb servers on node 'h2' (we are getting a timed-out status). You can check whether the OCF script is working correctly by using ocf-tester. You can run ocf-tester using:

ocf-tester -n test-ovndb -o master_ip 10.0.0.1 <path-to-the-ocf-script>

Alternatively, you can check whether the ovsdb servers are started properly by running:

/usr/share/openvswitch/scripts/ovn-ctl --db-sb-sync-from=10.0.0.1 --db-nb-sync-from=10.0.0.1 start_ovsdb

> Also, both server are running as a backup server: > > [root@h1 azhou]# ovs-appctl -t /run/openvswitch/ovnsb_db.ctl > ovsdb-server/sync-status > > state: backup > > connecting: tcp:192.0.2.254:6642 <http://192.0.2.254:6642> // I > specified the IP at /etc/openvswitch/ovnsb-active.conf, But the file > was over-written with 192.0.2.254 > > > [root@h2 ovs]# ovs-appctl -t /run/openvswitch/ovnsb_db.ctl > ovsdb-server/sync-status > > state: backup > > replicating: tcp:10.33.74.77:6642 <http://10.33.74.77:6642> // The > IP address was retained on h2 > > database: OVN_Southbound > > --- > > Any suggestions on what I did wrong? > >

I think this is mostly due to the crm configuration. Once you add the 'ms' resource and the 'colocation' constraint, you should be able to overcome this problem. I have never tried colocating two resources with the ClusterIP resource. Just for testing, is it possible to drop the WebServer resource?

Thank you,
Babu
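A side note for anyone repeating this check: the option names in the patch's usage text are --db-nb-sync-from-addr and --db-sb-sync-from-addr (with separate *-port options defaulting to 6641/6642), so the manual test would look roughly like:

    /usr/share/openvswitch/scripts/ovn-ctl \
        --db-nb-sync-from-addr=10.0.0.1 --db-sb-sync-from-addr=10.0.0.1 start_ovsdb
    /usr/share/openvswitch/scripts/ovn-ctl status_ovnnb    # expected: running/backup
    /usr/share/openvswitch/scripts/ovn-ctl status_ovnsb    # expected: running/backup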
On Wed, Oct 12, 2016 at 10:57 PM, Babu Shanmugam <bschanmu@redhat.com> wrote: > > > On Thursday 13 October 2016 07:26 AM, Andy Zhou wrote: > > > > On Sun, Oct 9, 2016 at 12:02 AM, Babu Shanmugam <bschanmu@redhat.com> > wrote: > >> >> >> On Friday 07 October 2016 05:33 AM, Andy Zhou wrote: >> >>> Babu, Thank you for working on this. At a high level, it is not clear >>> to me the boundary between ocf scripts and the ovn-ctl script -- i.e. which >>> aspect is managed by which entity. For example, 1) which scripts are >>> responsible for starting the ovsdb servers. >>> >> ovsdb servers are started by the pacemaker. It uses the OCF script and >> the OCF script uses ovn-ctl. >> >> 2) Which script should manage the fail-over -- I tried to shut down a >>> cluster node using the "pcs" command, and fail-over did not happen. >>> >> The OCF script for OVN DB servers is capable of understanding the promote >> and demote calls. So, pacemaker will use this script to run ovsdb server in >> all the nodes and promote one node as the master(active server). If the >> node in which the master instance is running fails, pacemaker automatically >> promotes another node as the master. OCF script is an agent for the >> pacemaker for the OVN db resource. >> The above behavior depends on the way you are configuring the resource >> that uses this OCF script. I am attaching a simple set of commands to >> configure the ovsdb server. You can create the resources after creating the >> cluster with the following command >> >> crm configure < ovndb.pcmk >> >> Please note, you have to replace the macros VM1_NAME, VM2_NAME, VM3_NAME >> and MASTER_IP with the respective values before using ovndb.pcmk. This >> script works with a 3 node cluster. I am assuming the node ids as 101, 102, >> and 103. Please replace them as well to work with your cluster. >> >> >> -- >> Babu >> > > Unfortunately, CRM is not distributed with pacemaker on centos anymore. > It took me some time to get it installed. I think other may ran into > similar issues, so > it may be worth while do document this, or change the script to use "pcs" > which is part of the distribution. > > > I agree. Is INSTALL*.md good enough? In openstack, we are managing the > resource through puppet manifests. > O.K. > > > > I adapted the script with my setup. I have two nodes, "h1"(10.33.74.77) > and "h2"(10.33.75.158), For Master_IP, I used 10.33.75.220. 
> > This is the output of crm configure show: > > ------ > > [root@h2 azhou]# crm configure show > > node 1: h1 \ > > attributes > > node 2: h2 > > primitive ClusterIP IPaddr2 \ > > params ip=10.33.75.200 cidr_netmask=32 \ > > op start interval=0s timeout=20s \ > > op stop interval=0s timeout=20s \ > > op monitor interval=30s > > primitive WebSite apache \ > > params configfile="/etc/httpd/conf/httpd.conf" statusurl=" > http://127.0.0.1/server-status" \ > > op start interval=0s timeout=40s \ > > op stop interval=0s timeout=60s \ > > op monitor interval=1min \ > > meta > > primitive ovndb ocf:ovn:ovndb-servers \ > > op start interval=0s timeout=30s \ > > op stop interval=0s timeout=20s \ > > op promote interval=0s timeout=50s \ > > op demote interval=0s timeout=50s \ > > op monitor interval=1min \ > > meta > > colocation colocation-WebSite-ClusterIP-INFINITY inf: WebSite ClusterIP > > order order-ClusterIP-WebSite-mandatory ClusterIP:start WebSite:start > > property cib-bootstrap-options: \ > > have-watchdog=false \ > > dc-version=1.1.13-10.el7_2.4-44eb2dd \ > > cluster-infrastructure=corosync \ > > cluster-name=mycluster \ > > stonith-enabled=false > > > > You seem to have configured ovndb just as a primitive resource and not as > a master slave resource. And there is no colocation resource configured for > the ovndb with ClusterIP. Only with the colocation resource, ovndb server > will be co-located with the ClusterIP resource. You will have to include > the following lines for crm configure. You can configure the same with pcs > as well. > > ms ovndb-master ovndb meta notify="true" > colocation colocation-ovndb-master-ClusterIP-INFINITY inf: > ovndb-master:Started ClusterIP:Master > order order-ClusterIP-ovndb-master-mandatory inf: ClusterIP:start > ovndb-master:start > > Done. Now it shows the following. [root@h2 ovs]# crm configure show > > node 1: h1 \ > > attributes > > node 2: h2 > > primitive ClusterIP IPaddr2 \ > > params ip=10.33.75.200 cidr_netmask=32 \ > > op start interval=0s timeout=20s \ > > op stop interval=0s timeout=20s \ > > op monitor interval=30s > > primitive ovndb ocf:ovn:ovndb-servers \ > > op start interval=0s timeout=30s \ > > op stop interval=0s timeout=20s \ > > op promote interval=0s timeout=50s \ > > op demote interval=0s timeout=50s \ > > op monitor interval=1min \ > > meta > > ms ovndb-master ovndb \ > > meta notify=true > > colocation colocation-ovndb-master-ClusterIP-INFINITY inf: ovndb-master:Started > ClusterIP:Master > > order order-ClusterIP-ovndb-master-mandatory inf: ClusterIP:start > ovndb-master:start > > property cib-bootstrap-options: \ > > have-watchdog=false \ > > dc-version=1.1.13-10.el7_2.4-44eb2dd \ > > cluster-infrastructure=corosync \ > > cluster-name=mycluster \ > > stonith-enabled=false > > property ovn_ovsdb_master_server: \ > > OVN_REPL_INFO=h1 > > > -------- > > I have also added firewall rules to allow access to TCP port 6642 and port > 6641. 
> > > At this stage, crm_mon shows: > > Last updated: Wed Oct 12 14:49:07 2016 Last change: Wed Oct 12 > 13:58:55 > > 2016 by root via crm_attribute on h2 > > Stack: corosync > > Current DC: h2 (version 1.1.13-10.el7_2.4-44eb2dd) - partition with quorum > > 2 nodes and 3 resources configured > > > Online: [ h1 h2 ] > > > ClusterIP (ocf::heartbeat:IPaddr2): Started h2 > > WebSite (ocf::heartbeat:apache): Started h2 > > ovndb (ocf::ovn:ovndb-servers): Started h1 > > > Failed Actions: > > * ovndb_start_0 on h2 'unknown error' (1): call=39, status=Timed Out, > exitreason > > ='none', > > last-rc-change='Wed Oct 12 14:43:03 2016', queued=0ms, exec=30003ms > > > --- > > Not sure what the error message on h2 is about, Notice ovndb service is > now running on h1, while the cluster IP is on h2. > > > Looks like, the OCF script is not able to start the ovsdb servers in 'h2' > node (we are getting a timed-out status). You can check if the OCF script > is working good by using ocf-tester. You can run the ocf-tester using > > ocf-tester -n test-ovndb -o master_ip 10.0.0.1 <path-to-the-ocf-script> > My installation does not have ocf-tester, There is a program called ocft with a test option. Not sure if this is a suitable replacement. If not, how could I get the ocf-tester program? I ran the ocft program and get the following output. Not sure what it means. [root@h2 ovs]# ocft test -n test-ovndb -o master_ip 10.0.0.1 /usr/share/openvswitch/scripts/ovndb-servers.ocf ERROR: cases directory not found. Alternately, you can check if the ovsdb servers are started properly by > running > > /usr/share/openvswitch/scripts/ovn-ctl --db-sb-sync-from=10.0.0.1 > --db-nb-sync-from=10.0.0.1 start_ovsdb > > > The output are as follows. Should we use --db-sb-sync-from-addr instead? [root@h2 ovs]# /usr/share/openvswitch/scripts/ovn-ctl --db-sb-sync-from=10.0.0.1 --db-nb-sync-from=10.0.0.1 start_ovsdb /usr/share/openvswitch/scripts/ovn-ctl: unknown option "--db-sb-sync-from=10.0.0.1" (use --help for help) /usr/share/openvswitch/scripts/ovn-ctl: unknown option "--db-nb-sync-from=10.0.0.1" (use --help for help) 'ovn-ctl' runs without any error message after I fixed the command line parameter. > Also, both server are running as a backup server: > > [root@h1 azhou]# ovs-appctl -t /run/openvswitch/ovnsb_db.ctl > ovsdb-server/sync-status > > state: backup > > connecting: tcp:192.0.2.254:6642 // I specified the IP at > /etc/openvswitch/ovnsb-active.conf, But the file was over-written with > 192.0.2.254 > > > [root@h2 ovs]# ovs-appctl -t /run/openvswitch/ovnsb_db.ctl > ovsdb-server/sync-status > > state: backup > > replicating: tcp:10.33.74.77:6642 // The IP address was retained on h2 > > database: OVN_Southbound > > --- > > Any suggestions on what I did wrong? > > > > I think this is mostly due to the crm configuration. Once you add the 'ms' > and 'colocation' resources, you should be able to overcome this problem. > > No, ovndb still failed to launch on h2. 
[root@h2 ovs]# crm status
Last updated: Thu Oct 13 11:27:42 2016    Last change: Thu Oct 13 11:17:25 2016 by root via cibadmin on h2
Stack: corosync
Current DC: h2 (version 1.1.13-10.el7_2.4-44eb2dd) - partition with quorum
2 nodes and 3 resources configured

Online: [ h1 h2 ]

Full list of resources:

ClusterIP  (ocf::heartbeat:IPaddr2):       Started h1
Master/Slave Set: ovndb-master [ovndb]
    Masters: [ h1 ]
    Stopped: [ h2 ]

Failed Actions:
* ovndb_start_0 on h2 'unknown error' (1): call=39, status=Timed Out, exitreason='none',
    last-rc-change='Wed Oct 12 14:43:03 2016', queued=0ms, exec=30003

> I have never tried colocating two resources with the ClusterIP resource. > Just for testing, is it possible to drop the WebServer resource?

Done. It did not make any difference that I can see.
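When a start action times out like this, it can help to run the resource agent by hand outside pacemaker; a rough sketch, assuming the pcs and systemd tooling already in use in this thread (and that the pcs version supports debug-start):

    # Run the resource agent directly and show its full output:
    pcs resource debug-start ovndb --full

    # Check what pacemaker itself logged around the failed start:
    journalctl -u pacemaker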
On Friday 14 October 2016 04:00 AM, Andy Zhou wrote: > > > Done. Now it shows the following. > > [root@h2 ovs]# crm configure show > > node1: h1 \ > > attributes > > node2: h2 > > primitiveClusterIP IPaddr2 \ > > paramsip=10.33.75.200cidr_netmask=32\ > > opstart interval=0stimeout=20s\ > > opstop interval=0stimeout=20s\ > > opmonitor interval=30s > > primitiveovndb ocf:ovn:ovndb-servers \ > > opstart interval=0stimeout=30s\ > > opstop interval=0stimeout=20s\ > > oppromote interval=0stimeout=50s\ > > opdemote interval=0stimeout=50s\ > > opmonitor interval=1min\ > > meta > > msovndb-master ovndb\ > > metanotify=true > > colocationcolocation-ovndb-master-ClusterIP-INFINITY inf: > ovndb-master:Started ClusterIP:Master > > orderorder-ClusterIP-ovndb-master-mandatory inf: ClusterIP:start > ovndb-master:start > > propertycib-bootstrap-options: \ > > have-watchdog=false\ > > dc-version=1.1.13-10.el7_2.4-44eb2dd\ > > cluster-infrastructure=corosync\ > > cluster-name=mycluster\ > > stonith-enabled=false > > propertyovn_ovsdb_master_server: \ > > > > > My installation does not have ocf-tester, There is a program called > ocft with a test option. Not sure if this is a suitable replacement. > If not, how could I get > the ocf-tester program? I ran the ocft program and get the following > output. Not sure what it means. > > [root@h2 ovs]# ocft test -n test-ovndb -o master_ip 10.0.0.1 > /usr/share/openvswitch/scripts/ovndb-servers.ocf > > ERROR: cases directory not found. > > I have attached ocf-tester with this mail. I guess it's a standalone script. If it does not work, I think it's better not to attempt anymore as we have another way to find out. > > > Alternately, you can check if the ovsdb servers are started > properly by running > > /usr/share/openvswitch/scripts/ovn-ctl --db-sb-sync-from=10.0.0.1 > --db-nb-sync-from=10.0.0.1 start_ovsdb > > > The output are as follows. Should we use --db-sb-sync-from-addr instead? > [root@h2 ovs]# /usr/share/openvswitch/scripts/ovn-ctl > --db-sb-sync-from=10.0.0.1 --db-nb-sync-from=10.0.0.1 start_ovsdb > > /usr/share/openvswitch/scripts/ovn-ctl: unknown option > "--db-sb-sync-from=10.0.0.1" (use --help for help) > > /usr/share/openvswitch/scripts/ovn-ctl: unknown option > "--db-nb-sync-from=10.0.0.1" (use --help for help) > > 'ovn-ctl' runs without any error message after I fixed the command > line parameter. I am sorry for the misinformation, Andy. What you ran is correct. Could you check the status of the ovsdb servers in h2 after you run the above command using the following commands. ovn-ctl status_ovnnb ovn-ctl status_ovnsb Both the above commands should return "running/backup". If you see in the OCF script in the function ovsdb_server_start(), we wait indefinitely till the DB servers are started. Since the 'start' action on h2 times out, I doubt that the servers are not started properly. > > I think this is mostly due to the crm configuration. Once you add > the 'ms' and 'colocation' resources, you should be able to > overcome this problem. > > No, ovndb still failed to launch on h2. 
> > [root@h2 ovs]# crm status > > Last updated: Thu Oct 13 11:27:42 2016Last change: Thu Oct 13 11:17:25 > 2016 by root via cibadmin on h2 > > Stack: corosync > > Current DC: h2 (version 1.1.13-10.el7_2.4-44eb2dd) - partition with quorum > > 2 nodes and 3 resources configured > > > *Online*: [ h1 h2 ] > > > Full list of resources: > > > *ClusterIP*(ocf::heartbeat:IPaddr2):*Started*h1 > > *Master*/*Slave*Set: ovndb-*master*[ovndb] > > *Master*s: [ h1 ] > > *Stopped*: [ h2 ] > > > Failed Actions: > > * ovndb_start_0 on h2 '*unknown error*' (1): call=39, status=*Timed > Out*, exitreason='none', > > last-rc-change='Wed Oct 12 14:43:03 2016', queued=0ms, exec=30003 > > I have never tried colocating two resources with the ClusterIP > resource. Just for testing, is it possible to drop the WebServer > resource? > > Done. It did not make any difference that I can see. > #!/bin/sh # # $Id: ocf-tester,v 1.2 2006/08/14 09:38:20 andrew Exp $ # # Copyright (c) 2006 Novell Inc, Andrew Beekhof # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
# LRMD=/usr/lib/heartbeat/lrmd LRMADMIN=/usr/sbin/lrmadmin DATADIR=/usr/share METADATA_LINT="xmllint --noout --valid -" # set some common meta attributes, which are expected to be # present by resource agents export OCF_RESKEY_CRM_meta_timeout=20000 # 20 seconds timeout export OCF_RESKEY_CRM_meta_interval=10000 # reset this for probes num_errors=0 info() { [ "$quiet" -eq 1 ] && return echo "$*" } debug() { [ "$verbose" -eq 0 ] && return echo "$*" } usage() { # make sure to output errors on stderr [ "x$1" = "x0" ] || exec >&2 echo "Tool for testing if a cluster resource is OCF compliant" echo "" echo "Usage: ocf-tester [-Lh] -n resource_name [-o name=value]* /full/path/to/resource/agent" echo "" echo "Options:" echo " -h This text" echo " -v Be verbose while testing" echo " -q Be quiet while testing" echo " -d Turn on RA debugging" echo " -n name Name of the resource" echo " -o name=value Name and value of any parameters required by the agent" echo " -L Use lrmadmin/lrmd for tests" exit $1 } assert() { rc=$1; shift target=$1; shift msg=$1; shift local targetrc matched if [ $# = 0 ]; then exit_code=0 else exit_code=$1; shift fi for targetrc in `echo $target | tr ':' ' '`; do [ $rc -eq $targetrc ] && matched=1 done if [ "$matched" != 1 ]; then num_errors=`expr $num_errors + 1` echo "* rc=$rc: $msg" if [ $exit_code != 0 ]; then [ -n "$command_output" ] && cat<<EOF $command_output EOF echo "Aborting tests" exit $exit_code fi fi command_output="" } done=0 ra_args="" verbose=0 quiet=0 while test "$done" = "0"; do case "$1" in -n) OCF_RESOURCE_INSTANCE=$2; ra_args="$ra_args OCF_RESOURCE_INSTANCE=$2"; shift; shift;; -o) name=${2%%=*}; value=${2#*=}; lrm_ra_args="$lrm_ra_args $2"; ra_args="$ra_args OCF_RESKEY_$name='$value'"; shift; shift;; -L) use_lrmd=1; shift;; -v) verbose=1; shift;; -d) export HA_debug=1; shift;; -q) quiet=1; shift;; -?|--help) usage 0;; --version) echo "UNKNOWN"; exit 0;; -*) echo "unknown option: $1" >&2; usage 1;; *) done=1;; esac done if [ "x" = "x$OCF_ROOT" ]; then if [ -d /usr/lib/ocf ]; then export OCF_ROOT=/usr/lib/ocf else echo "You must supply the location of OCF_ROOT (common location is /usr/lib/ocf)" >&2 usage 1 fi fi if [ "x" = "x$OCF_RESOURCE_INSTANCE" ]; then echo "You must give your resource a name, set OCF_RESOURCE_INSTANCE" >&2 usage 1 fi agent=$1 if [ ! -e $agent ]; then echo "You must provide the full path to your resource agent" >&2 usage 1 fi installed_rc=5 stopped_rc=7 has_demote=1 has_promote=1 start_lrmd() { lrmd_timeout=0 lrmd_interval=0 lrmd_target_rc=EVERYTIME lrmd_started="" $LRMD -s 2>/dev/null rc=$? 
if [ $rc -eq 3 ]; then lrmd_started=1 $LRMD & sleep 1 $LRMD -s 2>/dev/null else return $rc fi } add_resource() { $LRMADMIN -A $OCF_RESOURCE_INSTANCE \ ocf \ `basename $agent` \ $(basename `dirname $agent`) \ $lrm_ra_args > /dev/null } del_resource() { $LRMADMIN -D $OCF_RESOURCE_INSTANCE } parse_lrmadmin_output() { awk ' BEGIN{ rc=1; } /Waiting for lrmd to callback.../ { n=1; next; } n==1 && /----------------operation--------------/ { n++; next; } n==2 && /return code:/ { rc=$0; sub("return code: *","",rc); next } n==2 && /---------------------------------------/ { n++; next; } END{ if( n!=3 ) exit 1; else exit rc; } ' } exec_resource() { op="$1" args="$2" $LRMADMIN -E $OCF_RESOURCE_INSTANCE \ $op $lrmd_timeout $lrmd_interval \ $lrmd_target_rc \ $args | parse_lrmadmin_output } if [ "$use_lrmd" = 1 ]; then echo "Using lrmd/lrmadmin for all tests" start_lrmd || { echo "could not start lrmd" >&2 exit 1 } trap ' [ "$lrmd_started" = 1 ] && $LRMD -k ' EXIT add_resource || { echo "failed to add resource to lrmd" >&2 exit 1 } fi lrm_test_command() { action="$1" msg="$2" debug "$msg" exec_resource $action "$lrm_ra_args" } test_permissions() { action=meta-data debug ${1:-"Testing permissions with uid nobody"} su nobody -s /bin/sh $agent $action > /dev/null } test_metadata() { action=meta-data msg=${1:-"Testing: $action"} debug $msg bash $agent $action | (cd $DATADIR/resource-agents && $METADATA_LINT) rc=$? #echo rc: $rc return $rc } test_command() { action=$1; shift export __OCF_ACTION=$action msg=${1:-"Testing: $action"} if [ "$use_lrmd" = 1 ]; then lrm_test_command $action "$msg" return $? fi #echo Running: "export $ra_args; bash $agent $action 2>&1 > /dev/null" if [ $verbose -eq 0 ]; then command_output=`bash $agent $action 2>&1` else debug $msg bash $agent $action fi rc=$? #echo rc: $rc return $rc } # Begin tests info "Beginning tests for $agent..." if [ ! -f $agent ]; then assert 7 0 "Could not find file: $agent" fi if [ `id -u` = 0 ]; then test_permissions assert $? 0 "Your agent has too restrictive permissions: should be 755" else echo "WARN: Can't check agent's permissions because we're not root; they should be 755" fi test_metadata assert $? 0 "Your agent produces meta-data which does not conform to ra-api-1.dtd" OCF_TESTER_FAIL_HAVE_BINARY=1 export OCF_TESTER_FAIL_HAVE_BINARY test_command meta-data rc=$? if [ $rc -eq 3 ]; then assert $rc 0 "Your agent does not support the meta-data action" else assert $rc 0 "The meta-data action cannot fail and must return 0" fi unset OCF_TESTER_FAIL_HAVE_BINARY ra_args="export $ra_args" eval $ra_args test_command validate-all rc=$? if [ $rc -eq 3 ]; then assert $rc 0 "Your agent does not support the validate-all action" elif [ $rc -ne 0 ]; then assert $rc 0 "Validation failed. Did you supply enough options with -o ?" 1 usage $rc fi test_command monitor "Checking current state" rc=$? if [ $rc -eq 3 ]; then assert $rc 7 "Your agent does not support the monitor action" 1 elif [ $rc -eq 8 ]; then test_command demote "Cleanup, demote" assert $? 0 "Your agent was a master and could not be demoted" 1 test_command stop "Cleanup, stop" assert $? 0 "Your agent was a master and could not be stopped" 1 elif [ $rc -ne 7 ]; then test_command stop assert $? 0 "Your agent was active and could not be stopped" 1 fi test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" OCF_TESTER_FAIL_HAVE_BINARY=1 export OCF_TESTER_FAIL_HAVE_BINARY OCF_RESKEY_CRM_meta_interval=0 test_command monitor assert $? 
$stopped_rc:$installed_rc "The initial probe for a stopped resource should return $stopped_rc or $installed_rc even if all binaries are missing" unset OCF_TESTER_FAIL_HAVE_BINARY OCF_RESKEY_CRM_meta_interval=20000 test_command start assert $? 0 "Start failed. Did you supply enough options with -o ?" 1 test_command monitor assert $? 0 "Monitoring an active resource should return 0" OCF_RESKEY_CRM_meta_interval=0 test_command monitor assert $? 0 "Probing an active resource should return 0" OCF_RESKEY_CRM_meta_interval=20000 test_command notify rc=$? if [ $rc -eq 3 ]; then info "* Your agent does not support the notify action (optional)" else assert $rc 0 "The notify action cannot fail and must return 0" fi test_command demote "Checking for demote action" if [ $? -eq 3 ]; then has_demote=0 info "* Your agent does not support the demote action (optional)" fi test_command promote "Checking for promote action" if [ $? -eq 3 ]; then has_promote=0 info "* Your agent does not support the promote action (optional)" fi if [ $has_promote -eq 1 -a $has_demote -eq 1 ]; then test_command demote "Testing: demotion of started resource" assert $? 0 "Demoting a start resource should not fail" test_command promote assert $? 0 "Promote failed" test_command demote assert $? 0 "Demote failed" 1 test_command demote "Testing: demotion of demoted resource" assert $? 0 "Demoting a demoted resource should not fail" test_command promote "Promoting resource" assert $? 0 "Promote failed" 1 test_command promote "Testing: promotion of promoted resource" assert $? 0 "Promoting a promoted resource should not fail" test_command demote "Demoting resource" assert $? 0 "Demote failed" 1 elif [ $has_promote -eq 0 -a $has_demote -eq 0 ]; then info "* Your agent does not support master/slave (optional)" else echo "* Your agent partially supports master/slave" num_errors=`expr $num_errors + 1` fi test_command stop assert $? 0 "Stop failed" 1 test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" test_command start "Restarting resource..." assert $? 0 "Start failed" 1 test_command monitor assert $? 0 "Monitoring an active resource should return 0" test_command start "Testing: starting a started resource" assert $? 0 "Starting a running resource is required to succeed" test_command monitor assert $? 0 "Monitoring an active resource should return 0" test_command stop "Stopping resource" assert $? 0 "Stop could not clean up after multiple starts" 1 test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" test_command stop "Testing: stopping a stopped resource" assert $? 0 "Stopping a stopped resource is required to succeed" test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" test_command migrate_to "Checking for migrate_to action" rc=$? if [ $rc -ne 3 ]; then test_command migrate_from "Checking for migrate_from action" fi if [ $? -eq 3 ]; then info "* Your agent does not support the migrate action (optional)" fi test_command reload "Checking for reload action" if [ $? -eq 3 ]; then info "* Your agent does not support the reload action (optional)" fi if [ $num_errors -gt 0 ]; then echo "Tests failed: $agent failed $num_errors tests" >&2 exit 1 else echo $agent passed all tests exit 0 fi # vim:et:ts=8:sw=4
On Fri, Oct 14, 2016 at 2:52 AM, Babu Shanmugam <bschanmu@redhat.com> wrote: > > > On Friday 14 October 2016 04:00 AM, Andy Zhou wrote: > > > > Done. Now it shows the following. > > [root@h2 ovs]# crm configure show >> >> node 1: h1 \ >> >> attributes >> >> node 2: h2 >> >> primitive ClusterIP IPaddr2 \ >> >> params ip=10.33.75.200 cidr_netmask=32 \ >> >> op start interval=0s timeout=20s \ >> >> op stop interval=0s timeout=20s \ >> >> op monitor interval=30s >> >> primitive ovndb ocf:ovn:ovndb-servers \ >> >> op start interval=0s timeout=30s \ >> >> op stop interval=0s timeout=20s \ >> >> op promote interval=0s timeout=50s \ >> >> op demote interval=0s timeout=50s \ >> >> op monitor interval=1min \ >> >> meta >> >> ms ovndb-master ovndb \ >> >> meta notify=true >> >> colocation colocation-ovndb-master-ClusterIP-INFINITY inf: ovndb-master:Started >> ClusterIP:Master >> >> order order-ClusterIP-ovndb-master-mandatory inf: ClusterIP:start >> ovndb-master:start >> >> property cib-bootstrap-options: \ >> >> have-watchdog=false \ >> >> dc-version=1.1.13-10.el7_2.4-44eb2dd \ >> >> cluster-infrastructure=corosync \ >> >> cluster-name=mycluster \ >> >> stonith-enabled=false >> >> property ovn_ovsdb_master_server: \ >> > > >> >> >> > My installation does not have ocf-tester, There is a program called ocft > with a test option. Not sure if this is a suitable replacement. If not, how > could I get > the ocf-tester program? I ran the ocft program and get the following > output. Not sure what it means. > > [root@h2 ovs]# ocft test -n test-ovndb -o master_ip 10.0.0.1 > /usr/share/openvswitch/scripts/ovndb-servers.ocf > > ERROR: cases directory not found. > > > > I have attached ocf-tester with this mail. I guess it's a standalone > script. If it does not work, I think it's better not to attempt anymore as > we have another way to find out. > > > > Alternately, you can check if the ovsdb servers are started properly by >> running >> >> /usr/share/openvswitch/scripts/ovn-ctl --db-sb-sync-from=10.0.0.1 >> --db-nb-sync-from=10.0.0.1 start_ovsdb >> >> >> The output are as follows. Should we use --db-sb-sync-from-addr instead? > [root@h2 ovs]# /usr/share/openvswitch/scripts/ovn-ctl > --db-sb-sync-from=10.0.0.1 --db-nb-sync-from=10.0.0.1 start_ovsdb > > /usr/share/openvswitch/scripts/ovn-ctl: unknown option > "--db-sb-sync-from=10.0.0.1" (use --help for help) > /usr/share/openvswitch/scripts/ovn-ctl: unknown option > "--db-nb-sync-from=10.0.0.1" (use --help for help) > > 'ovn-ctl' runs without any error message after I fixed the command line > parameter. > > > I am sorry for the misinformation, Andy. What you ran is correct. Could > you check the status of the ovsdb servers in h2 after you run the above > command using the following commands. > > ovn-ctl status_ovnnb > ovn-ctl status_ovnsb > > Both the above commands should return "running/backup". If you see in the > OCF script in the function ovsdb_server_start(), we wait indefinitely > till the DB servers are started. Since the 'start' action on h2 times out, > I doubt that the servers are not started properly. > > O.K. I was able to get both server up. Mostly by try-and-error. 
[root@h2 openvswitch]# crm status
>>
>> Last updated: Fri Oct 14 09:34:50 2016    Last change: Thu Oct 13 11:17:25 2016 by root via cibadmin on h2
>> Stack: corosync
>> Current DC: h1 (version 1.1.13-10.el7_2.4-44eb2dd) - partition with quorum
>> 2 nodes and 3 resources configured
>>
>> Online: [ h1 h2 ]
>>
>> Full list of resources:
>>
>> ClusterIP  (ocf::heartbeat:IPaddr2):       Started h1
>> Master/Slave Set: ovndb-master [ovndb]
>>     Masters: [ h1 ]
>>     Slaves: [ h2 ]
>>

At this point, I think the scripts can work and will probably work for the integration task you had in mind. However, when it does not work, debugging it may not be trivial. Not sure if this is a show stopper. I also don't know pacemaker well enough to compare OVN with other HA components.

From this email thread, it should be clear the patch set can use more documentation:
1) Enhance the ovn-ctl man page.
2) Add an integration guide on integration with pacemaker, or add an 'HA' section in IntegrationGuide.md.
3) Consider adding logs in case of error to help troubleshooting; I don't have specific suggestions.

Would you please make those changes and post a V2? Thanks.
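On point 3), one low-effort approach would be for the OCF agent to log the failing step before returning an error, using the standard ocf_log helper from the OCF shell functions. This is a rough, illustrative sketch only; the ovn-ctl invocation and the MASTER_IP variable shown here are placeholders, not the agent's actual code:

    # Illustrative only: report why start_ovsdb failed instead of timing out silently.
    if ! /usr/share/openvswitch/scripts/ovn-ctl \
            --db-nb-sync-from-addr=$MASTER_IP --db-sb-sync-from-addr=$MASTER_IP start_ovsdb; then
        ocf_log err "ovndb-servers: ovn-ctl start_ovsdb failed on $(hostname)"
        return $OCF_ERR_GENERIC
    fi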
On Saturday 15 October 2016 02:25 AM, Andy Zhou wrote: > > > On Fri, Oct 14, 2016 at 2:52 AM, Babu Shanmugam <bschanmu@redhat.com > <mailto:bschanmu@redhat.com>> wrote: > > > > On Friday 14 October 2016 04:00 AM, Andy Zhou wrote: >> >> >> Done. Now it shows the following. >> >> [root@h2 ovs]# crm configure show >> >> node1: h1 \ >> >> attributes >> >> node2: h2 >> >> primitiveClusterIP IPaddr2 \ >> >> paramsip=10.33.75.200cidr_netmask=32\ >> >> opstart interval=0stimeout=20s\ >> >> opstop interval=0stimeout=20s\ >> >> opmonitor interval=30s >> >> primitiveovndb ocf:ovn:ovndb-servers \ >> >> opstart interval=0stimeout=30s\ >> >> opstop interval=0stimeout=20s\ >> >> oppromote interval=0stimeout=50s\ >> >> opdemote interval=0stimeout=50s\ >> >> opmonitor interval=1min\ >> >> meta >> >> msovndb-master ovndb\ >> >> metanotify=true >> >> colocationcolocation-ovndb-master-ClusterIP-INFINITY inf: >> ovndb-master:Started ClusterIP:Master >> >> orderorder-ClusterIP-ovndb-master-mandatory inf: >> ClusterIP:start ovndb-master:start >> >> propertycib-bootstrap-options: \ >> >> have-watchdog=false\ >> >> dc-version=1.1.13-10.el7_2.4-44eb2dd\ >> >> cluster-infrastructure=corosync\ >> >> cluster-name=mycluster\ >> >> stonith-enabled=false >> >> propertyovn_ovsdb_master_server: \ >> >> >> >> >> My installation does not have ocf-tester, There is a program >> called ocft with a test option. Not sure if this is a suitable >> replacement. If not, how could I get >> the ocf-tester program? I ran the ocft program and get the >> following output. Not sure what it means. >> >> [root@h2 ovs]# ocft test -n test-ovndb -o master_ip 10.0.0.1 >> /usr/share/openvswitch/scripts/ovndb-servers.ocf >> >> ERROR: cases directory not found. >> >> > > I have attached ocf-tester with this mail. I guess it's a > standalone script. If it does not work, I think it's better not to > attempt anymore as we have another way to find out. > >> >> >> Alternately, you can check if the ovsdb servers are started >> properly by running >> >> /usr/share/openvswitch/scripts/ovn-ctl >> --db-sb-sync-from=10.0.0.1 --db-nb-sync-from=10.0.0.1 start_ovsdb >> >> >> The output are as follows. Should we use --db-sb-sync-from-addr >> instead? >> [root@h2 ovs]# /usr/share/openvswitch/scripts/ovn-ctl >> --db-sb-sync-from=10.0.0.1 --db-nb-sync-from=10.0.0.1 start_ovsdb >> >> /usr/share/openvswitch/scripts/ovn-ctl: unknown option >> "--db-sb-sync-from=10.0.0.1" (use --help for help) >> >> /usr/share/openvswitch/scripts/ovn-ctl: unknown option >> "--db-nb-sync-from=10.0.0.1" (use --help for help) >> >> 'ovn-ctl' runs without any error message after I fixed the >> command line parameter. > > I am sorry for the misinformation, Andy. What you ran is correct. > Could you check the status of the ovsdb servers in h2 after you > run the above command using the following commands. > > ovn-ctl status_ovnnb > ovn-ctl status_ovnsb > > Both the above commands should return "running/backup". If you see > in the OCF script in the function ovsdb_server_start(), we wait > indefinitely till the DB servers are started. Since the 'start' > action on h2 times out, I doubt that the servers are not started > properly. > > O.K. I was able to get both server up. Mostly by try-and-error. 
> >> [root@h2 openvswitch]# crm status >> >> Last updated: Fri Oct 14 09:34:50 2016Last change: Thu Oct 13 >> 11:17:25 2016 by root via cibadmin on h2 >> >> Stack: corosync >> >> Current DC: h1 (version 1.1.13-10.el7_2.4-44eb2dd) - >> partition with quorum >> >> 2 nodes and 3 resources configured >> >> >> *Online*: [ h1 h2 ] >> >> >> Full list of resources: >> >> >> *ClusterIP*(ocf::heartbeat:IPaddr2):*Started*h1 >> >> *Master*/*Slave*Set: ovndb-*master*[ovndb] >> >> *Master*s: [ h1 ] >> >> *Slave*s: [ h2 ] >> > At this point, I think the scripts can work and will probably work for > the integration task you had in mind. However, when it does not work, > debugging it may not be tribal. Not sure if this is a show stopper. I > also don't know pacemaker well enough to compare OVN with other HA > components. > > From this email thread, It should be clear the patch set can use more > documentation, 1) Enhance ovn-ctl man page 2) Add an integration > guide on integration with pacemaker, or add an 'HA' section in the > IntegrationGuide.md. 3) consider adding logs in case of error to help > trouble shooting, I don't have specific suggestions > > Would you please make those changes and post a V2? Thanks. > I agree Andy. I will post a V2 soon. Thank you, Babu
diff --git a/ovn/utilities/ovn-ctl b/ovn/utilities/ovn-ctl index 07bff8a..1c1687f 100755 --- a/ovn/utilities/ovn-ctl +++ b/ovn/utilities/ovn-ctl @@ -26,6 +26,9 @@ for dir in "$sbindir" "$bindir" /sbin /bin /usr/sbin /usr/bin; do done +ovnnb_active_conf_file="$etcdir/ovnnb-active.conf" +ovnsb_active_conf_file="$etcdir/ovnsb-active.conf" +ovn_northd_db_conf_file="$etcdir/ovn-northd-db-params.conf" ## ----- ## ## start ## ## ----- ## @@ -45,6 +48,44 @@ stop_ovsdb () { fi } +demote_ovnnb() { + if test ! -z "$DB_NB_SYNC_FROM_ADDR"; then + echo "tcp:$DB_NB_SYNC_FROM_ADDR:$DB_NB_SYNC_FROM_PORT" > $ovnnb_active_conf_file + fi + + if test -e $ovnnb_active_conf_file; then + ovs-appctl -t $rundir/ovnnb_db.ctl ovsdb-server/set-active-ovsdb-server `cat $ovnnb_active_conf_file` + ovs-appctl -t $rundir/ovnnb_db.ctl ovsdb-server/connect-active-ovsdb-server + else + echo >&2 "$0: active server details not set" + exit 1 + fi +} + +demote_ovnsb() { + if test ! -z "$DB_SB_SYNC_FROM_ADDR"; then + echo "tcp:$DB_SB_SYNC_FROM_ADDR:$DB_SB_SYNC_FROM_PORT" > $ovnsb_active_conf_file + fi + + if test -e $ovnsb_active_conf_file; then + ovs-appctl -t $rundir/ovnsb_db.ctl ovsdb-server/set-active-ovsdb-server `cat $ovnsb_active_conf_file` + ovs-appctl -t $rundir/ovnsb_db.ctl ovsdb-server/connect-active-ovsdb-server + else + echo >&2 "$0: active server details not set" + exit 1 + fi +} + +promote_ovnnb() { + rm -f $ovnnb_active_conf_file + ovs-appctl -t $rundir/ovnnb_db.ctl ovsdb-server/disconnect-active-ovsdb-server +} + +promote_ovnsb() { + rm -f $ovnsb_active_conf_file + ovs-appctl -t $rundir/ovnsb_db.ctl ovsdb-server/disconnect-active-ovsdb-server +} + start_ovsdb () { # Check and eventually start ovsdb-server for Northbound DB if ! pidfile_is_running $DB_NB_PID; then @@ -52,7 +93,20 @@ start_ovsdb () { set ovsdb-server - set "$@" --detach --monitor $OVN_NB_LOG --log-file=$OVN_NB_LOGFILE --remote=punix:$DB_NB_SOCK --remote=ptcp:$DB_NB_PORT:$DB_NB_ADDR --pidfile=$DB_NB_PID --unixctl=ovnnb_db.ctl + set "$@" --detach --monitor $OVN_NB_LOG \ + --log-file=$OVN_NB_LOGFILE \ + --remote=punix:$DB_NB_SOCK \ + --remote=ptcp:$DB_NB_PORT:$DB_NB_ADDR \ + --pidfile=$DB_NB_PID \ + --unixctl=ovnnb_db.ctl + + if test ! -z "$DB_NB_SYNC_FROM_ADDR"; then + echo "tcp:$DB_NB_SYNC_FROM_ADDR:$DB_NB_SYNC_FROM_PORT" > $ovnnb_active_conf_file + fi + + if test -e $ovnnb_active_conf_file; then + set "$@" --sync-from=`cat $ovnnb_active_conf_file` + fi $@ $DB_NB_FILE fi @@ -63,11 +117,45 @@ start_ovsdb () { set ovsdb-server - set "$@" --detach --monitor $OVN_SB_LOG --log-file=$OVN_SB_LOGFILE --remote=punix:$DB_SB_SOCK --remote=ptcp:$DB_SB_PORT:$DB_SB_ADDR --pidfile=$DB_SB_PID --unixctl=ovnsb_db.ctl + set "$@" --detach --monitor $OVN_SB_LOG \ + --log-file=$OVN_SB_LOGFILE \ + --remote=punix:$DB_SB_SOCK \ + --remote=ptcp:$DB_SB_PORT:$DB_SB_ADDR \ + --pidfile=$DB_SB_PID \ + --unixctl=ovnsb_db.ctl + + if test ! -z "$DB_SB_SYNC_FROM_ADDR"; then + echo "tcp:$DB_SB_SYNC_FROM_ADDR:$DB_SB_SYNC_FROM_PORT" > $ovnsb_active_conf_file + fi + + if test -e $ovnsb_active_conf_file; then + set "$@" --sync-from=`cat $ovnsb_active_conf_file` + fi + $@ $DB_SB_FILE fi } +sync_status() { + ovs-appctl -t $rundir/ovn${1}_db.ctl ovsdb-server/sync-status | awk '{if(NR==1) print $2}' +} + +status_ovnnb() { + if ! pidfile_is_running $DB_NB_PID; then + echo "not-running" + else + echo "running/$(sync_status nb)" + fi +} + +status_ovnsb() { + if ! pidfile_is_running $DB_SB_PID; then + echo "not-running" + else + echo "running/$(sync_status sb)" + fi +} + status_ovsdb () { if ! 
pidfile_is_running $DB_NB_PID; then log_success_msg "OVN Northbound DB is not running" @@ -83,29 +171,36 @@ status_ovsdb () { } start_northd () { - if test X"$OVN_MANAGE_OVSDB" = Xyes; then - start_ovsdb - fi + if [ ! -e $ovn_northd_db_conf_file ]; then + if test X"$OVN_MANAGE_OVSDB" = Xyes; then + start_ovsdb + fi + + if ! pidfile_is_running $DB_NB_PID; then + log_failure_msg "OVN Northbound DB is not running" + exit + fi + if ! pidfile_is_running $DB_SB_PID; then + log_failure_msg "OVN Southbound DB is not running" + exit + fi + ovn_northd_params="--ovnnb-db=unix:$DB_NB_SOCK --ovnsb-db=unix:$DB_SB_SOCK" + else + ovn_northd_params="`cat $ovn_northd_db_conf_file`" + fi - if ! pidfile_is_running $DB_NB_PID; then - log_failure_msg "OVN Northbound DB is not running" - exit - fi - if ! pidfile_is_running $DB_SB_PID; then - log_failure_msg "OVN Southbound DB is not running" - exit - fi + if daemon_is_running ovn-northd; then + log_success_msg "ovn-northd is already running" + else + set ovn-northd + if test X"$OVN_NORTHD_LOGFILE" != X; then + set "$@" --log-file=$OVN_NORTHD_LOGFILE + fi - if daemon_is_running ovn-northd; then - log_success_msg "ovn-northd is already running" - else - set ovn-northd - if test X"$OVN_NORTHD_LOGFILE" != X; then - set "$@" --log-file=$OVN_NORTHD_LOGFILE - fi - set "$@" $OVN_NORTHD_LOG --ovnnb-db=unix:$DB_NB_SOCK --ovnsb-db=unix:$DB_SB_SOCK - OVS_RUNDIR=${OVN_RUNDIR} start_daemon "$OVN_NORTHD_PRIORITY" "$OVN_NORTHD_WRAPPER" "$@" - fi + set "$@" $OVN_NORTHD_LOG $ovn_northd_params + + OVS_RUNDIR=${OVN_RUNDIR} start_daemon "$OVN_NORTHD_PRIORITY" "$OVN_NORTHD_WRAPPER" "$@" + fi } start_controller () { @@ -127,8 +222,10 @@ start_controller_vtep () { stop_northd () { OVS_RUNDIR=${OVN_RUNDIR} stop_daemon ovn-northd - if test X"$OVN_MANAGE_OVSDB" = Xyes; then - stop_ovsdb + if [ ! -e $ovn_northd_db_conf_file ]; then + if test X"$OVN_MANAGE_OVSDB" = Xyes; then + stop_ovsdb + fi fi } @@ -176,12 +273,16 @@ set_defaults () { DB_NB_FILE=$dbdir/ovnnb_db.db DB_NB_ADDR=0.0.0.0 DB_NB_PORT=6641 + DB_NB_SYNC_FROM_ADDR= + DB_NB_SYNC_FROM_PORT=6641 DB_SB_SOCK=$rundir/ovnsb_db.sock DB_SB_PID=$rundir/ovnsb_db.pid DB_SB_FILE=$dbdir/ovnsb_db.db DB_SB_ADDR=0.0.0.0 DB_SB_PORT=6642 + DB_SB_SYNC_FROM_ADDR= + DB_SB_SYNC_FROM_PORT=6642 DB_NB_SCHEMA=$datadir/ovn-nb.ovsschema DB_SB_SCHEMA=$datadir/ovn-sb.ovsschema @@ -272,6 +373,10 @@ File location options: --db-sb-port=PORT OVN Southbound db ptcp port (default: $DB_SB_PORT) --ovn-nb-logfile=FILE OVN Northbound log file (default: $OVN_NB_LOGFILE) --ovn-sb-logfile=FILE OVN Southbound log file (default: $OVN_SB_LOGFILE) + --db-nb-sync-from-addr=ADDR OVN Northbound active db tcp address (default: $DB_NB_SYNC_FROM_ADDR) + --db-nb-sync-from-port=PORT OVN Northdbound active db tcp port (default: $DB_NB_SYNC_FROM_PORT) + --db-sb-sync-from-addr=ADDR OVN Southbound active db tcp address (default: $DB_SB_SYNC_FROM_ADDR) + --db-sb-sync-from-port=ADDR OVN Southbound active db tcp port (default: $DB_SB_SYNC_FROM_PORT) Default directories with "configure" option and environment variable override: logs: /usr/local/var/log/openvswitch (--with-logdir, OVS_LOGDIR) @@ -377,6 +482,24 @@ case $command in status_controller_vtep) daemon_status ovn-controller-vtep || exit 1 ;; + promote_ovnnb) + promote_ovnnb + ;; + promote_ovnsb) + promote_ovnsb + ;; + demote_ovnnb) + demote_ovnnb + ;; + demote_ovnsb) + demote_ovnsb + ;; + status_ovnnb) + status_ovnnb + ;; + status_ovnsb) + status_ovnsb + ;; help) usage ;;
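A closing usage note on the ovn-northd-db-params.conf hook added above: per the commit message, when this file exists start_northd() does not start local DB servers (even with OVN_MANAGE_OVSDB=yes) and passes the file's contents to ovn-northd as-is. Pointing northd at remote databases therefore looks roughly like this (addresses are the placeholders from the commit message; etcdir assumed to be /etc/openvswitch as elsewhere in this thread):

    echo "--ovnnb-db=tcp:172.16.247.230:6641 --ovnsb-db=tcp:172.16.247.230:6642" \
        > /etc/openvswitch/ovn-northd-db-params.conf
    /usr/share/openvswitch/scripts/ovn-ctl start_northd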