
[ovs-dev,RFC,52/52] ovsdb: Introduce support for clustered databases.

Message ID 20170919220125.32535-53-blp@ovn.org
State RFC
Series clustering implementation

Commit Message

Ben Pfaff Sept. 19, 2017, 10:01 p.m. UTC
This commit adds support for OVSDB clustering via Raft.  Please read
ovsdb(7) for information on how to set up a clustered database.  It is
simple and boils down to running "ovsdb-tool create-cluster" on one server
and "ovsdb-tool join-cluster" on each of the others and then starting
ovsdb-server in the usual way on all of them.
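
For example, a three-server cluster for the OVN Southbound database might be
brought up like this (a hedged sketch: the paths, addresses, and ports here
are invented, and the exact ovsdb-tool arguments are as described in the
ovsdb-tool(1) changes in this series):

    # On the first server, create the cluster:
    ovsdb-tool create-cluster /etc/openvswitch/ovnsb.db \
        ovn-sb.ovsschema tcp:10.0.0.1:6644

    # On each of the other servers, join the cluster started above:
    ovsdb-tool join-cluster /etc/openvswitch/ovnsb.db OVN_Southbound \
        tcp:10.0.0.2:6644 tcp:10.0.0.1:6644

    # Then start ovsdb-server in the usual way on every server, e.g.:
    ovsdb-server --remote=ptcp:6642 /etc/openvswitch/ovnsb.db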

Once you have a clustered database, you configure ovn-controller and
ovn-northd to use it by pointing them to all of the servers, e.g. where
previously you might have said "tcp:1.2.3.4" was the database server,
now you say that it is "tcp:1.2.3.4,tcp:5.6.7.8,tcp:9.10.11.12".
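
For instance, ovn-northd could be pointed at the cluster on its command line
(a sketch; the addresses are the example ones above, and both databases are
shown on the same cluster only for brevity), and ovn-controller reads the
same kind of comma-separated string from external-ids:ovn-remote in the local
Open vSwitch database:

    ovn-northd --ovnnb-db=tcp:1.2.3.4,tcp:5.6.7.8,tcp:9.10.11.12 \
               --ovnsb-db=tcp:1.2.3.4,tcp:5.6.7.8,tcp:9.10.11.12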

Signed-off-by: Ben Pfaff <blp@ovn.org>
---
 lib/.gitignore                  |    3 +
 lib/automake.mk                 |   10 +
 lib/ovsdb-idl.c                 |  502 +++--
 lib/ovsdb-server-idl.ann        |    9 +
 lib/ovsdb-session.c             |   72 +
 lib/ovsdb-session.h             |   25 +
 lib/uuid.h                      |   12 +
 ovn/controller/ovn-controller.c |    1 +
 ovsdb/TODO.rst                  |   64 +
 ovsdb/_server.ovsschema         |   16 +-
 ovsdb/_server.xml               |   42 +-
 ovsdb/automake.mk               |   10 +
 ovsdb/execution.c               |   95 +-
 ovsdb/file.c                    |  741 +------
 ovsdb/file.h                    |   38 +-
 ovsdb/jsonrpc-server.c          |    9 +
 ovsdb/log.c                     |  266 ++-
 ovsdb/log.h                     |    9 +-
 ovsdb/ovsdb-client.1.in         |   27 +-
 ovsdb/ovsdb-client.c            |  161 +-
 ovsdb/ovsdb-server.1.in         |   78 +-
 ovsdb/ovsdb-server.c            |  287 ++-
 ovsdb/ovsdb-tool.1.in           |  123 +-
 ovsdb/ovsdb-tool.c              |  503 ++++-
 ovsdb/ovsdb.5.xml               |  219 ++-
 ovsdb/ovsdb.7.xml               |  159 +-
 ovsdb/ovsdb.c                   |  118 +-
 ovsdb/ovsdb.h                   |   26 +-
 ovsdb/raft-private.c            |  358 ++++
 ovsdb/raft-private.h            |  123 ++
 ovsdb/raft-rpc.c                |  788 ++++++++
 ovsdb/raft-rpc.h                |  271 +++
 ovsdb/raft.c                    | 4105 +++++++++++++++++++++++++++++++++++++++
 ovsdb/raft.h                    |  142 ++
 ovsdb/replication.c             |    6 +-
 ovsdb/row.c                     |    3 +
 ovsdb/server.c                  |   14 +-
 ovsdb/server.h                  |    2 +-
 ovsdb/storage.c                 |  528 +++++
 ovsdb/storage.h                 |   88 +
 ovsdb/transaction.c             |  210 +-
 ovsdb/transaction.h             |   19 +-
 ovsdb/trigger.c                 |  228 ++-
 ovsdb/trigger.h                 |   26 +-
 tests/.gitignore                |    1 +
 tests/automake.mk               |   12 +-
 tests/ovsdb-cluster.at          |   78 +
 tests/ovsdb-idl.at              |    2 +-
 tests/ovsdb-log.at              |    2 +-
 tests/ovsdb-server.at           |   15 +-
 tests/ovsdb.at                  |    1 +
 tests/test-ovsdb.c              |    9 +-
 tests/test-raft.c               |  303 +++
 tests/test-raft.sh              |   13 +
 tests/test-raft2.sh             |   12 +
 tests/test-raft3.sh             |   14 +
 tests/test-raft4.sh             |   37 +
 tests/torture-raft4.sh          |   23 +
 tutorial/ovs-sandbox            |    2 +-
 59 files changed, 9775 insertions(+), 1285 deletions(-)
 create mode 100644 lib/ovsdb-server-idl.ann
 create mode 100644 lib/ovsdb-session.c
 create mode 100644 lib/ovsdb-session.h
 create mode 100644 ovsdb/TODO.rst
 create mode 100644 ovsdb/raft-private.c
 create mode 100644 ovsdb/raft-private.h
 create mode 100644 ovsdb/raft-rpc.c
 create mode 100644 ovsdb/raft-rpc.h
 create mode 100644 ovsdb/raft.c
 create mode 100644 ovsdb/raft.h
 create mode 100644 ovsdb/storage.c
 create mode 100644 ovsdb/storage.h
 create mode 100644 tests/ovsdb-cluster.at
 create mode 100644 tests/test-raft.c
 create mode 100755 tests/test-raft.sh
 create mode 100755 tests/test-raft2.sh
 create mode 100755 tests/test-raft3.sh
 create mode 100755 tests/test-raft4.sh
 create mode 100755 tests/torture-raft4.sh

Patch

diff --git a/lib/.gitignore b/lib/.gitignore
index 0680af657b37..7d7f4271b4f2 100644
--- a/lib/.gitignore
+++ b/lib/.gitignore
@@ -9,6 +9,9 @@ 
 /ofp-actions.inc2
 /ofp-errors.inc
 /ofp-msgs.inc
+/ovsdb-server-idl.c
+/ovsdb-server-idl.h
+/ovsdb-server-idl.ovsidl
 /ovs-fields.7
 /stdio.h
 /string.h
diff --git a/lib/automake.mk b/lib/automake.mk
index 2415f4cd6c25..bcf57195dffb 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -196,6 +196,8 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/ovsdb-condition.c \
 	lib/ovsdb-parser.c \
 	lib/ovsdb-parser.h \
+	lib/ovsdb-session.c \
+	lib/ovsdb-session.h \
 	lib/ovsdb-types.c \
 	lib/ovsdb-types.h \
 	lib/packets.c \
@@ -326,6 +328,8 @@  EXTRA_DIST += \
 
 nodist_lib_libopenvswitch_la_SOURCES = \
 	lib/dirs.c \
+	lib/ovsdb-server-idl.c \
+	lib/ovsdb-server-idl.h \
 	lib/vswitch-idl.c \
 	lib/vswitch-idl.h
 CLEANFILES += $(nodist_lib_libopenvswitch_la_SOURCES)
@@ -542,6 +546,12 @@  lib/ofp-msgs.lo: lib/ofp-msgs.inc
 CLEANFILES += lib/ofp-msgs.inc
 EXTRA_DIST += build-aux/extract-ofp-msgs
 
+# _server IDL
+OVSIDL_BUILT += lib/ovsdb-server-idl.c lib/ovsdb-server-idl.h lib/ovsdb-server-idl.ovsidl
+EXTRA_DIST += lib/ovsdb-server-idl.ann
+lib/ovsdb-server-idl.ovsidl: ovsdb/_server.ovsschema lib/ovsdb-server-idl.ann
+	$(AM_V_GEN)$(OVSDB_IDLC) annotate $(srcdir)/ovsdb/_server.ovsschema $(srcdir)/lib/ovsdb-server-idl.ann > $@.tmp && mv $@.tmp $@
+
 INSTALL_DATA_LOCAL += lib-install-data-local
 lib-install-data-local:
 	$(MKDIR_P) $(DESTDIR)$(PKIDIR)
diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c
index 949c15e11940..b37917dfe3b2 100644
--- a/lib/ovsdb-idl.c
+++ b/lib/ovsdb-idl.c
@@ -36,10 +36,13 @@ 
 #include "ovsdb-error.h"
 #include "ovsdb-idl-provider.h"
 #include "ovsdb-parser.h"
+#include "ovsdb-server-idl.h"
+#include "ovsdb-session.h"
 #include "poll-loop.h"
 #include "openvswitch/shash.h"
 #include "skiplist.h"
 #include "sset.h"
+#include "svec.h"
 #include "util.h"
 #include "uuid.h"
 #include "openvswitch/vlog.h"
@@ -81,42 +84,92 @@  struct ovsdb_idl_arc {
 
 /* Connection state machine.
  *
- * When a JSON-RPC session connects, the IDL sends a "get_schema" request and
- * transitions to IDL_S_SCHEMA_REQUESTED.  If the session drops and reconnects,
- * the IDL starts over again in the same way. */
-enum ovsdb_idl_state {
-    /* Waits for "get_schema" reply, then sends a "monitor_cond" request whose
-     * details are informed by the schema and transitions to
-     * IDL_S_MONITOR_COND_REQUESTED. */
-    IDL_S_SCHEMA_REQUESTED,
+ * When a JSON-RPC session connects, the IDL sends a "monitor_cond" request for
+ * the Database table in the _Server database and transitions to the
+ * IDL_S_SERVER_MONITOR_COND_REQUESTED state.  If the session drops and
+ * reconnects, the IDL starts over again in the same way. */
+#define OVSDB_IDL_STATES                                                \
+    /* Waits for "get_schema" reply, then sends "monitor_cond"          \
+     * request for the Database table in the _Server database, whose    \
+     * details are informed by the schema, and transitions to           \
+     * IDL_S_SERVER_MONITOR_COND_REQUESTED. */                          \
+    OVSDB_IDL_STATE(SERVER_SCHEMA_REQUESTED)                            \
+                                                                        \
+    /* Waits for "monitor_cond" reply for the Database table:           \
+     *                                                                  \
+     * - If the reply indicates success, and the Database table has a   \
+     *   row for the IDL database:                                      \
+     *                                                                  \
+     *   * If the row indicates that this is a clustered database       \
+     *     that is not connected to the cluster, closes the             \
+     *     connection.  The next connection attempt has a chance at     \
+     *     picking a connected server.                                  \
+     *                                                                  \
+     *   * Otherwise, sends a "monitor_cond" request for the IDL        \
+     *     database whose details are informed by the schema            \
+     *     (obtained from the row), and transitions to                  \
+     *     IDL_S_DATA_MONITOR_COND_REQUESTED.                           \
+     *                                                                  \
+     *     XXX Should also send set_db_change_aware and monitor the     \
+     *     database status                                              \
+     *                                                                  \
+     * - If the reply indicates success, but the Database table does    \
+     *   not have a row for the IDL database, transitions to            \
+     *   IDL_S_ERROR.                                                   \
+     *                                                                  \
+     * - If the reply indicates failure, sends a "get_schema" request   \
+     *   for the IDL database and transitions to                        \
+     *   IDL_S_DATA_SCHEMA_REQUESTED. */                                \
+    OVSDB_IDL_STATE(SERVER_MONITOR_COND_REQUESTED)                      \
+                                                                        \
+    /* Waits for "get_schema" reply, then sends "monitor_cond"          \
+     * request whose details are informed by the schema, and            \
+     * transitions to IDL_S_DATA_MONITOR_COND_REQUESTED. */             \
+    OVSDB_IDL_STATE(DATA_SCHEMA_REQUESTED)                              \
+                                                                        \
+    /* Waits for "monitor_cond" reply.  If successful, replaces the     \
+     * IDL contents by the data carried in the reply and transitions    \
+     * to IDL_S_MONITORING.  On failure, sends a "monitor" request      \
+     * and transitions to IDL_S_DATA_MONITOR_REQUESTED. */              \
+    OVSDB_IDL_STATE(DATA_MONITOR_COND_REQUESTED)                        \
+                                                                        \
+    /* Waits for "monitor" reply.  If successful, replaces the IDL      \
+     * contents by the data carried in the reply and transitions to     \
+     * IDL_S_MONITORING.  On failure, transitions to IDL_S_ERROR. */    \
+    OVSDB_IDL_STATE(DATA_MONITOR_REQUESTED)                             \
+                                                                        \
+    /* State that processes "update" or "update2" notifications for     \
+     * the main database (and the Database table in _Server if          \
+     * available).                                                      \
+     *                                                                  \
+     * If we're monitoring the Database table and we get notified       \
+     * that the IDL database has been deleted, we close the             \
+     * connection (which will restart the state machine). */            \
+    OVSDB_IDL_STATE(MONITORING)                                         \
+    OVSDB_IDL_STATE(MONITORING_COND)                                    \
+                                                                        \
+    /* Terminal error state that indicates that nothing useful can be   \
+     * done, for example because the database server doesn't actually   \
+     * have the desired database.  We maintain the session with the     \
+     * database server anyway.  If it starts serving the database       \
+     * that we want, or if someone fixes and restarts the database,     \
+     * then it will kill the session and we will automatically          \
+     * reconnect and try again. */                                      \
+    OVSDB_IDL_STATE(ERROR)                                              \
+                                                                        \
+    /* Terminal state that indicates we connected to a useless server   \
+     * in a cluster, e.g. one that is partitioned from the rest of      \
+     * the cluster. We're waiting to retry. */                          \
+    OVSDB_IDL_STATE(RETRY)
 
-    /* Waits for "monitor_cond" reply:
-     *
-     *    - If the reply indicates success, replaces the IDL contents by the
-     *      data carried in the reply and transitions to IDL_S_MONITORING_COND.
-     *
-     *    - If the reply indicates failure because the database is too old to
-     *      support monitor_cond, sends a "monitor" request and transitions to
-     *      IDl_S_MONITOR_REQUESTED.  */
-    IDL_S_MONITOR_COND_REQUESTED,
-
-    /* Waits for "monitor" reply, then replaces the IDL contents by the data
-     * carried in the reply and transitions to IDL_S_MONITORING.  */
-    IDL_S_MONITOR_REQUESTED,
-
-    /* Terminal states that process "update2" (IDL_S_MONITORING_COND) or
-     * "update" (IDL_S_MONITORING) notifications. */
-    IDL_S_MONITORING_COND,
-    IDL_S_MONITORING,
-
-    /* Terminal error state that indicates that nothing useful can be done.
-     * The most likely reason is that the database server doesn't actually have
-     * the desired database.  We maintain the session with the database server
-     * anyway.  If it starts serving the database that we want, then it will
-     * kill the session and we will automatically reconnect and try again. */
-    IDL_S_NO_SCHEMA
+enum ovsdb_idl_state {
+#define OVSDB_IDL_STATE(NAME) IDL_S_##NAME,
+    OVSDB_IDL_STATES
+#undef OVSDB_IDL_STATE
 };
 
+static const char *ovsdb_idl_state_to_string(enum ovsdb_idl_state);
+
 struct ovsdb_idl_db {
     struct ovsdb_idl *idl;
 
@@ -159,6 +212,8 @@  static unsigned int ovsdb_idl_db_set_condition(
 
 static void ovsdb_idl_send_schema_request(struct ovsdb_idl *,
                                           struct ovsdb_idl_db *);
+static void ovsdb_idl_send_db_change_aware(struct ovsdb_idl *);
+static bool ovsdb_idl_check_server_db(struct ovsdb_idl *);
 static void ovsdb_idl_send_monitor_request(struct ovsdb_idl *,
                                            struct ovsdb_idl_db *,
                                            bool use_monitor_cond);
@@ -178,9 +233,18 @@  struct ovsdb_idl {
     unsigned int state_seqno;        /* See above. */
     struct json *request_id;         /* JSON ID for request awaiting reply. */
 
+    struct uuid cid;
+
     bool use_monitor_cond;
+    bool monitoring_server;
+    bool leader_only;
 };
 
+static void ovsdb_idl_transition_at(struct ovsdb_idl *, enum ovsdb_idl_state,
+                                    const char *where);
+#define ovsdb_idl_transition(IDL, STATE) \
+    ovsdb_idl_transition_at(IDL, STATE, OVS_SOURCE_LOCATOR)
+
 struct ovsdb_idl_txn {
     struct hmap_node hmap_node;
     struct json *request_id;
@@ -296,6 +360,18 @@  static void ovsdb_idl_add_to_indexes(const struct ovsdb_idl_row *);
 static void ovsdb_idl_remove_from_indexes(const struct ovsdb_idl_row *);
 
 static void
+ovsdb_idl_open_session(struct ovsdb_idl *idl, const char *remote, bool retry)
+{
+    ovs_assert(!idl->db.txn);
+    jsonrpc_session_close(idl->session);
+
+    struct svec remotes = SVEC_EMPTY_INITIALIZER;
+    ovsdb_session_parse_remote(remote, &remotes, &idl->cid);
+    idl->session = jsonrpc_session_open_multiple((const char **) remotes.names,
+                                                 remotes.n, retry);
+}
+
+static void
 ovsdb_idl_db_init(struct ovsdb_idl_db *db, const struct ovsdb_idl_class *class,
                   struct ovsdb_idl *parent, bool monitor_everything_by_default)
 {
@@ -365,10 +441,26 @@  ovsdb_idl_create(const char *remote, const struct ovsdb_idl_class *class,
     struct ovsdb_idl *idl;
 
     idl = xzalloc(sizeof *idl);
+    ovsdb_idl_db_init(&idl->server, &serverrec_idl_class, idl, true);
     ovsdb_idl_db_init(&idl->db, class, idl, monitor_everything_by_default);
-    idl->session = jsonrpc_session_open(remote, retry);
+    ovsdb_idl_open_session(idl, remote, retry);
     idl->state_seqno = UINT_MAX;
     idl->request_id = NULL;
+    idl->leader_only = true;
+
+    /* Monitor the Database table in the _Server database.
+     *
+     * We monitor only the row for 'class', or the row that has the
+     * desired 'cid'. */
+    struct ovsdb_idl_condition cond;
+    ovsdb_idl_condition_init(&cond);
+    if (!uuid_is_zero(&idl->cid)) {
+        serverrec_database_add_clause_cid(&cond, OVSDB_F_EQ, &idl->cid, 1);
+    } else {
+        serverrec_database_add_clause_name(&cond, OVSDB_F_EQ, class->database);
+    }
+    ovsdb_idl_db_set_condition(&idl->server, &serverrec_table_database, &cond);
+    ovsdb_idl_condition_destroy(&cond);
 
     return idl;
 }
@@ -379,7 +471,7 @@  ovsdb_idl_set_remote(struct ovsdb_idl *idl, const char *remote,
                      bool retry)
 {
     if (idl) {
-        idl->session = jsonrpc_session_open(remote, retry);
+        ovsdb_idl_open_session(idl, remote, retry);
         /* XXX update condition */
         idl->state_seqno = UINT_MAX;
     }
@@ -419,6 +511,17 @@  ovsdb_idl_destroy(struct ovsdb_idl *idl)
     }
 }
 
+void
+ovsdb_idl_set_leader_only(struct ovsdb_idl *idl, bool leader_only)
+{
+    idl->leader_only = leader_only;
+    if (leader_only
+        && idl->state == IDL_S_MONITORING_COND
+        && idl->monitoring_server) {
+        ovsdb_idl_check_server_db(idl);
+    }
+}
+
 static void
 ovsdb_idl_db_clear(struct ovsdb_idl_db *db)
 {
@@ -465,6 +568,29 @@  ovsdb_idl_db_clear(struct ovsdb_idl_db *db)
     }
 }
 
+static const char *
+ovsdb_idl_state_to_string(enum ovsdb_idl_state state)
+{
+    switch (state) {
+#define OVSDB_IDL_STATE(NAME) case IDL_S_##NAME: return #NAME;
+        OVSDB_IDL_STATES
+#undef OVSDB_IDL_STATE
+    default: return "<unknown>";
+    }
+}
+
+static void
+ovsdb_idl_transition_at(struct ovsdb_idl *idl, enum ovsdb_idl_state new_state,
+                        const char *where)
+{
+    VLOG_DBG("%s: %s -> %s at %s",
+             jsonrpc_session_get_name(idl->session),
+             ovsdb_idl_state_to_string(idl->state),
+             ovsdb_idl_state_to_string(new_state),
+             where);
+    idl->state = new_state;
+}
+
 static void
 ovsdb_idl_clear(struct ovsdb_idl *idl)
 {
@@ -479,6 +605,138 @@  ovsdb_idl_send_request(struct ovsdb_idl *idl, struct jsonrpc_msg *request)
     jsonrpc_session_send(idl->session, request);
 }
 
+static void
+ovsdb_idl_process_response(struct ovsdb_idl *idl, struct jsonrpc_msg *msg)
+{
+    bool ok = msg->type == JSONRPC_REPLY;
+    if (!ok
+        && idl->state != IDL_S_SERVER_SCHEMA_REQUESTED
+        && idl->state != IDL_S_SERVER_MONITOR_COND_REQUESTED
+        && idl->state != IDL_S_DATA_MONITOR_COND_REQUESTED) {
+        /* XXX Log error. */
+        ovsdb_idl_transition(idl, IDL_S_ERROR);
+        return;
+    }
+
+    switch (idl->state) {
+    case IDL_S_SERVER_SCHEMA_REQUESTED:
+        if (ok) {
+            json_destroy(idl->server.schema);
+            idl->server.schema = json_clone(msg->result);
+            ovsdb_idl_send_monitor_request(idl, &idl->server, true);
+            ovsdb_idl_transition(idl, IDL_S_SERVER_MONITOR_COND_REQUESTED);
+        } else {
+            ovsdb_idl_send_schema_request(idl, &idl->db);
+            ovsdb_idl_transition(idl, IDL_S_DATA_SCHEMA_REQUESTED);
+        }
+        break;
+
+    case IDL_S_SERVER_MONITOR_COND_REQUESTED:
+        if (ok) {
+            idl->monitoring_server = true;
+            ovsdb_idl_db_parse_monitor_reply(&idl->server, msg->result, true);
+            if (ovsdb_idl_check_server_db(idl)) {
+                ovsdb_idl_send_db_change_aware(idl);
+            }
+        } else {
+            ovsdb_idl_send_schema_request(idl, &idl->db);
+            ovsdb_idl_transition(idl, IDL_S_DATA_SCHEMA_REQUESTED);
+        }
+        break;
+
+    case IDL_S_DATA_SCHEMA_REQUESTED:
+        json_destroy(idl->db.schema);
+        idl->db.schema = json_clone(msg->result);
+        ovsdb_idl_send_monitor_request(idl, &idl->db, true);
+        ovsdb_idl_transition(idl, IDL_S_DATA_MONITOR_COND_REQUESTED);
+        break;
+
+    case IDL_S_DATA_MONITOR_COND_REQUESTED:
+        if (!ok) {
+            /* "monitor_cond" not supported.  Try "monitor". */
+            ovsdb_idl_send_monitor_request(idl, &idl->db, false);
+            ovsdb_idl_transition(idl, IDL_S_DATA_MONITOR_REQUESTED);
+        } else {
+            ovsdb_idl_transition(idl, IDL_S_MONITORING_COND);
+            ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, true);
+        }
+        break;
+
+    case IDL_S_DATA_MONITOR_REQUESTED:
+        ovsdb_idl_transition(idl, IDL_S_MONITORING);
+        ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, false);
+        idl->db.change_seqno++;
+        ovsdb_idl_clear(idl);
+        ovsdb_idl_db_parse_update(&idl->db, msg->result, false);
+        break;
+
+    case IDL_S_MONITORING:
+    case IDL_S_MONITORING_COND:
+        /* We don't normally have a request outstanding in this state.  If we
+         * do, it's a "monitor_cond_change", which means that the conditional
+         * monitor clauses were updated.
+         *
+         * If further condition changes were pending, send them now. */
+        ovsdb_idl_send_cond_change(idl);
+        idl->db.cond_seqno++;
+        break;
+
+    case IDL_S_ERROR:
+    case IDL_S_RETRY:
+        /* Nothing to do in this state. */
+        break;
+
+    default:
+        OVS_NOT_REACHED();
+    }
+}
+
+static void
+ovsdb_idl_process_msg(struct ovsdb_idl *idl, struct jsonrpc_msg *msg)
+{
+    bool is_response = (msg->type == JSONRPC_REPLY ||
+                        msg->type == JSONRPC_ERROR);
+
+    /* Process a reply to an outstanding request. */
+    if (is_response
+        && idl->request_id && json_equal(idl->request_id, msg->id)) {
+        json_destroy(idl->request_id);
+        idl->request_id = NULL;
+        ovsdb_idl_process_response(idl, msg);
+        return;
+    }
+
+    /* Process database contents updates. */
+    if (ovsdb_idl_db_parse_update_rpc(&idl->db, msg)) {
+        return;
+    }
+    if (idl->monitoring_server
+        && ovsdb_idl_db_parse_update_rpc(&idl->server, msg)) {
+        ovsdb_idl_check_server_db(idl);
+    }
+
+    /* Process "lock" replies and related notifications. */
+    if (ovsdb_idl_db_process_lock_replies(&idl->db, msg)) {
+        return;
+    }
+
+    /* Process response to a database transaction we submitted. */
+    if (is_response && ovsdb_idl_db_txn_process_reply(&idl->db, msg)) {
+        return;
+    }
+
+    /* Unknown message.  Log at a low level because this can happen if
+     * ovsdb_idl_txn_destroy() is called to destroy a transaction before we
+     * receive the reply.
+     *
+     * (We could sort those out from other kinds of unknown messages by using
+     * distinctive IDs for transactions, if it seems valuable to do so, and
+     * then it would be possible to use different log levels. XXX?) */
+    VLOG_DBG("%s: received unexpected %s message",
+             jsonrpc_session_get_name(idl->session),
+             jsonrpc_msg_type_to_string(msg->type));
+}
+
 /* Processes a batch of messages from the database server on 'idl'.  This may
  * cause the IDL's contents to change.  The client may check for that with
  * ovsdb_idl_get_seqno(). */
@@ -499,12 +757,11 @@  ovsdb_idl_run(struct ovsdb_idl *idl)
         seqno = jsonrpc_session_get_seqno(idl->session);
         if (idl->state_seqno != seqno) {
             idl->state_seqno = seqno;
-            json_destroy(idl->request_id);
-            idl->request_id = NULL;
             ovsdb_idl_txn_abort_all(idl);
+            ovsdb_idl_send_schema_request(idl, &idl->server);
+            ovsdb_idl_transition(idl, IDL_S_SERVER_SCHEMA_REQUESTED);
+            idl->monitoring_server = false;
 
-            ovsdb_idl_send_schema_request(idl, &idl->db);
-            idl->state = IDL_S_SCHEMA_REQUESTED;
             if (idl->db.lock_name) {
                 jsonrpc_session_send(
                     idl->session, ovsdb_idl_db_compose_lock_request(&idl->db));
@@ -515,98 +772,7 @@  ovsdb_idl_run(struct ovsdb_idl *idl)
         if (!msg) {
             break;
         }
-
-        if (ovsdb_idl_db_parse_update_rpc(&idl->db, msg)) {
-            /* ovsdb_idl_db_parse_update_rpc() did all the processing. */
-        } else if (msg->type == JSONRPC_REPLY
-                   && idl->request_id
-                   && json_equal(idl->request_id, msg->id)) {
-            json_destroy(idl->request_id);
-            idl->request_id = NULL;
-
-            switch (idl->state) {
-            case IDL_S_SCHEMA_REQUESTED:
-                /* Reply to our "get_schema" request. */
-                idl->db.schema = json_clone(msg->result);
-                ovsdb_idl_send_monitor_request(idl, &idl->db, true);
-                idl->state = IDL_S_MONITOR_COND_REQUESTED;
-                break;
-
-            case IDL_S_MONITOR_REQUESTED:
-            case IDL_S_MONITOR_COND_REQUESTED:
-                /* Reply to our "monitor" or "monitor_cond" request. */
-                if (idl->state == IDL_S_MONITOR_REQUESTED) {
-                    idl->state = IDL_S_MONITORING;
-                    ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result,
-                                                     false);
-                } else { /* IDL_S_MONITOR_COND_REQUESTED. */
-                    idl->state = IDL_S_MONITORING_COND;
-                    ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result,
-                                                     true);
-                }
-
-                /* Schema is not useful after monitor request is accepted
-                 * by the server.  */
-                json_destroy(idl->db.schema);
-                idl->db.schema = NULL;
-                break;
-
-            case IDL_S_MONITORING_COND:
-                /* Conditional monitor clauses were updated. Send out
-                 * the next condition changes, in any, immediately. */
-                ovsdb_idl_send_cond_change(idl);
-                idl->db.cond_seqno++;
-                break;
-
-            case IDL_S_MONITORING:
-            case IDL_S_NO_SCHEMA:
-            default:
-                OVS_NOT_REACHED();
-            }
-        } else if (ovsdb_idl_db_process_lock_replies(&idl->db, msg)) {
-            /* ovsdb_idl_db_process_lock_replies() did all the processing. */
-        } else if (msg->type == JSONRPC_ERROR
-                   && idl->state == IDL_S_MONITOR_COND_REQUESTED
-                   && idl->request_id
-                   && json_equal(idl->request_id, msg->id)) {
-            if (msg->error && msg->error->type == JSON_STRING
-                && !strcmp(json_string(msg->error), "unknown method")) {
-                /* Fall back to using "monitor" method.  */
-                json_destroy(idl->request_id);
-                idl->request_id = NULL;
-                ovsdb_idl_send_monitor_request(idl, &idl->db, false);
-                idl->state = IDL_S_MONITOR_REQUESTED;
-            }
-        } else if (msg->type == JSONRPC_ERROR
-                   && idl->state == IDL_S_MONITORING_COND
-                   && idl->request_id
-                   && json_equal(idl->request_id, msg->id)) {
-            json_destroy(idl->request_id);
-            idl->request_id = NULL;
-            VLOG_ERR("%s: conditional monitor update failed",
-                     jsonrpc_session_get_name(idl->session));
-            idl->state = IDL_S_NO_SCHEMA;
-        } else if (msg->type == JSONRPC_ERROR
-                   && idl->state == IDL_S_SCHEMA_REQUESTED
-                   && idl->request_id
-                   && json_equal(idl->request_id, msg->id)) {
-            json_destroy(idl->request_id);
-            idl->request_id = NULL;
-            VLOG_ERR("%s: requested schema not found",
-                     jsonrpc_session_get_name(idl->session));
-            idl->state = IDL_S_NO_SCHEMA;
-        } else if ((msg->type == JSONRPC_ERROR
-                    || msg->type == JSONRPC_REPLY)
-                   && ovsdb_idl_db_txn_process_reply(&idl->db, msg)) {
-            /* ovsdb_idl_txn_process_reply() did everything needful. */
-        } else {
-            /* This can happen if ovsdb_idl_txn_destroy() is called to destroy
-             * a transaction before we receive the reply, so keep the log level
-             * low. */
-            VLOG_DBG("%s: received unexpected %s message",
-                     jsonrpc_session_get_name(idl->session),
-                     jsonrpc_msg_type_to_string(msg->type));
-        }
+        ovsdb_idl_process_msg(idl, msg);
         jsonrpc_msg_destroy(msg);
     }
     ovsdb_idl_row_destroy_postprocess(&idl->db);
@@ -710,7 +876,7 @@  bool
 ovsdb_idl_is_alive(const struct ovsdb_idl *idl)
 {
     return jsonrpc_session_is_alive(idl->session) &&
-           idl->state != IDL_S_NO_SCHEMA;
+           idl->state != IDL_S_ERROR;
 }
 
 /* Returns the last error reported on a connection by 'idl'.  The return value
@@ -727,7 +893,7 @@  ovsdb_idl_get_last_error(const struct ovsdb_idl *idl)
 
     if (err) {
         return err;
-    } else if (idl->state == IDL_S_NO_SCHEMA) {
+    } else if (idl->state == IDL_S_ERROR) {
         return ENOENT;
     } else {
         return 0;
@@ -1529,6 +1695,67 @@  ovsdb_idl_send_schema_request(struct ovsdb_idl *idl,
                                                        db->class_->database)),
                                NULL));
 }
+
+static void
+ovsdb_idl_send_db_change_aware(struct ovsdb_idl *idl)
+{
+    struct jsonrpc_msg *msg = jsonrpc_create_request(
+        "set_db_change_aware", json_array_create_1(json_boolean_create(true)),
+        NULL);
+    jsonrpc_session_send(idl->session, msg);
+}
+
+static bool
+ovsdb_idl_check_server_db(struct ovsdb_idl *idl)
+{
+    const struct serverrec_database *database;
+    SERVERREC_DATABASE_FOR_EACH (database, idl) {
+        if (uuid_is_zero(&idl->cid)
+            ? !strcmp(database->name, idl->db.class_->database)
+            : database->n_cid && uuid_equals(database->cid, &idl->cid)) {
+            break;
+        }
+    }
+
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+    const char *server_name = jsonrpc_session_get_name(idl->session);
+    if (database) {
+        if (!strcmp(database->model, "clustered")
+            && jsonrpc_session_get_n_remotes(idl->session) > 1) {
+            bool ok = false;
+            if (!database->schema) {
+                VLOG_INFO("%s: clustered database server has not yet joined "
+                          "cluster; trying another server", server_name);
+            } else if (!database->connected) {
+                VLOG_INFO("%s: clustered database server is disconnected "
+                          "from cluster; trying another server", server_name);
+            } else if (idl->leader_only && !database->leader) {
+                VLOG_INFO("%s: clustered database server is not cluster "
+                          "leader; trying another server", server_name);
+            } else {
+                ok = true;
+            }
+
+            if (!ok) {
+                ovsdb_idl_force_reconnect(idl);
+                ovsdb_idl_transition(idl, IDL_S_RETRY);
+                return false;
+            }
+        }
+
+        json_destroy(idl->db.schema);
+        idl->db.schema = json_from_string(database->schema);
+        ovsdb_idl_send_monitor_request(idl, &idl->db, true);
+        ovsdb_idl_transition(idl, IDL_S_DATA_MONITOR_COND_REQUESTED);
+        return true;
+    } else {
+        VLOG_INFO_RL(&rl, "%s: server does not have %s database",
+                     server_name, idl->db.class_->database);
+        ovsdb_idl_transition(idl, IDL_S_ERROR);
+        return false;
+    }
+}
+
 static void
 log_error(struct ovsdb_error *error)
 {
@@ -2831,7 +3058,14 @@  static struct ovsdb_idl_table *
 ovsdb_idl_table_from_class(const struct ovsdb_idl *idl,
                            const struct ovsdb_idl_table_class *table_class)
 {
-    return ovsdb_idl_db_table_from_class(&idl->db, table_class);
+    struct ovsdb_idl_table *table;
+
+    table = ovsdb_idl_db_table_from_class(&idl->db, table_class);
+    if (!table) {
+         table = ovsdb_idl_db_table_from_class(&idl->server, table_class);
+    }
+
+    return table;
 }
 
 /* Called by ovsdb-idlc generated code. */
diff --git a/lib/ovsdb-server-idl.ann b/lib/ovsdb-server-idl.ann
new file mode 100644
index 000000000000..ffb945b9134c
--- /dev/null
+++ b/lib/ovsdb-server-idl.ann
@@ -0,0 +1,9 @@ 
+# -*- python -*-
+
+# This code, when invoked by "ovsdb-idlc annotate" (by the build
+# process), annotates vswitch.ovsschema with additional data that give
+# the ovsdb-idl engine information about the types involved, so that
+# it can generate more programmer-friendly data structures.
+
+s["idlPrefix"] = "serverrec_"
+s["idlHeader"] = "\"lib/ovsdb-server-idl.h\""
diff --git a/lib/ovsdb-session.c b/lib/ovsdb-session.c
new file mode 100644
index 000000000000..67e7c62b2848
--- /dev/null
+++ b/lib/ovsdb-session.c
@@ -0,0 +1,72 @@ 
+/* Copyright (c) 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "ovsdb-session.h"
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include "svec.h"
+#include "util.h"
+#include "uuid.h"
+
+static const char *
+next_remote(const char *s)
+{
+    for (const char *delimiter = strchr(s, ','); delimiter;
+         delimiter = strchr(delimiter + 1, ',')) {
+        const char *p = delimiter + 1;
+        p += strspn(p, " \t");
+        size_t n_letters = strspn(p, "abcdefghijklmnopqrstuvwxyz");
+        if (n_letters && p[n_letters] == ':') {
+            return delimiter;
+        }
+    }
+    return NULL;
+}
+
+void
+ovsdb_session_parse_remote(const char *s,
+                           struct svec *remotes, struct uuid *cid)
+{
+    *cid = UUID_ZERO;
+    for (;;) {
+        /* Skip white space. */
+        s += strspn(s, " \t");
+        if (*s == '\0') {
+            break;
+        }
+
+        /* Find the start of the next remote  */
+        const char *delimiter = next_remote(s);
+        if (!delimiter) {
+            svec_add(remotes, s);
+            break;
+        }
+        svec_add_nocopy(remotes, xmemdup0(s, delimiter - s));
+        s = delimiter + 1;
+    }
+
+    size_t i;
+    for (i = 0; i < remotes->n; i++) {
+        const char *name = remotes->names[i];
+        struct uuid uuid;
+        if (!strncmp(name, "cid:", 4) && uuid_from_string(&uuid, name + 4)) {
+            *cid = uuid;
+            svec_del(remotes, name);
+            break;
+        }
+    }
+}
diff --git a/lib/ovsdb-session.h b/lib/ovsdb-session.h
new file mode 100644
index 000000000000..88835cd3dd85
--- /dev/null
+++ b/lib/ovsdb-session.h
@@ -0,0 +1,25 @@ 
+/* Copyright (c) 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef OVSDB_SESSION_H
+#define OVSDB_SESSION_H 1
+
+struct svec;
+struct uuid;
+
+void ovsdb_session_parse_remote(const char *s,
+                                struct svec *remotes, struct uuid *cid);
+
+#endif /* ovsdb-session.h */
diff --git a/lib/uuid.h b/lib/uuid.h
index dba6e1c11ea1..8c24746b5263 100644
--- a/lib/uuid.h
+++ b/lib/uuid.h
@@ -57,6 +57,18 @@  uuid_equals(const struct uuid *a, const struct uuid *b)
             && a->parts[3] == b->parts[3]);
 }
 
+/* Returns the first 'n' hex digits of 'uuid', for 0 < 'n' <= 8.
+ *
+ * This is useful for displaying a few leading digits of the uuid, e.g. to
+ * display 4 digits:
+ *     printf("%04x", uuid_prefix(uuid, 4));
+ */
+static inline unsigned int
+uuid_prefix(const struct uuid *uuid, int digits)
+{
+    return (uuid->parts[0] >> (32 - 4 * digits));
+}
+
 void uuid_init(void);
 void uuid_generate(struct uuid *);
 struct uuid uuid_random(void);
diff --git a/ovn/controller/ovn-controller.c b/ovn/controller/ovn-controller.c
index a935a791c0cd..1749a62a1a92 100644
--- a/ovn/controller/ovn-controller.c
+++ b/ovn/controller/ovn-controller.c
@@ -615,6 +615,7 @@  main(int argc, char *argv[])
     char *ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl);
     struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
         ovsdb_idl_create(ovnsb_remote, &sbrec_idl_class, true, true));
+    ovsdb_idl_set_leader_only(ovnsb_idl_loop.idl, false);
 
     create_ovnsb_indexes(ovnsb_idl_loop.idl);
     lport_init(ovnsb_idl_loop.idl);
diff --git a/ovsdb/TODO.rst b/ovsdb/TODO.rst
new file mode 100644
index 000000000000..ecb4b8c7838c
--- /dev/null
+++ b/ovsdb/TODO.rst
@@ -0,0 +1,64 @@ 
+..
+      Licensed under the Apache License, Version 2.0 (the "License"); you may
+      not use this file except in compliance with the License. You may obtain
+      a copy of the License at
+
+          http://www.apache.org/licenses/LICENSE-2.0
+
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+      WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+      License for the specific language governing permissions and limitations
+      under the License.
+
+      Convention for heading levels in Open vSwitch documentation:
+
+      =======  Heading 0 (reserved for the title in a document)
+      -------  Heading 1
+      ~~~~~~~  Heading 2
+      +++++++  Heading 3
+      '''''''  Heading 4
+
+      Avoid deeper levels because they do not render well.
+
+===========================
+OVSDB Clustering To-do List
+===========================
+
+* Minimal requirements for features and bug fixes:
+
+  * Figure out how ephemeral columns should be handled.
+
+  * Anti-rollback: client won't accept an older version.
+
+  * Causality: server receives data update and sends monitor update before it
+    responds to an RPC.
+
+  * Security for "convert".
+
+  * ovsdb-client handling of clusters
+
+  * Locks.
+
+  * ovsdb-idl should listen for monitor_canceled and restart monitoring,
+  or at least transition to S_ERROR, reconnect, etc.
+
+  * Investigate 100% CPU for long-running triggers
+
+  * Tons of unit tests.
+
+* Documentation:
+
+  * Upgrading OVN to a clustered database
+
+  * Installing OVN with a clustered database
+
+  * Overall diagram explaining the cluster and ovsdb protocol pieces
+
+  * Move OVSDB protocol stuff to ovsdb-server.7?
+
+* Future work:
+
+  * File format with diff support. 
+
+  * Future work: DNS or directory support
diff --git a/ovsdb/_server.ovsschema b/ovsdb/_server.ovsschema
index 8997bae5fa36..6fcd20cb02bd 100644
--- a/ovsdb/_server.ovsschema
+++ b/ovsdb/_server.ovsschema
@@ -1,9 +1,19 @@ 
 {"name": "_Server",
- "version": "1.0.0",
- "cksum": "3931859656 185",
+ "version": "1.1.0",
+ "cksum": "95782815 605",
  "tables": {
    "Database": {
      "columns": {
        "name": {"type": "string"},
-       "schema": {"type": "string"}},
+       "model": {
+         "type": {"key": {"type": "string",
+	                  "enum": ["set", ["standalone", "clustered"]]}}},
+       "connected": {"type": "boolean"},
+       "leader": {"type": "boolean"},
+       "schema": {
+         "type": {"key": {"type": "string"}, "min": 0, "max": 1}},
+       "cid": {
+         "type": {"key": {"type": "uuid"}, "min": 0, "max": 1}},
+       "sid": {
+         "type": {"key": {"type": "uuid"}, "min": 0, "max": 1}}},
      "isRoot": true}}}
diff --git a/ovsdb/_server.xml b/ovsdb/_server.xml
index 8ef782fb97b2..f1f8916ca563 100644
--- a/ovsdb/_server.xml
+++ b/ovsdb/_server.xml
@@ -58,8 +58,48 @@ 
       The database's name, as specified in its schema.
     </column>
 
+    <column name="model">
+      The storage model: <code>standalone</code> for a standalone or
+      active-backup database, <code>clustered</code> for a clustered database.
+    </column>
+
     <column name="schema">
-      The database schema, as a JSON string.
+      The database schema, as a JSON string.  Until a clustered database
+      finishes joining its cluster, this is empty.
     </column>
+
+    <group title="Clustered Databases">
+      <p>
+        These columns are most interesting and in some cases only relevant for
+        clustered databases, that is, those where the <ref column="model"/>
+        column is <code>clustered</code>.
+      </p>
+
+      <column name="connected">
+        True if the database is connected to its storage.  A standalone or
+        active-backup database is always connected.  A clustered database is
+        connected if the server is in contact with a majority of its cluster.
+        An unconnected database cannot be modified and its data might be
+        unavailable or stale.
+      </column>
+
+      <column name="leader">
+        True if the database is the leader in its cluster.  For a standalone or
+        active-backup database, this is always true.
+      </column>
+
+      <column name="cid">
+        The cluster ID for this database, which is the same for all of the
+        servers that host this particular clustered database.  For a standalone
+        or active-backup database, this is empty.
+      </column>
+
+      <column name="sid">
+        The server ID for this database, different for each server that hosts a
+        particular clustered database.  A server that hosts more than one
+        clustered database will have a different <code>sid</code> in each one.
+        For a standalone or active-backup database, this is empty.
+      </column>
+    </group>
   </table>
 </database>
diff --git a/ovsdb/automake.mk b/ovsdb/automake.mk
index c6490288bc4b..6a414273e912 100644
--- a/ovsdb/automake.mk
+++ b/ovsdb/automake.mk
@@ -29,6 +29,12 @@  ovsdb_libovsdb_la_SOURCES = \
 	ovsdb/monitor.h \
 	ovsdb/query.c \
 	ovsdb/query.h \
+	ovsdb/raft.c \
+	ovsdb/raft.h \
+	ovsdb/raft-private.c \
+	ovsdb/raft-private.h \
+	ovsdb/raft-rpc.c \
+	ovsdb/raft-rpc.h \
 	ovsdb/rbac.c \
 	ovsdb/rbac.h \
 	ovsdb/replication.c \
@@ -37,6 +43,8 @@  ovsdb_libovsdb_la_SOURCES = \
 	ovsdb/row.h \
 	ovsdb/server.c \
 	ovsdb/server.h \
+	ovsdb/storage.c \
+	ovsdb/storage.h \
 	ovsdb/table.c \
 	ovsdb/table.h \
 	ovsdb/trigger.c \
@@ -140,3 +148,5 @@  ovsdb/ovsdb-server.5: \
 		$(srcdir)/ovsdb/_server.ovsschema \
 		$(srcdir)/ovsdb/_server.xml > $@.tmp && \
 	mv $@.tmp $@
+
+EXTRA_DIST += ovsdb/TODO.rst
diff --git a/ovsdb/execution.c b/ovsdb/execution.c
index 806d65690fc3..3c79919e24f3 100644
--- a/ovsdb/execution.c
+++ b/ovsdb/execution.c
@@ -1,4 +1,4 @@ 
-/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@ 
 
 #include <config.h>
 
+#include "ovsdb.h"
+
 #include <limits.h>
 
 #include "column.h"
@@ -25,7 +27,6 @@ 
 #include "ovsdb-data.h"
 #include "ovsdb-error.h"
 #include "ovsdb-parser.h"
-#include "ovsdb.h"
 #include "query.h"
 #include "rbac.h"
 #include "row.h"
@@ -97,11 +98,20 @@  lookup_executor(const char *name, bool *read_only)
     return NULL;
 }
 
-struct json *
-ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session,
-              const struct json *params, bool read_only,
-              const char *role, const char *id,
-              long long int elapsed_msec, long long int *timeout_msec)
+/* On success, returns a transaction and stores the results to return to the
+ * client in '*resultsp'.
+ *
+ * On failure, returns NULL.  If '*resultsp' is nonnull, then it is the results
+ * to return to the client.  If '*resultsp' is null, then the execution failed
+ * due to an unsatisfied "wait" operation and '*timeout_msec' is the time at
+ * which the transaction will time out.  (If 'timeout_msec' is null, this case
+ * never occurs--instead, an unsatisfied "wait" unconditionally fails.) */
+struct ovsdb_txn *
+ovsdb_execute_compose(struct ovsdb *db, const struct ovsdb_session *session,
+                      const struct json *params, bool read_only,
+                      const char *role, const char *id,
+                      long long int elapsed_msec, long long int *timeout_msec,
+                      bool *durable, struct json **resultsp)
 {
     struct ovsdb_execution x;
     struct ovsdb_error *error;
@@ -109,6 +119,7 @@  ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session,
     size_t n_operations;
     size_t i;
 
+    *durable = false;
     if (params->type != JSON_ARRAY
         || !params->u.array.n
         || params->u.array.elems[0]->type != JSON_STRING
@@ -120,7 +131,8 @@  ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session,
                                        "as first parameter");
         }
 
-        return ovsdb_error_to_json_free(error);
+        *resultsp = ovsdb_error_to_json_free(error);
+        return NULL;
     }
 
     x.db = db;
@@ -188,43 +200,56 @@  ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session,
         }
         if (error) {
             json_destroy(result);
-            result = ovsdb_error_to_json(error);
-        }
-        if (error && !strcmp(ovsdb_error_get_tag(error), "not supported")
-            && timeout_msec) {
-            ovsdb_txn_abort(x.txn);
-            *timeout_msec = x.timeout_msec;
-
-            json_destroy(result);
-            json_destroy(results);
-            results = NULL;
-            goto exit;
-        }
-
-        /* Add result to array. */
-        json_array_add(results, result);
-        if (error) {
-            break;
-        }
-    }
-
-    if (!error) {
-        error = ovsdb_txn_commit(x.txn, x.durable);
-        if (error) {
             json_array_add(results, ovsdb_error_to_json(error));
+            if (!strcmp(ovsdb_error_get_tag(error), "not supported")
+                && timeout_msec) {
+                *timeout_msec = x.timeout_msec;
+                json_destroy(results);
+                results = NULL;
+                goto exit;
+            }
+            break;
         }
-    } else {
-        ovsdb_txn_abort(x.txn);
+        json_array_add(results, result);
     }
-
     while (json_array(results)->n < n_operations) {
         json_array_add(results, json_null_create());
     }
 
 exit:
-    ovsdb_error_destroy(error);
+    if (error) {
+        ovsdb_txn_abort(x.txn);
+        x.txn = NULL;
+
+        ovsdb_error_destroy(error);
+    }
+    *resultsp = results;
+    *durable = x.durable;
     ovsdb_symbol_table_destroy(x.symtab);
 
+    return x.txn;
+}
+
+struct json *
+ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session,
+              const struct json *params, bool read_only,
+              const char *role, const char *id,
+              long long int elapsed_msec, long long int *timeout_msec)
+{
+    bool durable;
+    struct json *results;
+    struct ovsdb_txn *txn = ovsdb_execute_compose(
+        db, session, params, read_only, role, id, elapsed_msec, timeout_msec,
+        &durable, &results);
+    if (!txn) {
+        return results;
+    }
+
+    struct ovsdb_error *error = ovsdb_txn_propose_commit_block(txn, durable);
+    if (error) {
+        json_array_add(results, ovsdb_error_to_json(error));
+        ovsdb_error_destroy(error);
+    }
     return results;
 }
 
diff --git a/ovsdb/file.c b/ovsdb/file.c
index d33bce83a1ea..04025ba85ea5 100644
--- a/ovsdb/file.c
+++ b/ovsdb/file.c
@@ -30,6 +30,7 @@ 
 #include "ovsdb-error.h"
 #include "row.h"
 #include "socket-util.h"
+#include "storage.h"
 #include "table.h"
 #include "timeval.h"
 #include "transaction.h"
@@ -39,13 +40,6 @@ 
 
 VLOG_DEFINE_THIS_MODULE(ovsdb_file);
 
-/* Minimum number of milliseconds between database compactions. */
-#define COMPACT_MIN_MSEC        (10 * 60 * 1000) /* 10 minutes. */
-
-/* Minimum number of milliseconds between trying to compact the database if
- * compacting fails. */
-#define COMPACT_RETRY_MSEC      (60 * 1000)      /* 1 minute. */
-
 /* A transaction being converted to JSON for writing to a file. */
 struct ovsdb_file_txn {
     struct json *json;          /* JSON for the whole transaction. */
@@ -58,218 +52,6 @@  static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *,
                                    const struct ovsdb_row *old,
                                    const struct ovsdb_row *new,
                                    const unsigned long int *changed);
-static struct ovsdb_error *ovsdb_file_txn_commit(struct json *,
-                                                 const char *comment,
-                                                 bool durable,
-                                                 struct ovsdb_log *);
-
-static struct ovsdb_error *ovsdb_file_open__(const char *file_name,
-                                             const struct ovsdb_schema *,
-                                             bool read_only, struct ovsdb **,
-                                             struct ovsdb_file **);
-static struct ovsdb_error *ovsdb_file_txn_from_json(
-    struct ovsdb *, const struct json *, bool converting, struct ovsdb_txn **);
-static struct ovsdb_error *ovsdb_file_create(struct ovsdb *,
-                                             struct ovsdb_log *,
-                                             const char *file_name,
-                                             unsigned int n_transactions,
-                                             off_t snapshot_size,
-                                             struct ovsdb_file **filep);
-
-/* Opens database 'file_name' and stores a pointer to the new database in
- * '*dbp'.  If 'read_only' is false, then the database will be locked and
- * changes to the database will be written to disk.  If 'read_only' is true,
- * the database will not be locked and changes to the database will persist
- * only as long as the "struct ovsdb".
- *
- * If 'filep' is nonnull and 'read_only' is false, then on success sets
- * '*filep' to an ovsdb_file that represents the open file.  This ovsdb_file
- * persists until '*dbp' is destroyed.
- *
- * On success, returns NULL.  On failure, returns an ovsdb_error (which the
- * caller must destroy) and sets '*dbp' and '*filep' to NULL. */
-struct ovsdb_error *
-ovsdb_file_open(const char *file_name, bool read_only,
-                struct ovsdb **dbp, struct ovsdb_file **filep)
-{
-    return ovsdb_file_open__(file_name, NULL, read_only, dbp, filep);
-}
-
-/* Opens database 'file_name' with an alternate schema.  The specified 'schema'
- * is used to interpret the data in 'file_name', ignoring the schema actually
- * stored in the file.  Data in the file for tables or columns that do not
- * exist in 'schema' are ignored, but the ovsdb file format must otherwise be
- * observed, including column constraints.
- *
- * This function can be useful for upgrading or downgrading databases to
- * "almost-compatible" formats.
- *
- * The database will not be locked.  Changes to the database will persist only
- * as long as the "struct ovsdb".
- *
- * On success, stores a pointer to the new database in '*dbp' and returns a
- * null pointer.  On failure, returns an ovsdb_error (which the caller must
- * destroy) and sets '*dbp' to NULL. */
-struct ovsdb_error *
-ovsdb_file_open_as_schema(const char *file_name,
-                          const struct ovsdb_schema *schema,
-                          struct ovsdb **dbp)
-{
-    return ovsdb_file_open__(file_name, schema, true, dbp, NULL);
-}
-
-static struct ovsdb_error *
-ovsdb_file_open_log(const char *file_name, enum ovsdb_log_open_mode open_mode,
-                    struct ovsdb_log **logp, struct ovsdb_schema **schemap)
-{
-    struct ovsdb_schema *schema = NULL;
-    struct ovsdb_log *log = NULL;
-    struct ovsdb_error *error;
-    struct json *json = NULL;
-
-    ovs_assert(logp || schemap);
-
-    error = ovsdb_log_open(file_name, OVSDB_MAGIC, open_mode, -1, &log);
-    if (error) {
-        goto error;
-    }
-
-    error = ovsdb_log_read(log, &json);
-    if (error) {
-        goto error;
-    } else if (!json) {
-        error = ovsdb_io_error(EOF, "%s: database file contains no schema",
-                               file_name);
-        goto error;
-    }
-
-    if (schemap) {
-        error = ovsdb_schema_from_json(json, &schema);
-        if (error) {
-            error = ovsdb_wrap_error(error,
-                                     "failed to parse \"%s\" as ovsdb schema",
-                                     file_name);
-            goto error;
-        }
-    }
-    json_destroy(json);
-
-    if (logp) {
-        *logp = log;
-    } else {
-        ovsdb_log_close(log);
-    }
-    if (schemap) {
-        *schemap = schema;
-    }
-    return NULL;
-
-error:
-    ovsdb_log_close(log);
-    json_destroy(json);
-    if (logp) {
-        *logp = NULL;
-    }
-    if (schemap) {
-        *schemap = NULL;
-    }
-    return error;
-}
-
-static struct ovsdb_error *
-ovsdb_file_open__(const char *file_name,
-                  const struct ovsdb_schema *alternate_schema,
-                  bool read_only, struct ovsdb **dbp,
-                  struct ovsdb_file **filep)
-{
-    enum ovsdb_log_open_mode open_mode;
-    struct ovsdb_schema *schema = NULL;
-    struct ovsdb_error *error;
-    struct ovsdb_log *log;
-    struct json *json;
-    struct ovsdb *db = NULL;
-
-    /* In read-only mode there is no ovsdb_file so 'filep' must be null. */
-    ovs_assert(!(read_only && filep));
-
-    open_mode = read_only ? OVSDB_LOG_READ_ONLY : OVSDB_LOG_READ_WRITE;
-    error = ovsdb_file_open_log(file_name, open_mode, &log,
-                                alternate_schema ? NULL : &schema);
-    if (error) {
-        goto error;
-    }
-
-    db = ovsdb_create(schema ? schema : ovsdb_schema_clone(alternate_schema));
-
-    /* When a log gets big, we compact it into a new log that initially has
-     * only a single transaction that represents the entire state of the
-     * database.  Thus, we consider the first transaction in the database to be
-     * the snapshot.  We measure its size to later influence the minimum log
-     * size before compacting again.
-     *
-     * The schema precedes the snapshot in the log; we could compensate for its
-     * size, but it's just not that important. */
-    off_t snapshot_size = 0;
-    unsigned int n_transactions = 0;
-    while ((error = ovsdb_log_read(log, &json)) == NULL && json) {
-        struct ovsdb_txn *txn;
-
-        error = ovsdb_file_txn_from_json(db, json, alternate_schema != NULL,
-                                         &txn);
-        json_destroy(json);
-        if (error) {
-            ovsdb_log_unread(log);
-            break;
-        }
-
-        n_transactions++;
-        error = ovsdb_txn_commit(txn, false);
-        if (error) {
-            ovsdb_log_unread(log);
-            break;
-        }
-
-        if (n_transactions == 1) {
-            snapshot_size = ovsdb_log_get_offset(log);
-        }
-    }
-    if (error) {
-        /* Log error but otherwise ignore it.  Probably the database just got
-         * truncated due to power failure etc. and we should use its current
-         * contents. */
-        char *msg = ovsdb_error_to_string_free(error);
-        VLOG_ERR("%s", msg);
-        free(msg);
-    }
-
-    if (!read_only) {
-        struct ovsdb_file *file;
-
-        error = ovsdb_file_create(db, log, file_name, n_transactions,
-                                  snapshot_size, &file);
-        if (error) {
-            goto error;
-        }
-        if (filep) {
-            *filep = file;
-        }
-        db->file = file;
-    } else {
-        ovsdb_log_close(log);
-    }
-
-    *dbp = db;
-    return NULL;
-
-error:
-    *dbp = NULL;
-    if (filep) {
-        *filep = NULL;
-    }
-    ovsdb_destroy(db);
-    ovsdb_log_close(log);
-    return error;
-}
 
 static struct ovsdb_error *
 ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting,
@@ -380,7 +162,7 @@  ovsdb_file_txn_table_from_json(struct ovsdb_txn *txn,
  * If 'converting' is true, then unknown table and column names are ignored
  * (which can ease upgrading and downgrading schemas); otherwise, they are
  * treated as errors. */
-static struct ovsdb_error *
+struct ovsdb_error *
 ovsdb_file_txn_from_json(struct ovsdb *db, const struct json *json,
                          bool converting, struct ovsdb_txn **txnp)
 {
@@ -428,126 +210,92 @@  error:
     return error;
 }
 
-static struct ovsdb_error *
-ovsdb_file_save_copy__(const char *file_name, int locking,
-                       const char *comment, const struct ovsdb *db,
-                       struct ovsdb_log **logp)
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_convert_table(struct ovsdb_txn *txn,
+                    const struct ovsdb_table *src_table,
+                    struct ovsdb_table *dst_table)
 {
-    const struct shash_node *node;
-    struct ovsdb_file_txn ftxn;
-    struct ovsdb_error *error;
-    struct ovsdb_log *log;
-    struct json *json;
-
-    error = ovsdb_log_open(file_name, OVSDB_MAGIC,
-                           OVSDB_LOG_CREATE, locking, &log);
-    if (error) {
-        return error;
-    }
+    const struct ovsdb_row *src_row;
+    HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) {
+        struct ovsdb_row *dst_row = ovsdb_row_create(dst_table);
+        *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row);
 
-    /* Write schema. */
-    json = ovsdb_schema_to_json(db->schema);
-    error = ovsdb_log_write(log, json);
-    json_destroy(json);
-    if (error) {
-        goto exit;
-    }
+        struct shash_node *node;
+        SHASH_FOR_EACH (node, &src_table->schema->columns) {
+            const struct ovsdb_column *src_column = node->data;
+            if (src_column->index == OVSDB_COL_UUID ||
+                src_column->index == OVSDB_COL_VERSION) {
+                continue;
+            }
 
-    /* Write data. */
-    ovsdb_file_txn_init(&ftxn);
-    SHASH_FOR_EACH (node, &db->tables) {
-        const struct ovsdb_table *table = node->data;
-        const struct ovsdb_row *row;
+            const struct ovsdb_column *dst_column
+                = shash_find_data(&dst_table->schema->columns,
+                                  src_column->name);
+            if (!dst_column) {
+                continue;
+            }
 
-        HMAP_FOR_EACH (row, hmap_node, &table->rows) {
-            ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL);
+            struct ovsdb_error *error = ovsdb_datum_convert(
+                &dst_row->fields[dst_column->index], &dst_column->type,
+                &src_row->fields[src_column->index], &src_column->type);
+            if (error) {
+                ovsdb_row_destroy(dst_row);
+                return error;
+            }
         }
-    }
-    error = ovsdb_file_txn_commit(ftxn.json, comment, true, log);
 
-exit:
-    if (logp) {
-        if (!error) {
-            *logp = log;
-            log = NULL;
-        } else {
-            *logp = NULL;
-        }
-    }
-    ovsdb_log_close(log);
-    if (error) {
-        remove(file_name);
+        ovsdb_txn_row_insert(txn, dst_row);
     }
-    return error;
+    return NULL;
 }
 
-/* Saves a snapshot of 'db''s current contents as 'file_name'.  If 'comment' is
- * nonnull, then it is added along with the data contents and can be viewed
- * with "ovsdb-tool show-log".
- *
- * 'locking' is passed along to ovsdb_log_open() untouched. */
-struct ovsdb_error *
-ovsdb_file_save_copy(const char *file_name, int locking,
-                     const char *comment, const struct ovsdb *db)
+/* Copies the data in 'src', converts it into the schema specified in
+ * 'new_schema', puts it into a newly created, unbacked database, and
+ * stores a pointer to the new database in '*dstp'.  Returns NULL if
+ * successful, otherwise an error; on error, stores NULL in '*dstp'. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema,
+              struct ovsdb **dstp)
 {
-    return ovsdb_file_save_copy__(file_name, locking, comment, db, NULL);
-}
+    struct ovsdb *dst = ovsdb_create(ovsdb_schema_clone(new_schema),
+                                     ovsdb_storage_create_unbacked());
+    struct ovsdb_txn *txn = ovsdb_txn_create(dst);
+    struct ovsdb_error *error = NULL;
 
-/* Opens database 'file_name', reads its schema, and closes it.  On success,
- * stores the schema into '*schemap' and returns NULL; the caller then owns the
- * schema.  On failure, returns an ovsdb_error (which the caller must destroy)
- * and sets '*dbp' to NULL. */
-struct ovsdb_error *
-ovsdb_file_read_schema(const char *file_name, struct ovsdb_schema **schemap)
-{
-    ovs_assert(schemap != NULL);
-    return ovsdb_file_open_log(file_name, OVSDB_LOG_READ_ONLY, NULL, schemap);
-}
-
-struct ovsdb_file {
-    struct ovsdb *db;
-    struct ovsdb_log *log;
-    char *file_name;
-    long long int last_compact;
-    long long int next_compact;
-    unsigned int n_transactions;
-    off_t snapshot_size;
-};
+    struct shash_node *node;
+    SHASH_FOR_EACH (node, &src->tables) {
+        const char *table_name = node->name;
+        struct ovsdb_table *src_table = node->data;
+        struct ovsdb_table *dst_table = shash_find_data(&dst->tables,
+                                                        table_name);
+        if (!dst_table) {
+            continue;
+        }
 
-static struct ovsdb_error *
-ovsdb_file_create(struct ovsdb *db, struct ovsdb_log *log,
-                  const char *file_name,
-                  unsigned int n_transactions, off_t snapshot_size,
-                  struct ovsdb_file **filep)
-{
-    struct ovsdb_file *file;
-    char *deref_name;
-    char *abs_name;
-
-    /* Use the absolute name of the file because ovsdb-server opens its
-     * database before daemonize() chdirs to "/". */
-    deref_name = follow_symlinks(file_name);
-    abs_name = abs_file_name(NULL, deref_name);
-    free(deref_name);
-    if (!abs_name) {
-        *filep = NULL;
-        return ovsdb_io_error(0, "could not determine current "
-                              "working directory");
+        error = ovsdb_convert_table(txn, src_table, dst_table);
+        if (error) {
+            goto error;
+        }
     }
 
-    file = xmalloc(sizeof *file);
-    file->db = db;
-    file->log = log;
-    file->file_name = abs_name;
-    file->last_compact = time_msec();
-    file->next_compact = file->last_compact + COMPACT_MIN_MSEC;
-    file->snapshot_size = snapshot_size;
-    file->n_transactions = n_transactions;
+    error = ovsdb_txn_replay_commit(txn);
+    if (error) {
+        txn = NULL;            /* ovsdb_txn_replay_commit() already aborted. */
+        goto error;
+    }
 
-    *filep = file;
+    *dstp = dst;
     return NULL;
-}
 
+error:
+    ovsdb_destroy(dst);
+    if (txn) {
+        ovsdb_txn_abort(txn);
+    }
+    *dstp = NULL;
+    return error;
+}
+
 static bool
 ovsdb_file_change_cb(const struct ovsdb_row *old,
                      const struct ovsdb_row *new,
@@ -559,10 +307,30 @@  ovsdb_file_change_cb(const struct ovsdb_row *old,
     return true;
 }
 
+struct json *
+ovsdb_to_txn_json(const struct ovsdb *db, const char *comment)
+{
+    struct ovsdb_file_txn ftxn;
+
+    ovsdb_file_txn_init(&ftxn);
+
+    struct shash_node *node;
+    SHASH_FOR_EACH (node, &db->tables) {
+        const struct ovsdb_table *table = node->data;
+        const struct ovsdb_row *row;
+
+        HMAP_FOR_EACH (row, hmap_node, &table->rows) {
+            ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL);
+        }
+    }
+
+    return ovsdb_file_txn_annotate(ftxn.json, comment);
+}
+
 /* Returns 'txn' transformed into the JSON format that is used in OVSDB files.
  * (But the caller must use ovsdb_file_txn_annotate() to add the _comment and
  * _date members.)  If 'txn' doesn't actually change anything, returns NULL. */
-static struct json *
+struct json *
 ovsdb_file_txn_to_json(const struct ovsdb_txn *txn)
 {
     struct ovsdb_file_txn ftxn;
@@ -584,196 +352,6 @@  ovsdb_file_txn_annotate(struct json *json, const char *comment)
     json_object_put(json, "_date", json_integer_create(time_wall_msec()));
     return json;
 }
-
-struct ovsdb_error *
-ovsdb_file_commit(struct ovsdb_file *file,
-                  const struct ovsdb_txn *txn, bool durable)
-{
-    struct json *txn_json = ovsdb_file_txn_to_json(txn);
-    if (!txn_json) {
-        /* Nothing to commit. */
-        return NULL;
-    }
-
-    struct ovsdb_error *error = ovsdb_file_txn_commit(
-        txn_json, ovsdb_txn_get_comment(txn), durable, file->log);
-    if (error) {
-        return error;
-    }
-    file->n_transactions++;
-
-    /* If it has been at least COMPACT_MIN_MSEC ms since the last time we
-     * compacted (or at least COMPACT_RETRY_MSEC ms since the last time we
-     * tried), and if there are at least 100 transactions in the database, and
-     * if the database is at least 10 MB, and the database is at least 4x the
-     * size of the previous snapshot, then compact the database. */
-    off_t log_size = ovsdb_log_get_offset(file->log);
-    if (time_msec() >= file->next_compact
-        && file->n_transactions >= 100
-        && log_size >= 10 * 1024 * 1024
-        && log_size / 4 >= file->snapshot_size)
-    {
-        error = ovsdb_file_compact(file);
-        if (error) {
-            char *s = ovsdb_error_to_string_free(error);
-            VLOG_WARN("%s: compacting database failed (%s), retrying in "
-                      "%d seconds",
-                      file->file_name, s, COMPACT_RETRY_MSEC / 1000);
-            free(s);
-
-            file->next_compact = time_msec() + COMPACT_RETRY_MSEC;
-        }
-    }
-
-    return NULL;
-}
-
-/* Rename 'old' to 'new', replacing 'new' if it exists.  Returns NULL if
- * successful, otherwise an ovsdb_error that the caller must destroy. */
-static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
-ovsdb_rename(const char *old, const char *new)
-{
-#ifdef _WIN32
-    int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING
-                            | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED)
-                 ? 0 : EACCES);
-#else
-    int error = rename(old, new) ? errno : 0;
-#endif
-
-    return (error
-            ? ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"",
-                             old, new)
-            : NULL);
-}
-
-struct ovsdb_error *
-ovsdb_file_compact(struct ovsdb_file *file)
-{
-    struct ovsdb_log *new_log = NULL;
-    struct lockfile *tmp_lock = NULL;
-    struct ovsdb_error *error;
-    char *tmp_name = NULL;
-    char *comment = NULL;
-    int retval;
-
-    comment = xasprintf("compacting database online "
-                        "(%.3f seconds old, %u transactions, %llu bytes)",
-                        (time_wall_msec() - file->last_compact) / 1000.0,
-                        file->n_transactions,
-                        (unsigned long long) ovsdb_log_get_offset(file->log));
-    VLOG_INFO("%s: %s", file->file_name, comment);
-
-    /* Commit the old version, so that we can be assured that we'll eventually
-     * have either the old or the new version. */
-    error = ovsdb_log_commit(file->log);
-    if (error) {
-        goto exit;
-    }
-
-    /* Lock temporary file. */
-    tmp_name = xasprintf("%s.tmp", file->file_name);
-    retval = lockfile_lock(tmp_name, &tmp_lock);
-    if (retval) {
-        error = ovsdb_io_error(retval, "could not get lock on %s", tmp_name);
-        goto exit;
-    }
-
-    /* Remove temporary file.  (It might not exist.) */
-    if (unlink(tmp_name) < 0 && errno != ENOENT) {
-        error = ovsdb_io_error(errno, "failed to remove %s", tmp_name);
-        goto exit;
-    }
-
-    /* Save a copy. */
-    error = ovsdb_file_save_copy__(tmp_name, false, comment, file->db,
-                                   &new_log);
-    if (error) {
-        goto exit;
-    }
-
-    /* Replace original file by the temporary file.
-     *
-     * We support two strategies:
-     *
-     *     - The preferred strategy is to rename the temporary file over the
-     *       original one in-place, then close the original one.  This works on
-     *       Unix-like systems.  It does not work on Windows, which does not
-     *       allow open files to be renamed.  The approach has the advantage
-     *       that, at any point, we can drop back to something that already
-     *       works.
-     *
-     *     - Alternatively, we can close both files, rename, then open the new
-     *       file (which now has the original name).  This works on all
-     *       systems, but if reopening the file fails then we're stuck and have
-     *       to abort (XXX although it would be better to retry).
-     *
-     * We make the strategy a variable instead of an #ifdef to make it easier
-     * to test both strategies on Unix-like systems, and to make the code
-     * easier to read. */
-#ifdef _WIN32
-    bool rename_open_files = false;
-#else
-    bool rename_open_files = true;
-#endif
-    if (!rename_open_files) {
-        ovsdb_log_close(file->log);
-        ovsdb_log_close(new_log);
-        file->log = NULL;
-        new_log = NULL;
-    }
-    error = ovsdb_rename(tmp_name, file->file_name);
-    if (error) {
-        goto exit;
-    }
-    if (rename_open_files) {
-        fsync_parent_dir(file->file_name);
-        ovsdb_log_close(file->log);
-        file->log = new_log;
-    } else {
-        /* Re-open the log.  This skips past the schema log record. */
-        error = ovsdb_file_open_log(file->file_name, OVSDB_LOG_READ_WRITE,
-                                    &file->log, NULL);
-        if (error) {
-            ovs_fatal(0, "could not reopen database");
-        }
-
-        /* Skip past the data log reecord. */
-        struct json *json;
-        error = ovsdb_log_read(file->log, &json);
-        if (error) {
-            ovs_fatal(0, "error reading database");
-        }
-        json_destroy(json);
-    }
-
-    /* Success! */
-    file->last_compact = time_msec();
-    file->next_compact = file->last_compact + COMPACT_MIN_MSEC;
-    file->n_transactions = 1;
-
-exit:
-    if (error) {
-        ovsdb_log_close(new_log);
-        if (tmp_lock) {
-            unlink(tmp_name);
-        }
-    }
-
-    lockfile_unlock(tmp_lock);
-    free(tmp_name);
-    free(comment);
-
-    return error;
-}
-
-void
-ovsdb_file_destroy(struct ovsdb_file *file)
-{
-    ovsdb_log_close(file->log);
-    free(file->file_name);
-    free(file);
-}
 
 static void
 ovsdb_file_txn_init(struct ovsdb_file_txn *ftxn)
@@ -838,138 +416,3 @@  ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn,
         json_object_put(ftxn->table_json, uuid, row);
     }
 }
-
-static struct ovsdb_error *
-ovsdb_file_txn_commit(struct json *json, const char *comment,
-                      bool durable, struct ovsdb_log *log)
-{
-    struct ovsdb_error *error;
-
-    json = ovsdb_file_txn_annotate(json, comment);
-    error = ovsdb_log_write(log, json);
-    json_destroy(json);
-    if (error) {
-        return ovsdb_wrap_error(error, "writing transaction failed");
-    }
-
-    if (durable) {
-        error = ovsdb_log_commit(log);
-        if (error) {
-            return ovsdb_wrap_error(error, "committing transaction failed");
-        }
-    }
-
-    return NULL;
-}
-
-static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
-ovsdb_convert_table(struct ovsdb_txn *txn,
-                    const struct ovsdb_table *src_table,
-                    struct ovsdb_table *dst_table)
-{
-    const struct ovsdb_row *src_row;
-    HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) {
-        struct ovsdb_row *dst_row = ovsdb_row_create(dst_table);
-        *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row);
-
-        struct shash_node *node;
-        SHASH_FOR_EACH (node, &src_table->schema->columns) {
-            const struct ovsdb_column *src_column = node->data;
-            if (src_column->index == OVSDB_COL_UUID ||
-                src_column->index == OVSDB_COL_VERSION) {
-                continue;
-            }
-
-            const struct ovsdb_column *dst_column
-                = shash_find_data(&dst_table->schema->columns,
-                                  src_column->name);
-            if (!dst_column) {
-                continue;
-            }
-
-            struct ovsdb_error *error = ovsdb_datum_convert(
-                &dst_row->fields[dst_column->index], &dst_column->type,
-                &src_row->fields[src_column->index], &src_column->type);
-            if (error) {
-                ovsdb_row_destroy(dst_row);
-                return error;
-            }
-        }
-
-        ovsdb_txn_row_insert(txn, dst_row);
-    }
-    return NULL;
-}
-
-struct ovsdb_error * OVS_WARN_UNUSED_RESULT
-ovsdb_file_convert(const struct ovsdb_file *file,
-                   const struct ovsdb_schema *new_schema)
-{
-    struct ovsdb *new_db = ovsdb_create(ovsdb_schema_clone(new_schema));
-    struct ovsdb_txn *txn = ovsdb_txn_create(new_db);
-    struct ovsdb_error *error = NULL;
-
-    struct shash_node *node;
-    SHASH_FOR_EACH (node, &file->db->tables) {
-        const char *table_name = node->name;
-        const struct ovsdb_table *src_table = node->data;
-        struct ovsdb_table *dst_table = shash_find_data(&new_db->tables,
-                                                        table_name);
-        if (!dst_table) {
-            continue;
-        }
-
-        error = ovsdb_convert_table(txn, src_table, dst_table);
-        if (error) {
-            goto error;
-        }
-    }
-
-    error = ovsdb_txn_start_commit(txn);
-    if (error) {
-        goto error;
-    }
-
-    struct ovsdb_log *new;
-    error = ovsdb_log_replace_start(file->log, &new);
-    if (error) {
-        goto error;
-    }
-
-    /* Write schema. */
-    struct json *schema_json = ovsdb_schema_to_json(new_schema);
-    error = ovsdb_log_write(new, schema_json);
-    json_destroy(schema_json);
-    if (error) {
-        goto error;
-    }
-
-    /* Write data. */
-    struct json *txn_json = ovsdb_file_txn_to_json(txn);
-    if (txn_json) {
-        error = ovsdb_log_write(new, txn_json);
-        json_destroy(txn_json);
-        if (error) {
-            goto error;
-        }
-    }
-
-    error = ovsdb_log_replace_commit(file->log, new);
-    if (error) {
-        goto error;
-    }
-
-    error = ovsdb_txn_finish_commit(txn, true);
-    ovs_assert(!error);         /* Can't happen. */
-
-    ovsdb_replace(file->db, new_db);
-
-    return NULL;
-
-error:
-    ovsdb_destroy(new_db);
-    if (txn) {
-        ovsdb_txn_abort(txn);
-    }
-    return error;
-}
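
The reworked file.c above funnels schema conversion through the new
ovsdb_convert() entry point.  A minimal, illustrative sketch follows; it
assumes the caller already holds an in-memory 'src' database and a parsed
'new_schema', and the error handling is only an example:

    struct ovsdb *dst;
    struct ovsdb_error *error = ovsdb_convert(src, new_schema, &dst);
    if (error) {
        char *msg = ovsdb_error_to_string_free(error);
        VLOG_ERR("schema conversion failed: %s", msg);
        free(msg);
    } else {
        /* 'dst' is a newly created, unbacked database holding the
         * converted data; the caller owns it and must eventually pass
         * it to ovsdb_destroy(). */
    }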
diff --git a/ovsdb/file.h b/ovsdb/file.h
index bc9b32cf6c33..b57412e7cb58 100644
--- a/ovsdb/file.h
+++ b/ovsdb/file.h
@@ -18,41 +18,23 @@ 
 
 #include <stdbool.h>
 #include "compiler.h"
-#include "log.h"
 
 struct ovsdb;
-struct ovsdb_file;
 struct ovsdb_schema;
 struct ovsdb_txn;
 
-struct ovsdb_error *ovsdb_file_open(const char *file_name, bool read_only,
-                                    struct ovsdb **, struct ovsdb_file **)
-    OVS_WARN_UNUSED_RESULT;
-
-struct ovsdb_error *ovsdb_file_open_as_schema(const char *file_name,
-                                              const struct ovsdb_schema *,
-                                              struct ovsdb **)
-    OVS_WARN_UNUSED_RESULT;
-
-struct ovsdb_error *ovsdb_file_save_copy(const char *file_name, int locking,
-                                         const char *comment,
-                                         const struct ovsdb *)
-    OVS_WARN_UNUSED_RESULT;
-
-struct ovsdb_error *ovsdb_file_compact(struct ovsdb_file *);
-
-struct ovsdb_error *ovsdb_file_read_schema(const char *file_name,
-                                           struct ovsdb_schema **)
-    OVS_WARN_UNUSED_RESULT;
-
-struct ovsdb_error *ovsdb_file_commit(struct ovsdb_file *,
-                                      const struct ovsdb_txn *, bool durable);
-void ovsdb_file_destroy(struct ovsdb_file *);
-
+struct json *ovsdb_to_txn_json(const struct ovsdb *, const char *comment);
+struct json *ovsdb_file_txn_to_json(const struct ovsdb_txn *);
 struct json *ovsdb_file_txn_annotate(struct json *, const char *comment);
+struct ovsdb_error *ovsdb_file_txn_from_json(struct ovsdb *,
+                                             const struct json *,
+                                             bool converting,
+                                             struct ovsdb_txn **)
+    OVS_WARN_UNUSED_RESULT;
 
-struct ovsdb_error *ovsdb_file_convert(const struct ovsdb_file *,
-                                       const struct ovsdb_schema *)
+struct ovsdb_error *ovsdb_convert(const struct ovsdb *src,
+                                  const struct ovsdb_schema *new_schema,
+                                  struct ovsdb **dstp)
     OVS_WARN_UNUSED_RESULT;
 
 #endif /* ovsdb/file.h */
diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c
index 8b2a4648bb1d..34dac76ecb5d 100644
--- a/ovsdb/jsonrpc-server.c
+++ b/ovsdb/jsonrpc-server.c
@@ -770,6 +770,15 @@  ovsdb_jsonrpc_lookup_db(const struct ovsdb_jsonrpc_session *s,
         goto error;
     }
 
+    if (!db->schema) {
+        error = ovsdb_error("database not available",
+                            "%s request specifies database %s which is not "
+                            "yet available because it has not completed "
+                            "joining its cluster",
+                            request->method, db_name);
+        goto error;
+    }
+
     *replyp = NULL;
     return db;
 
diff --git a/ovsdb/log.c b/ovsdb/log.c
index 7f05fb083246..aec72c77c121 100644
--- a/ovsdb/log.c
+++ b/ovsdb/log.c
@@ -42,6 +42,7 @@  enum ovsdb_log_mode {
 struct ovsdb_log {
     off_t prev_offset;
     off_t offset;
+    char *rel_name;
     char *name;
     char *magic;
     struct lockfile *lockfile;
@@ -49,15 +50,22 @@  struct ovsdb_log {
     struct ovsdb_error *read_error;
     bool write_error;
     enum ovsdb_log_mode mode;
+    off_t base;
 };
 
+static bool parse_header(char *header, const char **magicp,
+                         unsigned long int *length,
+                         uint8_t sha1[SHA1_DIGEST_SIZE]);
+static bool is_magic_ok(const char *needle, const char *haystack);
+
 /* Attempts to open 'name' with the specified 'open_mode'.  On success, stores
  * the new log into '*filep' and returns NULL; otherwise returns an error and
  * stores NULL into '*filep'.
  *
  * 'magic' is a short text string put at the beginning of every record and used
  * to distinguish one kind of log file from another.  For a conventional OVSDB
- * log file, use OVSDB_MAGIC.
+ * log file, use OVSDB_MAGIC.  To accept more than one magic string, separate
+ * them with "|", e.g. "MAGIC 1|MAGIC 2".
  *
  * Whether the file will be locked using lockfile_lock() depends on 'locking':
  * use true to lock it, false not to lock it, or -1 to lock it only if
@@ -70,20 +78,36 @@  ovsdb_log_open(const char *name, const char *magic,
 {
     struct lockfile *lockfile;
     struct ovsdb_error *error;
-    struct ovsdb_log *file;
-    struct stat s;
+    char *abs_name;
     FILE *stream;
     int flags;
     int fd;
 
+    /* If we can create a new file, we need to know what kind of magic to
+     * use, so there must be only one kind. */
+    if (open_mode == OVSDB_LOG_CREATE_EXCL || open_mode == OVSDB_LOG_CREATE) {
+        ovs_assert(!strchr(magic, '|'));
+    }
+
     *filep = NULL;
 
+    /* Use the absolute name of the file because ovsdb-server opens its
+     * database before daemonize() chdirs to "/". */
+    char *deref_name = follow_symlinks(name);
+    abs_name = abs_file_name(NULL, deref_name);
+    free(deref_name);
+    if (!abs_name) {
+        error = ovsdb_io_error(0, "could not determine current "
+                              "working directory");
+        goto error;
+    }
+
     ovs_assert(locking == -1 || locking == false || locking == true);
     if (locking < 0) {
         locking = open_mode != OVSDB_LOG_READ_ONLY;
     }
     if (locking) {
-        int retval = lockfile_lock(name, &lockfile);
+        int retval = lockfile_lock(abs_name, &lockfile);
         if (retval) {
             error = ovsdb_io_error(retval, "%s: failed to lock lockfile",
                                    name);
@@ -118,10 +142,10 @@  ovsdb_log_open(const char *name, const char *magic,
 #endif
     /* Special case for /dev/stdin to make it work even if the operating system
      * doesn't support it under that name. */
-    if (!strcmp(name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) {
+    if (!strcmp(abs_name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) {
         fd = dup(STDIN_FILENO);
     } else {
-        fd = open(name, flags, 0666);
+        fd = open(abs_name, flags, 0666);
     }
     if (fd < 0) {
         const char *op = (open_mode == OVSDB_LOG_CREATE_EXCL ? "create"
@@ -131,41 +155,53 @@  ovsdb_log_open(const char *name, const char *magic,
         goto error_unlock;
     }
 
-    if (!fstat(fd, &s)) {
-        if (s.st_size == 0) {
-            /* It's (probably) a new file so fsync() its parent directory to
-             * ensure that its directory entry is committed to disk. */
-            fsync_parent_dir(name);
-        } else if (s.st_size >= strlen(magic) && S_ISREG(s.st_mode)) {
-            /* Try to read the magic from the first log record.  If it's not
-             * the magic we expect, this is the wrong kind of file, so reject
-             * it immediately. */
-            size_t magic_len = strlen(magic);
-            char *buf = xzalloc(magic_len + 1);
-            bool err = (read(fd, buf, magic_len) == magic_len
-                        && strcmp(buf, magic));
-            free(buf);
-            if (err) {
-                error = ovsdb_error(NULL, "%s: bad magic (unexpected "
-                                    "kind of file)", name);
-                goto error_close;
-            }
-            if (lseek(fd, 0, SEEK_SET)) {
-                error = ovsdb_io_error(errno, "%s: seek failed", name);
-                goto error_close;
-            }
-        }
-    }
-
     stream = fdopen(fd, open_mode == OVSDB_LOG_READ_ONLY ? "rb" : "w+b");
     if (!stream) {
         error = ovsdb_io_error(errno, "%s: fdopen failed", name);
-        goto error_close;
+        close(fd);
+        goto error_unlock;
+    }
+
+    /* Read the magic from the first log record. */
+    char header[128];
+    const char *actual_magic;
+    if (!fgets(header, sizeof header, stream)) {
+        if (ferror(stream)) {
+            error = ovsdb_io_error(errno, "%s: read error", name);
+            goto error_fclose;
+        }
+
+        /* We need to be able to report what kind of file this is but we can't
+         * if it's empty and we accept more than one. */
+        if (strchr(magic, '|')) {
+            error = ovsdb_error(NULL, "%s: unexpected end of file", name);
+            goto error_fclose;
+        }
+        actual_magic = magic;
+
+        /* It's an empty file and therefore probably a new file, so fsync()
+         * its parent directory to ensure that its directory entry is
+         * committed to disk. */
+        fsync_parent_dir(abs_name);
+    } else {
+        unsigned long int length;
+        uint8_t sha1[SHA1_DIGEST_SIZE];
+        if (!parse_header(header, &actual_magic, &length, sha1)
+            || !is_magic_ok(actual_magic, magic)) {
+            error = ovsdb_error(NULL, "%s: unexpected file format", name);
+            goto error_fclose;
+        }
+    }
+
+    if (fseek(stream, 0, SEEK_SET)) {
+        error = ovsdb_io_error(errno, "%s: seek failed", name);
+        goto error_fclose;
     }
 
-    file = xmalloc(sizeof *file);
-    file->name = xstrdup(name);
-    file->magic = xstrdup(magic);
+    struct ovsdb_log *file = xmalloc(sizeof *file);
+    file->name = abs_name;
+    file->rel_name = xstrdup(name);
+    file->magic = xstrdup(actual_magic);
     file->lockfile = lockfile;
     file->stream = stream;
     file->prev_offset = 0;
@@ -173,22 +209,48 @@  ovsdb_log_open(const char *name, const char *magic,
     file->read_error = NULL;
     file->write_error = false;
     file->mode = OVSDB_LOG_READ;
+    file->base = 0;
     *filep = file;
     return NULL;
 
-error_close:
-    close(fd);
+error_fclose:
+    fclose(stream);
 error_unlock:
     lockfile_unlock(lockfile);
 error:
+    free(abs_name);
     return error;
 }
 
+/* Returns true if 'needle' is one of the |-delimited words in 'haystack'. */
+static bool
+is_magic_ok(const char *needle, const char *haystack)
+{
+    /* 'needle' can't be multiple words. */
+    if (strchr(needle, '|')) {
+        return false;
+    }
+
+    size_t n = strlen(needle);
+    for (;;) {
+        if (!strncmp(needle, haystack, n) && strchr("|", haystack[n])) {
+            return true;
+        }
+        haystack = strchr(haystack, '|');
+        if (!haystack) {
+            return false;
+        }
+        haystack++;
+    }
+}
+
 void
 ovsdb_log_close(struct ovsdb_log *file)
 {
     if (file) {
         free(file->name);
+        free(file->rel_name);
+        free(file->magic);
         if (file->stream) {
             fclose(file->stream);
         }
@@ -198,20 +260,34 @@  ovsdb_log_close(struct ovsdb_log *file)
     }
 }
 
+const char *
+ovsdb_log_get_magic(const struct ovsdb_log *log)
+{
+    return log->magic;
+}
+
 static bool
-parse_header(const char *magic, char *header, unsigned long int *length,
-             uint8_t sha1[SHA1_DIGEST_SIZE])
+parse_header(char *header, const char **magicp,
+             unsigned long int *length, uint8_t sha1[SHA1_DIGEST_SIZE])
 {
-    char *p;
+    /* 'header' must consist of "OVSDB "... */
+    const char lead[] = "OVSDB ";
+    if (strncmp(lead, header, strlen(lead))) {
+        return false;
+    }
 
-    /* 'header' must consist of a magic string... */
-    size_t magic_len = strlen(magic);
-    if (strncmp(header, magic, magic_len) || header[magic_len] != ' ') {
+    /* ...followed by a magic string... */
+    char *magic = header + strlen(lead);
+    size_t magic_len = strcspn(magic, " ");
+    if (magic[magic_len] != ' ') {
         return false;
     }
+    magic[magic_len] = '\0';
+    *magicp = magic;
 
     /* ...followed by a length in bytes... */
-    *length = strtoul(header + magic_len + 1, &p, 10);
+    char *p;
+    *length = strtoul(magic + magic_len + 1, &p, 10);
     if (!*length || *length == ULONG_MAX || *p != ' ') {
         return false;
     }
@@ -250,7 +326,7 @@  parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length,
             json_parser_abort(parser);
             return ovsdb_io_error(ferror(file->stream) ? errno : EOF,
                                   "%s: error reading %lu bytes "
-                                  "starting at offset %lld", file->name,
+                                  "starting at offset %lld", file->rel_name,
                                   length, (long long int) offset);
         }
         sha1_update(&ctx, input, chunk);
@@ -263,13 +339,22 @@  parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length,
     return NULL;
 }
 
+/* Attempts to read a log record from 'file'.
+ *
+ * If successful, returns NULL and stores in '*jsonp' the JSON object that the
+ * record contains.  The caller owns the data and must eventually free it (with
+ * json_destroy()).
+ *
+ * If a read error occurs, returns the error and stores NULL in '*jsonp'.
+ *
+ * If the read reaches end of file, returns NULL and stores NULL in
+ * '*jsonp'. */
 struct ovsdb_error *
 ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
 {
     uint8_t expected_sha1[SHA1_DIGEST_SIZE];
     uint8_t actual_sha1[SHA1_DIGEST_SIZE];
     struct ovsdb_error *error;
-    off_t data_offset;
     unsigned long data_length;
     struct json *json;
     char header[128];
@@ -279,27 +364,30 @@  ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
     if (file->read_error) {
         return ovsdb_error_clone(file->read_error);
     } else if (file->mode == OVSDB_LOG_WRITE) {
-        return OVSDB_BUG("reading file in write mode");
+        return NULL;
     }
 
     if (!fgets(header, sizeof header, file->stream)) {
         if (feof(file->stream)) {
             error = NULL;
         } else {
-            error = ovsdb_io_error(errno, "%s: read failed", file->name);
+            error = ovsdb_io_error(errno, "%s: read failed", file->rel_name);
         }
         goto error;
     }
+    off_t data_offset = file->offset + strlen(header);
 
-    if (!parse_header(file->magic, header, &data_length, expected_sha1)) {
+    const char *magic;
+    if (!parse_header(header, &magic, &data_length, expected_sha1)
+        || strcmp(magic, file->magic)) {
         error = ovsdb_syntax_error(NULL, NULL, "%s: parse error at offset "
                                    "%lld in header line \"%.*s\"",
-                                   file->name, (long long int) file->offset,
+                                   file->rel_name,
+                                   (long long int) file->offset,
                                    (int) strcspn(header, "\n"), header);
         goto error;
     }
 
-    data_offset = file->offset + strlen(header);
     error = parse_body(file, data_offset, data_length, actual_sha1, &json);
     if (error) {
         goto error;
@@ -309,7 +397,7 @@  ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
         error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
                                    "offset %lld have SHA-1 hash "SHA1_FMT" "
                                    "but should have hash "SHA1_FMT,
-                                   file->name, data_length,
+                                   file->rel_name, data_length,
                                    (long long int) data_offset,
                                    SHA1_ARGS(actual_sha1),
                                    SHA1_ARGS(expected_sha1));
@@ -319,7 +407,7 @@  ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
     if (json->type == JSON_STRING) {
         error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
                                    "offset %lld are not valid JSON (%s)",
-                                   file->name, data_length,
+                                   file->rel_name, data_length,
                                    (long long int) data_offset,
                                    json->u.string);
         goto error;
@@ -327,7 +415,7 @@  ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
     if (json->type != JSON_OBJECT) {
         error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
                                    "offset %lld are not a JSON object",
-                                   file->name, data_length,
+                                   file->rel_name, data_length,
                                    (long long int) data_offset);
         goto error;
     }
@@ -359,6 +447,23 @@  ovsdb_log_unread(struct ovsdb_log *file)
     file->offset = file->prev_offset;
 }
 
+static struct ovsdb_error *
+ovsdb_log_truncate(struct ovsdb_log *file)
+{
+    file->mode = OVSDB_LOG_WRITE;
+
+    struct ovsdb_error *error = NULL;
+    if (fseeko(file->stream, file->offset, SEEK_SET)) {
+        error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld",
+                               file->rel_name, (long long int) file->offset);
+    } else if (ftruncate(fileno(file->stream), file->offset)) {
+        error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld",
+                               file->rel_name, (long long int) file->offset);
+    }
+    file->write_error = error != NULL;
+    return error;
+}
+
 void
 ovsdb_log_compose_record(const struct json *json,
                          const char *magic, struct ds *header, struct ds *data)
@@ -375,33 +480,23 @@  ovsdb_log_compose_record(const struct json *json,
     /* Compose header. */
     uint8_t sha1[SHA1_DIGEST_SIZE];
     sha1_bytes(data->string, data->length, sha1);
-    ds_put_format(header, "%s %"PRIuSIZE" "SHA1_FMT"\n",
+    ds_put_format(header, "OVSDB %s %"PRIuSIZE" "SHA1_FMT"\n",
                   magic, data->length, SHA1_ARGS(sha1));
 }
 
 struct ovsdb_error *
 ovsdb_log_write(struct ovsdb_log *file, const struct json *json)
 {
-    struct ovsdb_error *error;
-
     if (file->mode == OVSDB_LOG_READ || file->write_error) {
-        file->mode = OVSDB_LOG_WRITE;
-        file->write_error = false;
-        if (fseeko(file->stream, file->offset, SEEK_SET)) {
-            error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld",
-                                   file->name, (long long int) file->offset);
-            goto error;
-        }
-        if (ftruncate(fileno(file->stream), file->offset)) {
-            error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld",
-                                   file->name, (long long int) file->offset);
-            goto error;
+        struct ovsdb_error *error = ovsdb_log_truncate(file);
+        if (error) {
+            file->write_error = true;
+            return error;
         }
     }
 
     if (json->type != JSON_OBJECT && json->type != JSON_ARRAY) {
-        error = OVSDB_BUG("bad JSON type");
-        goto error;
+        return OVSDB_BUG("bad JSON type");
     }
 
     struct ds header = DS_EMPTY_INITIALIZER;
@@ -420,34 +515,33 @@  ovsdb_log_write(struct ovsdb_log *file, const struct json *json)
          * nothing further we can do. */
         ignore(ftruncate(fileno(file->stream), file->offset));
 
-        error = ovsdb_io_error(errno, "%s: write failed", file->name);
-        goto error;
+        file->write_error = true;
+        return ovsdb_io_error(errno, "%s: write failed", file->rel_name);
     }
 
     file->offset += total_length;
     return NULL;
-
-error:
-    file->write_error = true;
-    return error;
 }
 
 struct ovsdb_error *
 ovsdb_log_commit(struct ovsdb_log *file)
 {
     if (fsync(fileno(file->stream))) {
-        return ovsdb_io_error(errno, "%s: fsync failed", file->name);
+        return ovsdb_io_error(errno, "%s: fsync failed", file->rel_name);
     }
     return NULL;
 }
 
-/* Returns the current offset into the file backing 'log', in bytes.  This
- * reflects the number of bytes that have been read or written in the file.  If
- * the whole file has been read, this is the file size. */
-off_t
-ovsdb_log_get_offset(const struct ovsdb_log *log)
+void
+ovsdb_log_mark_base(struct ovsdb_log *log)
+{
+    log->base = log->offset;
+}
+
+bool
+ovsdb_log_has_grown(const struct ovsdb_log *log)
 {
-    return log->offset;
+    return log->offset > 10 * 1024 * 1024 && log->offset / 4 > log->base;
 }
 
 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
@@ -468,6 +562,7 @@  ovsdb_log_replace(struct ovsdb_log *log, struct json **entries, size_t n)
             return error;
         }
     }
+    ovsdb_log_mark_base(new);
 
     return ovsdb_log_replace_commit(log, new);
 }
@@ -476,7 +571,7 @@  struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 ovsdb_log_replace_start(struct ovsdb_log *old,
                         struct ovsdb_log **newp)
 {
-    char *tmp_name = xasprintf("%s.tmp", old->name);
+    char *tmp_name = xasprintf("%s.tmp", old->rel_name);
     struct ovsdb_error *error;
 
     ovs_assert(old->lockfile);
@@ -534,6 +629,7 @@  ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new)
     /* read_error only matters for OVSDB_LOG_READ. */
     old->write_error = new->write_error;
     old->mode = OVSDB_LOG_WRITE;
+    old->base = new->base;
 
     /* Free 'new'. */
     ovsdb_log_close(new);
diff --git a/ovsdb/log.h b/ovsdb/log.h
index 5be7eb91b165..d7345324ff03 100644
--- a/ovsdb/log.h
+++ b/ovsdb/log.h
@@ -1,4 +1,4 @@ 
-/* Copyright (c) 2009, 2010, 2011 Nicira, Inc.
+/* Copyright (c) 2009, 2010, 2011, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@  enum ovsdb_log_open_mode {
     OVSDB_LOG_CREATE            /* Create or open file, read/write. */
 };
 
-#define OVSDB_MAGIC "OVSDB JSON"
+#define OVSDB_MAGIC "JSON"
 
 struct ovsdb_error *ovsdb_log_open(const char *name, const char *magic,
                                    enum ovsdb_log_open_mode,
@@ -39,6 +39,8 @@  struct ovsdb_error *ovsdb_log_open(const char *name, const char *magic,
     OVS_WARN_UNUSED_RESULT;
 void ovsdb_log_close(struct ovsdb_log *);
 
+const char *ovsdb_log_get_magic(const struct ovsdb_log *);
+
 struct ovsdb_error *ovsdb_log_read(struct ovsdb_log *, struct json **)
     OVS_WARN_UNUSED_RESULT;
 void ovsdb_log_unread(struct ovsdb_log *);
@@ -51,7 +53,8 @@  struct ovsdb_error *ovsdb_log_write(struct ovsdb_log *, const struct json *)
 struct ovsdb_error *ovsdb_log_commit(struct ovsdb_log *)
     OVS_WARN_UNUSED_RESULT;
 
-off_t ovsdb_log_get_offset(const struct ovsdb_log *);
+void ovsdb_log_mark_base(struct ovsdb_log *);
+bool ovsdb_log_has_grown(const struct ovsdb_log *);
 
 struct ovsdb_error *ovsdb_log_replace(struct ovsdb_log *,
                                       struct json **entries, size_t n)
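
For orientation on the log changes: every record now begins with a header
line of the form "OVSDB <magic> <length> <sha1>" (composed by
ovsdb_log_compose_record()), followed by <length> bytes of JSON, and the
magic for standalone databases is now simply "JSON".  A minimal write
sequence, assuming an already-open 'log' and a JSON object 'json', might
look like:

    struct ovsdb_error *error = ovsdb_log_write(log, json);
    if (!error) {
        /* fsync() the record to disk, if durability is required. */
        error = ovsdb_log_commit(log);
    }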
diff --git a/ovsdb/ovsdb-client.1.in b/ovsdb/ovsdb-client.1.in
index cd17467147da..fb803ba876a2 100644
--- a/ovsdb/ovsdb-client.1.in
+++ b/ovsdb/ovsdb-client.1.in
@@ -46,6 +46,8 @@  ovsdb\-client \- command-line interface to \fBovsdb-server\fR(1)
 .br
 \fBovsdb\-client \fR[\fIoptions\fR] \fBmonitor\-cond\fI \fR[\fIserver\fR] \fR[\fIdatabase\fR] \fIconditions
 \fItable\fR [\fIcolumn\fR[\fB,\fIcolumn\fR]...]...
+.br
+\fBovsdb\-client \fR[\fIoptions\fR] \fBwait\fR \fR[\fIserver\fR] \fIdatabase\fR \fIstate\fR
 .IP "Testing Commands:"
 \fBovsdb\-client \fR[\fIoptions\fR] \fBlock\fI \fR[\fIserver\fR] \fIlock\fR
 .br
@@ -215,8 +217,8 @@  single transaction.
 .IP
 UUIDs for rows in the restored database will differ from those in
 \fIsnapshot\fR, because the OVSDB protocol does not allow clients to
-specify row UUIDs.  Another way to restore a database,
-which does also restore row UUIDs, is to stop
+specify row UUIDs.  Another way to restore a standalone or active-backup
+database, which does also restore row UUIDs, is to stop
 the server or servers, replace the database file by the snapshot, then
 restart the database.  Either way, ephemeral columns are not restored,
 since by design they do not survive across restarts of
@@ -274,6 +276,27 @@  prints the initial database contents.
 The \fBmonitor\fR command uses RFC 7047 "monitor" method to open a monitor
 session with the server.
 .
+.IP "\fBwait\fR \fR[\fIserver\fR] \fIdatabase state\fR"
+Waits for \fIdatabase\fR on \fIserver\fR to enter a desired \fIstate\fR,
+which may be one of:
+.RS
+.IP "\fBadded\fR"
+Waits until a database with the given name has been added to
+\fIserver\fR.
+.IP "\fBconnected\fR"
+Waits until a database with the given name has been added to
+\fIserver\fR.  Then, if \fIdatabase\fR is clustered, additionally
+waits until it has joined and connected to its cluster.
+.IP "\fBremoved\fR"
+Waits until \fIdatabase\fR has been removed from the database server.
+This can also be used to wait for a database to complete leaving its
+cluster, because \fBovsdb\-server\fR removes a database at that point.
+.RE
+.IP
+\fIdatabase\fR is mandatory for this command because it is often used
+to check for databases that have not yet been added to the server, in
+which case the usual \fBovsdb\-client\fR behavior of acting on a default
+database does not apply.
 .SS "Testing commands"
 These commands are mostly of interest for testing the correctness
 of the OVSDB server.
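
A typical (purely illustrative) use of the new "wait" command is to block
a start-up script until a freshly joined clustered database becomes
usable, e.g. "ovsdb-client wait unix:db.sock OVN_Southbound connected";
the socket path and database name here are only placeholders.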
diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c
index c153a978385a..1324b596a6a2 100644
--- a/ovsdb/ovsdb-client.c
+++ b/ovsdb/ovsdb-client.c
@@ -44,9 +44,11 @@ 
 #include "row.h"
 #include "sort.h"
 #include "svec.h"
+#include "storage.h"
 #include "stream.h"
 #include "stream-ssl.h"
 #include "table.h"
+#include "transaction.h"
 #include "monitor.h"
 #include "condition.h"
 #include "timeval.h"
@@ -1647,6 +1649,89 @@  do_backup(struct jsonrpc *rpc, const char *database,
     jsonrpc_msg_destroy(reply);
 }
 
+static struct ovsdb_storage *
+open_standalone_db(const char *filename, bool rw)
+{
+    struct ovsdb_storage *storage;
+    check_ovsdb_error(ovsdb_storage_open(filename, rw, &storage));
+    if (ovsdb_storage_is_clustered(storage)) {
+        ovs_fatal(0, "%s: cannot use clustered database for this operation",
+                  filename);
+    }
+    return storage;
+}
+
+static struct ovsdb_schema *
+read_schema_from_storage(struct ovsdb_storage *storage)
+{
+    struct json *txn_json;
+    struct ovsdb_schema *schema;
+    check_ovsdb_error(ovsdb_storage_read(storage, &schema, &txn_json, NULL));
+    if (!schema && !txn_json) {
+        ovs_fatal(0, "unexpected end of file reading schema");
+    }
+    ovs_assert(schema && !txn_json);
+
+    return schema;
+}
+
+static struct json *
+read_txn_from_storage(struct ovsdb_storage *storage)
+{
+    struct json *txn_json;
+    struct ovsdb_schema *schema;
+    check_ovsdb_error(ovsdb_storage_read(storage, &schema, &txn_json, NULL));
+    ovs_assert(!schema);
+
+    return txn_json;
+}
+
+/* The new database takes ownership of the storage. */
+static struct ovsdb *
+read_ovsdb_txns(struct ovsdb_schema *schema, struct ovsdb_storage *storage,
+                bool converting)
+{
+    struct ovsdb *ovsdb = ovsdb_create(schema,
+                                       ovsdb_storage_create_unbacked());
+    for (;;) {
+        struct json *txn_json = read_txn_from_storage(storage);
+        if (!txn_json) {
+            break;
+        }
+
+        struct ovsdb_txn *txn;
+        check_ovsdb_error(ovsdb_file_txn_from_json(ovsdb, txn_json, converting,
+                                                   &txn));
+        json_destroy(txn_json);
+
+        struct ovsdb_error *error = ovsdb_txn_replay_commit(txn);
+        if (error) {
+            ovsdb_storage_unread(storage);
+            break;
+        }
+    }
+    return ovsdb;
+}
+
+static void
+check_transaction_reply(struct jsonrpc_msg *reply)
+{
+    if (reply->result->type != JSON_ARRAY) {
+        ovs_fatal(0, "result is not array");
+    }
+    for (size_t i = 0; i < json_array(reply->result)->n; i++) {
+        struct json *json = json_array(reply->result)->elems[i];
+        if (json->type != JSON_OBJECT) {
+            ovs_fatal(0, "result array element is not object");
+        }
+        struct shash *object = json_object(json);
+        if (shash_find(object, "error")) {
+            ovs_fatal(0, "server returned error reply: %s",
+                      json_to_string(json, JSSF_SORT));
+        }
+    }
+}
+
 static void
 do_restore(struct jsonrpc *rpc, const char *database,
            int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
@@ -1656,10 +1741,12 @@  do_restore(struct jsonrpc *rpc, const char *database,
                   "please redirect stdin from a file");
     }
 
-    struct ovsdb *backup;
-    check_ovsdb_error(ovsdb_file_open("/dev/stdin", true, &backup, NULL));
+    struct ovsdb_storage *storage = open_standalone_db("/dev/stdin", false);
+    struct ovsdb_schema *schema = read_schema_from_storage(storage);
+    struct ovsdb *backup = read_ovsdb_txns(schema, storage, true);
+    ovsdb_storage_close(storage);
+    backup->storage = NULL;
 
-    const struct ovsdb_schema *schema = backup->schema;
     struct ovsdb_schema *schema2 = fetch_schema(rpc, database);
     if (!ovsdb_schema_equal(schema, schema2)) {
         struct ds s = DS_EMPTY_INITIALIZER;
@@ -1724,20 +1811,7 @@  do_restore(struct jsonrpc *rpc, const char *database,
     struct jsonrpc_msg *rq = jsonrpc_create_request("transact", txn, NULL);
     struct jsonrpc_msg *reply;
     check_txn(jsonrpc_transact_block(rpc, rq, &reply), &reply);
-    if (reply->result->type != JSON_ARRAY) {
-        ovs_fatal(0, "result is not array");
-    }
-    for (size_t i = 0; i < json_array(reply->result)->n; i++) {
-        struct json *json = json_array(reply->result)->elems[i];
-        if (json->type != JSON_OBJECT) {
-            ovs_fatal(0, "result array element is not object");
-        }
-        struct shash *object = json_object(json);
-        if (shash_find(object, "error")) {
-            ovs_fatal(0, "server returned error reply: %s",
-                      json_to_string(json, JSSF_SORT));
-        }
-    }
+    check_transaction_reply(reply);
     jsonrpc_msg_destroy(reply);
 }
 
@@ -1936,6 +2010,58 @@  do_lock_unlock(struct jsonrpc *rpc, const char *database OVS_UNUSED,
     do_lock(rpc, "unlock", argv[0]);
 }
 
+static void
+do_wait(struct jsonrpc *rpc, const char *unused OVS_UNUSED,
+        int argc OVS_UNUSED, char *argv[])
+{
+    const char *database = argv[0];
+    const char *state = argv[1];
+
+    db_change_aware = 1;
+    send_db_change_aware(rpc);
+
+    struct json *txn = json_array_create_empty();
+    json_array_add(txn, json_string_create("_Server"));
+
+    struct json *op = json_object_create();
+    json_array_add(txn, op);
+    json_object_put_string(op, "op", "wait");
+    json_object_put_string(op, "table", "Database");
+    json_object_put(op, "where",
+                    json_array_create_1(
+                        json_array_create_3(
+                            json_string_create("name"),
+                            json_string_create("=="),
+                            json_string_create(database))));
+
+    if (!strcmp(state, "connected")) {
+        /* Wait until connected == true. */
+        json_object_put(op, "columns",
+                        json_array_create_1(json_string_create("connected")));
+        json_object_put_string(op, "until", "==");
+
+        struct json *row = json_object_create();
+        json_object_put(row, "connected", json_boolean_create(true));
+        json_object_put(op, "rows", json_array_create_1(row));
+    } else if (!strcmp(state, "added") || !strcmp(state, "removed")) {
+        /* Wait until such a row exists, or not, respectively.  */
+        json_object_put(op, "columns", json_array_create_empty());
+        json_object_put_string(op, "until", "==");
+        json_object_put(op, "rows",
+                        (!strcmp(state, "added")
+                         ? json_array_create_1(json_object_create())
+                         : json_array_create_empty()));
+    } else {
+        ovs_fatal(0, "%s: unknown state", state);
+    }
+
+    struct jsonrpc_msg *rq = jsonrpc_create_request("transact", txn, NULL);
+    struct jsonrpc_msg *reply;
+    check_txn(jsonrpc_transact_block(rpc, rq, &reply), &reply);
+    check_transaction_reply(reply);
+    jsonrpc_msg_destroy(reply);
+}
+
 /* All command handlers (except for "help") are expected to take an optional
  * server socket name (e.g. "unix:...") as their first argument.  The socket
  * name argument must be included in max_args (but left out of min_args).  The
@@ -1954,6 +2080,7 @@  static const struct ovsdb_client_command all_commands[] = {
     { "query",              NEED_RPC,      1, 1,       do_query },
     { "monitor",            NEED_DATABASE, 1, INT_MAX, do_monitor },
     { "monitor-cond",       NEED_DATABASE, 2, 3,       do_monitor_cond },
+    { "wait",               NEED_RPC,      2, 2,       do_wait },
     { "convert",            NEED_RPC,      1, 1,       do_convert },
     { "needs-conversion",   NEED_RPC,      1, 1,       do_needs_conversion },
     { "dump",               NEED_DATABASE, 0, INT_MAX, do_dump },
diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in
index 02a887b94935..88a17bb3713f 100644
--- a/ovsdb/ovsdb-server.1.in
+++ b/ovsdb/ovsdb-server.1.in
@@ -40,12 +40,21 @@  see \fBovsdb\fR(7).
 Each OVSDB file may be specified on the command line as \fIdatabase\fR.
 If none is specified, the default is \fB@DBDIR@/conf.db\fR.  The database
 files must already have been created and initialized using, for
-example, \fBovsdb\-tool create\fR.
+example, \fBovsdb\-tool\fR's \fBcreate\fR, \fBcreate\-cluster\fR, or
+\fBjoin\-cluster\fR command.
 .PP
-This OVSDB implementation supports standalone and active-backup
-databases, as well as database replication.
+This OVSDB implementation supports standalone, active-backup, and
+clustered database service models, as well as database replication.
 See the Service Models section of \fBovsdb\fR(7) for more information.
 .PP
+For clustered databases, when the \fB\-\-detach\fR option is used,
+\fBovsdb\-server\fR detaches without waiting for the server to
+successfully join a cluster (if the database file is freshly
+created with \fBovsdb\-tool join\-cluster\fR) or connect to a
+cluster that it has already joined.  Use \fBovsdb\-client wait\fR (see
+\fBovsdb\-client\fR(1)) to wait until the server has successfully
+joined and connected to a cluster.
+.PP
 In addition to user-specified databases, \fBovsdb\-server\fR version
 2.9 and later also always hosts a built-in database named
 \fB_Server\fR.  Please see \fBovsdb\-server\fR(5) for documentation on
@@ -110,7 +119,10 @@  This option is not supported on Windows platform.
 .SS "Daemon Options"
 .ds DD \
 \fBovsdb\-server\fR detaches only after it starts listening on all \
-configured remotes.
+configured remotes.  At this point, all standalone and active-backup \
+databases are ready for use.  Clustered databases only become ready \
+for use after they finish joining their clusters (which could have \
+already happened in previous runs of \fBovsdb\-server\fR).
 .so lib/daemon.man
 .SS "Service Options"
 .so lib/service.man
@@ -118,7 +130,9 @@  configured remotes.
 .so lib/vlog.man
 .SS "Active-Backup Options"
 These options support the \fBovsdb\-server\fR active-backup service
-model and database replication.  By
+model and database replication.  These options apply only to databases
+in the format used for standalone and active-backup databases, which
+is the database format created by \fBovsdb\-tool create\fR.  By
 default, when it serves a database in this format, \fBovsdb\-server\fR
 runs as a standalone server.  These options can configure it for
 active-backup use:
@@ -292,6 +306,60 @@  When the connection is in \fIreplicating\fR state, further output shows
 the list of databases currently replicating, and the tables that are
 excluded.
 .
+.SS "Cluster Commands"
+These commands support the \fBovsdb\-server\fR clustered service model.
+They apply only to databases in the format used for clustered databases,
+which is the database format created by \fBovsdb\-tool create\-cluster\fR
+and \fBovsdb\-tool join\-cluster\fR.
+.
+.IP "\fBcluster/cid \fIdb\fR"
+Prints the cluster ID for \fIdb\fR, which is a UUID that identifies
+the cluster.  If \fIdb\fR is a database newly created by
+\fBovsdb\-tool join\-cluster\fR that has not yet successfully joined
+its cluster, and \fB\-\-cid\fR was not specified on the
+\fBjoin\-cluster\fR command line, then this command will report an
+error because the cluster ID is not yet known.
+.
+.IP "\fBcluster/sid \fIdb\fR"
+Prints the server ID for \fIdb\fR, which is a UUID that identifies
+this server within the cluster.
+.
+.IP "\fBcluster/status \fIdb\fR"
+Prints this server's status within the cluster and the status of its
+connections to other servers in the cluster.
+.
+.IP "\fBcluster/leave \fR[\fB\-\-force\fR] \fIdb\fR"
+Without \fB\-\-force\fR, this command starts the process of gracefully
+removing this server from its cluster.  At least one server must
+remain, and the cluster must be healthy, that is, more than half of
+its servers must be up.
+.IP
+With \fB\-\-force\fR, this command forces the server to leave its
+cluster and form a new single-node cluster that contains only itself.
+The data in the new cluster may be inconsistent with the former
+cluster: transactions not yet replicated to the server will be lost,
+and transactions not yet applied to the cluster may be committed.
+Afterward, any servers in its former cluster will regard the server as
+having failed.
+.IP
+When the server successfully leaves the cluster, it stops serving
+\fIdb\fR, as if \fBovsdb\-server/remove\-db \fIdb\fR had been
+executed.
+.IP
+Use \fBovsdb\-client wait\fR (see \fBovsdb\-client\fR(1)) to wait
+until the server has left the cluster.
+.
+.IP "\fBcluster/kick \fIdb server\fR"
+Starts graceful removal of \fIserver\fR from \fIdb\fR's cluster, like
+\fBcluster/leave\fR (without \fB\-\-force\fR) except that it can
+remove any server, not just this one.
+.IP
+\fIserver\fR may be a server ID, as printed by \fBcluster/sid\fR, or
+the server's local network address as passed to \fBovsdb\-tool\fR's
+\fBcreate\-cluster\fR or \fBjoin\-cluster\fR command.  Use
+\fBcluster/status\fR to see a list of cluster members.
+.
 .so lib/vlog-unixctl.man
 .so lib/memory-unixctl.man
 .so lib/coverage-unixctl.man
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
index 65cf61ade954..904b80d9b0bf 100644
--- a/ovsdb/ovsdb-server.c
+++ b/ovsdb/ovsdb-server.c
@@ -49,6 +49,7 @@ 
 #include "stream-ssl.h"
 #include "stream.h"
 #include "sset.h"
+#include "storage.h"
 #include "table.h"
 #include "timeval.h"
 #include "transaction.h"
@@ -63,7 +64,6 @@  VLOG_DEFINE_THIS_MODULE(ovsdb_server);
 
 struct db {
     char *filename;
-    struct ovsdb_file *file;
     struct ovsdb *db;
     struct uuid row_uuid;
 };
@@ -107,9 +107,14 @@  static unixctl_cb_func ovsdb_server_add_database;
 static unixctl_cb_func ovsdb_server_remove_database;
 static unixctl_cb_func ovsdb_server_list_databases;
 
-static char *open_db(struct server_config *config, const char *filename);
+static struct ovsdb_error *read_db(struct server_config *, struct db *)
+    OVS_WARN_UNUSED_RESULT;
+static struct ovsdb_error *open_db(struct server_config *,
+                                   const char *filename)
+    OVS_WARN_UNUSED_RESULT;
 static void add_server_db(struct server_config *);
-static void close_db(struct db *db);
+static void remove_db(struct server_config *, struct shash_node *db);
+static void close_db(struct server_config *, struct db *);
 
 static void parse_options(int argc, char *argvp[],
                           struct sset *db_filenames, struct sset *remotes,
@@ -153,7 +158,18 @@  ovsdb_replication_init(const char *sync_from, const char *exclude,
 }
 
 static void
-main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs,
+log_and_free_error(struct ovsdb_error *error)
+{
+    if (error) {
+        char *s = ovsdb_error_to_string_free(error);
+        VLOG_INFO("%s", s);
+        free(s);
+    }
+}
+
+static void
+main_loop(struct server_config *config,
+          struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs,
           struct unixctl_server *unixctl, struct sset *remotes,
           struct process *run_process, bool *exiting, bool *is_backup)
 {
@@ -201,11 +217,21 @@  main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs,
             }
         }
 
-        SHASH_FOR_EACH(node, all_dbs) {
+        struct shash_node *next;
+        SHASH_FOR_EACH_SAFE (node, next, all_dbs) {
             struct db *db = node->data;
             if (ovsdb_trigger_run(db->db, time_msec())) {
                 ovsdb_jsonrpc_server_reconnect(jsonrpc, false);
             }
+            ovsdb_storage_run(db->db->storage);
+            log_and_free_error(read_db(config, db));
+            if (ovsdb_storage_is_dead(db->db->storage)) {
+                VLOG_INFO("%s: removing database because storage disconnected "
+                          "permanently", node->name);
+                remove_db(config, node);
+            } else if (ovsdb_storage_should_snapshot(db->db->storage)) {
+                log_and_free_error(ovsdb_snapshot(db->db));
+            }
         }
         if (run_process) {
             process_run();
@@ -232,6 +258,8 @@  main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs,
         SHASH_FOR_EACH(node, all_dbs) {
             struct db *db = node->data;
             ovsdb_trigger_wait(db->db, time_msec());
+            ovsdb_storage_wait(db->db->storage);
+            ovsdb_storage_read_wait(db->db->storage);
         }
         if (run_process) {
             process_wait(run_process);
@@ -267,7 +295,6 @@  main(int argc, char *argv[])
     struct server_config server_config;
     struct shash all_dbs;
     struct shash_node *node, *next;
-    char *error;
 
     ovs_cmdl_proctitle_init(argc, argv);
     set_program_name(argv[0]);
@@ -319,14 +346,15 @@  main(int argc, char *argv[])
     perf_counters_init();
 
     SSET_FOR_EACH (db_filename, &db_filenames) {
-        error = open_db(&server_config, db_filename);
+        struct ovsdb_error *error = open_db(&server_config, db_filename);
         if (error) {
-            ovs_fatal(0, "%s", error);
+            char *s = ovsdb_error_to_string_free(error);
+            ovs_fatal(0, "%s", s);
         }
     }
     add_server_db(&server_config);
 
-    error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes);
+    char *error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes);
     if (!error) {
         error = reconfigure_ssl(&all_dbs);
     }
@@ -420,15 +448,15 @@  main(int argc, char *argv[])
         ovsdb_replication_init(sync_from, sync_exclude, &all_dbs, server_uuid);
     }
 
-    main_loop(jsonrpc, &all_dbs, unixctl, &remotes, run_process, &exiting,
-              &is_backup);
+    main_loop(&server_config, jsonrpc, &all_dbs, unixctl, &remotes,
+              run_process, &exiting, &is_backup);
 
-    ovsdb_jsonrpc_server_destroy(jsonrpc);
     SHASH_FOR_EACH_SAFE(node, next, &all_dbs) {
         struct db *db = node->data;
-        close_db(db);
+        close_db(&server_config, db);
         shash_delete(&all_dbs, node);
     }
+    ovsdb_jsonrpc_server_destroy(jsonrpc);
     shash_destroy(&all_dbs);
     sset_destroy(&remotes);
     sset_destroy(&db_filenames);
@@ -480,56 +508,161 @@  is_already_open(struct server_config *config OVS_UNUSED,
 }
 
 static void
-close_db(struct db *db)
+close_db(struct server_config *config, struct db *db)
+{
+    if (db) {
+        ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db);
+        ovsdb_destroy(db->db);
+        free(db->filename);
+        free(db);
+    }
+}
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+parse_txn(struct server_config *config, struct db *db,
+          struct ovsdb_schema *schema, const struct json *txn_json,
+          const struct uuid *txnid)
+{
+    if (schema) {
+        /* We're replacing the schema (and the data).  Destroy the database
+         * (first grabbing its storage), then replace it with the new schema.
+         * The transaction must also include the replacement data. */
+        ovs_assert(txn_json);
+
+        ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db);
+        struct ovsdb_storage *storage = db->db->storage;
+        db->db->storage = NULL;
+        ovsdb_destroy(db->db);
+
+        db->db = ovsdb_create(schema, storage);
+        ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db);
+
+        /* Force update to schema in _Server database. */
+        db->row_uuid = UUID_ZERO;
+    }
+
+    if (txn_json) {
+        if (!db->db->schema) {
+            return ovsdb_error(NULL, "%s: data without schema", db->filename);
+        }
+
+        struct ovsdb_txn *txn;
+        struct ovsdb_error *error;
+
+        error = ovsdb_file_txn_from_json(db->db, txn_json, false, &txn);
+        if (!error) {
+            log_and_free_error(ovsdb_txn_replay_commit(txn));
+        }
+        if (!error && !uuid_is_zero(txnid)) {
+            db->db->prereq = *txnid;
+        }
+        if (error) {
+            ovsdb_storage_unread(db->db->storage);
+            return error;
+        }
+    }
+
+    return NULL;
+}
+
+/* XXX how to describe the return value for this function? */
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+read_db(struct server_config *config, struct db *db)
 {
-    ovsdb_destroy(db->db);
-    free(db->filename);
-    free(db);
+    struct ovsdb_error *error;
+    for (;;) {
+        struct ovsdb_schema *schema;
+        struct json *txn_json;
+        struct uuid txnid;
+        error = ovsdb_storage_read(db->db->storage, &schema, &txn_json,
+                                   &txnid);
+        if (error) {
+            break;
+        } else if (!schema && !txn_json) {
+            /* End of file. */
+            return NULL;
+        } else {
+            error = parse_txn(config, db, schema, txn_json, &txnid);
+            json_destroy(txn_json);
+            if (error) {
+                break;
+            }
+        }
+    }
+
+    /* Log error but otherwise ignore it.  Probably the database just
+     * got truncated due to power failure etc. and we should use its
+     * current contents. */
+    char *msg = ovsdb_error_to_string_free(error);
+    VLOG_ERR("%s", msg);
+    free(msg);
+
+    return NULL;
 }
 
 static void
-add_db(struct server_config *config, const char *name, struct db *db)
+add_db(struct server_config *config, struct db *db)
 {
     db->row_uuid = UUID_ZERO;
-    shash_add_assert(config->all_dbs, name, db);
-    bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc,
-                                                     db->db);
-    ovs_assert(ok);
+    shash_add_assert(config->all_dbs, db->db->name, db);
 }
 
-static char *
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 open_db(struct server_config *config, const char *filename)
 {
-    struct ovsdb_error *db_error;
     struct db *db;
-    char *error;
 
     /* If we know that the file is already open, return a good error message.
      * Otherwise, if the file is open, we'll fail later on with a harder to
      * interpret file locking error. */
     if (is_already_open(config, filename)) {
-        return xasprintf("%s: already open", filename);
+        return ovsdb_error(NULL, "%s: already open", filename);
+    }
+
+    struct ovsdb_storage *storage;
+    struct ovsdb_error *error;
+    error = ovsdb_storage_open(filename, true, &storage);
+    if (error) {
+        return error;
     }
 
     db = xzalloc(sizeof *db);
     db->filename = xstrdup(filename);
 
-    db_error = ovsdb_file_open(db->filename, false, &db->db, &db->file);
-    if (db_error) {
-        error = ovsdb_error_to_string(db_error);
-    } else if (db->db->schema->name[0] == '_') {
-        error = xasprintf("%s: names beginning with \"_\" are reserved",
-                          db->db->schema->name);
-    } else if (!ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db)) {
-        error = xasprintf("%s: duplicate database name", db->db->schema->name);
+    struct ovsdb_schema *schema;
+    if (ovsdb_storage_is_clustered(storage)) {
+        schema = NULL;
     } else {
-        shash_add_assert(config->all_dbs, db->db->schema->name, db);
-        return NULL;
+        struct json *txn_json;
+        error = ovsdb_storage_read(storage, &schema, &txn_json, NULL);
+        if (error) {
+            ovsdb_storage_close(storage);
+            return error;
+        }
+        ovs_assert(schema && !txn_json);
+    }
+    db->db = ovsdb_create(schema, storage);
+    ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db);
+
+    error = read_db(config, db);
+    if (error) {
+        close_db(config, db);
+        return error;
+    }
+
+    error = (db->db->name[0] == '_'
+             ? ovsdb_error(NULL, "%s: names beginning with \"_\" are reserved",
+                           db->db->name)
+             : shash_find(config->all_dbs, db->db->name)
+             ? ovsdb_error(NULL, "%s: duplicate database name", db->db->name)
+             : NULL);
+    if (error) {
+        close_db(config, db);
+        return error;
     }
 
-    ovsdb_error_destroy(db_error);
-    close_db(db);
-    return error;
+    add_db(config, db);
+    return NULL;
 }
 
 /* Add the internal _Server database to the server configuration. */
@@ -549,8 +682,10 @@  add_server_db(struct server_config *config)
 
     struct db *db = xzalloc(sizeof *db);
     db->filename = xstrdup("<internal>");
-    db->db = ovsdb_create(schema);
-    add_db(config, db->db->schema->name, db);
+    db->db = ovsdb_create(schema, ovsdb_storage_create_unbacked());
+    bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db);
+    ovs_assert(ok);
+    add_db(config, db);
 }
 
 static char * OVS_WARN_UNUSED_RESULT
@@ -905,7 +1040,7 @@  update_remote_rows(const struct shash *all_dbs, const struct db *db_,
 static void
 commit_txn(struct ovsdb_txn *txn, const char *name)
 {
-    struct ovsdb_error *error = ovsdb_txn_commit(txn, false);
+    struct ovsdb_error *error = ovsdb_txn_propose_commit_block(txn, false);
     if (error) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
         char *msg = ovsdb_error_to_string_free(error);
@@ -922,15 +1057,16 @@  update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc,
     struct shash_node *node;
     SHASH_FOR_EACH (node, all_dbs) {
         struct db *db = node->data;
-        struct ovsdb_txn *txn = ovsdb_txn_create(db->db);
+        if (!db->db || ovsdb_storage_is_clustered(db->db->storage)) {
+            continue;
+        }
 
-        /* Iterate over --remote arguments given on command line. */
+        struct ovsdb_txn *txn = ovsdb_txn_create(db->db);
         const char *remote;
         SSET_FOR_EACH (remote, remotes) {
             update_remote_rows(all_dbs, db, remote, jsonrpc, txn);
         }
-
-        commit_txn(txn, node->name);
+        commit_txn(txn, "remote status");
     }
 }
 
@@ -939,7 +1075,17 @@  update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc,
 static void
 update_database_status(struct ovsdb_row *row, struct db *db)
 {
-    ovsdb_util_write_string_column(row, "name", db->db->schema->name);
+    ovsdb_util_write_string_column(row, "name", db->db->name);
+    ovsdb_util_write_string_column(row, "model",
+                                   ovsdb_storage_get_model(db->db->storage));
+    ovsdb_util_write_bool_column(row, "connected",
+                                 ovsdb_storage_is_connected(db->db->storage));
+    ovsdb_util_write_bool_column(row, "leader",
+                                 ovsdb_storage_is_leader(db->db->storage));
+    ovsdb_util_write_uuid_column(row, "cid",
+                                 ovsdb_storage_get_cid(db->db->storage));
+    ovsdb_util_write_uuid_column(row, "sid",
+                                 ovsdb_storage_get_sid(db->db->storage));
 
     const struct uuid *row_uuid = ovsdb_row_get_uuid(row);
     if (!uuid_equals(row_uuid, &db->row_uuid)) {
@@ -948,11 +1094,14 @@  update_database_status(struct ovsdb_row *row, struct db *db)
         /* The schema can only change if the generation changes, so only update
          * it in that case.  (Schemas are often kilobytes in size and expensive
          * to serialize, so presumably it's worth optimizing.) */
-        struct json *json_schema = ovsdb_schema_to_json(db->db->schema);
-        char *schema = json_to_string(json_schema, JSSF_SORT);
+        char *schema = NULL;
+        if (db->db->schema) {
+            struct json *json_schema = ovsdb_schema_to_json(db->db->schema);
+            schema = json_to_string(json_schema, JSSF_SORT);
+            json_destroy(json_schema);
+        }
         ovsdb_util_write_string_column(row, "schema", schema);
         free(schema);
-        json_destroy(json_schema);
     }
 }
 
@@ -1228,7 +1377,6 @@  ovsdb_server_compact(struct unixctl_conn *conn, int argc,
 {
     struct shash *all_dbs = dbs_;
     struct ds reply;
-    struct db *db;
     struct shash_node *node;
     int n = 0;
 
@@ -1239,22 +1387,24 @@  ovsdb_server_compact(struct unixctl_conn *conn, int argc,
 
     ds_init(&reply);
     SHASH_FOR_EACH(node, all_dbs) {
-        db = node->data;
+        struct db *db = node->data;
         if (argc < 2
             ? node->name[0] != '_'
             : !strcmp(argv[1], node->name)) {
-            struct ovsdb_error *error;
-
-            VLOG_INFO("compacting %s database by user request", node->name);
+            if (db->db) {
+                VLOG_INFO("compacting %s database by user request",
+                          node->name);
+
+                struct ovsdb_error *error = ovsdb_snapshot(db->db);
+                if (error) {
+                    char *s = ovsdb_error_to_string(error);
+                    ds_put_format(&reply, "%s\n", s);
+                    free(s);
+                    ovsdb_error_destroy(error);
+                }
 
-            error = ovsdb_file_compact(db->file);
-            if (error) {
-                char *s = ovsdb_error_to_string_free(error);
-                ds_put_format(&reply, "%s\n", s);
-                free(s);
+                n++;
             }
-
-            n++;
         }
     }
 
@@ -1356,9 +1506,8 @@  ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED,
 {
     struct server_config *config = config_;
     const char *filename = argv[1];
-    char *error;
 
-    error = open_db(config, filename);
+    char *error = ovsdb_error_to_string_free(open_db(config, filename));
     if (!error) {
         save_config(config);
         if (*config->is_backup) {
@@ -1379,9 +1528,7 @@  remove_db(struct server_config *config, struct shash_node *node)
 {
     struct db *db = node->data;
 
-    ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db);
-
-    close_db(db);
+    close_db(config, db);
     shash_delete(config->all_dbs, node);
 
     save_config(config);
@@ -1427,7 +1574,11 @@  ovsdb_server_list_databases(struct unixctl_conn *conn, int argc OVS_UNUSED,
 
     nodes = shash_sort(all_dbs);
     for (i = 0; i < shash_count(all_dbs); i++) {
-        ds_put_format(&s, "%s\n", nodes[i]->name);
+        const struct shash_node *node = nodes[i];
+        struct db *db = node->data;
+        if (db->db) {
+            ds_put_format(&s, "%s\n", node->name);
+        }
     }
     free(nodes);
 
diff --git a/ovsdb/ovsdb-tool.1.in b/ovsdb/ovsdb-tool.1.in
index f6144e7021c0..c7caeade0dcf 100644
--- a/ovsdb/ovsdb-tool.1.in
+++ b/ovsdb/ovsdb-tool.1.in
@@ -15,6 +15,9 @@  ovsdb\-tool \- Open vSwitch database management utility
 .IP "Database Creation Commands:"
 \fBovsdb\-tool \fR[\fIoptions\fR] \fBcreate \fR[\fIdb\fR [\fIschema\fR]]
 .br
+\fBovsdb\-tool \fR[\fIoptions\fR] \fBcreate\-cluster \fIdb contents local\fR
+.br
+\fBovsdb\-tool \fR[\fB\-\-cid=\fIuuid\fR] \fBjoin\-cluster\fI db name local remote\fR...
 .IP "Version Management Commands:"
 \fBovsdb\-tool \fR[\fIoptions\fR] \fBconvert \fR[\fIdb\fR [\fIschema
 \fR[\fItarget\fR]]]
@@ -54,8 +57,11 @@  For an introduction to OVSDB and its implementation in Open vSwitch,
 see \fBovsdb\fR(7).
 .PP
 This OVSDB implementation supports standalone and active-backup
-database service models with a common on-disk format  For a
-specification of this format, see \fBovsdb\fR(5).  For more
+database service models with one on-disk format and a clustered
+database service model with a different format.  \fBovsdb\-tool\fR
+supports both formats, but some commands are appropriate for only one
+format, as documented for individual commands below.  For a
+specification of these formats, see \fBovsdb\fR(5).  For more
 information on OVSDB service models, see the \fBService Models\fR
 section in \fBovsdb\fR(7).
 .
@@ -72,6 +78,64 @@  must be the name of a file that contains an OVSDB schema in JSON
 format, as specified in the OVSDB specification.  The new database is
 initially empty.
 .
+.IP "\fBcreate\-cluster\fI db contents local"
+Use this command to initialize the first server in a high-availability
+cluster of 3 (or more) database servers, e.g. for an OVN northbound or
+southbound database in an environment that cannot tolerate a single
+point of failure.  It creates clustered database file \fIdb\fR and
+configures the server to listen on \fIlocal\fR, which must take the
+form \fIprotocol\fB:\fIip\fB:\fIport\fR, where \fIprotocol\fR is
+\fBtcp\fR or \fBssl\fR, \fIip\fR is the server's IP (either an IPv4
+address or an IPv6 address enclosed in square brackets), and
+\fIport\fR is a TCP port number.  Only one address is specified, for
+the first server in the cluster, ordinarily the one for the server
+running \fBcreate\-cluster\fR.  The address is used for communication
+within the cluster, not for communicating with OVSDB clients, and must
+not use the same port used for the OVSDB protocol.
+.IP
+The new database is initialized with \fIcontents\fR, which must name a
+file that contains either an OVSDB schema in JSON format or a
+standalone OVSDB database.  If it is a schema file, the new database
+will initially be empty, with the given schema.  If it is a database
+file, the new database will have the same schema and contents.
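+.IP
+For example, with a hypothetical file name, schema file, and address,
+the following command creates a clustered database for the OVN
+Southbound schema, listening on port 6644 at 10.0.0.1 for connections
+from the other servers in the cluster:
+.IP
+\fBovsdb\-tool create\-cluster /etc/openvswitch/ovnsb_db.db ovn\-sb.ovsschema tcp:10.0.0.1:6644\fR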
+.
+.IP "[\fB\-\-cid=\fIuuid\fR] \fBjoin\-cluster\fI db name local remote\fR..."
+Use this command to initialize each server after the first one in an
+OVSDB high-availability cluster.  It creates clustered database file
+\fIdb\fR for a database named \fIname\fR, and
+configures the server to listen on \fIlocal\fR and to initially
+connect to \fIremote\fR, which must be a server that already belongs
+to the cluster.  \fIlocal\fR and \fIremote\fR use the same
+\fIprotocol\fB:\fIip\fB:\fIport\fR syntax as \fBcreate\-cluster\fR.
+.IP
+The \fIname\fR must be the name of the schema or database passed to
+\fBcreate\-cluster\fR.  For example, the name of the OVN Southbound
+database schema is \fBOVN_Southbound\fR.  Use \fBovsdb\-tool\fR's
+\fBschema\-name\fR or \fBdb\-name\fR command to find out the name of a
+schema or database, respectively.
+.IP
+This command does not do any network access, which means that it
+cannot actually join the new server to the cluster.  Instead, the
+\fIdb\fR file that it creates prepares the server to join the cluster
+the first time that \fBovsdb\-server\fR serves it.  As part of joining
+the cluster, the new server retrieves the database schema and obtains
+the list of all cluster members.  Only after that does it become a
+full member of the cluster.
+.IP
+Optionally, more than one \fIremote\fR may be specified; for example,
+in a cluster that already contains multiple servers, one could specify
+all the existing servers.  This is beneficial if some of the existing
+servers are down while the new server joins, but it is not otherwise
+needed.
+.IP
+By default, the \fIdb\fR created by \fBjoin\-cluster\fR will join any
+clustered database named \fIname\fR that is available at a
+\fIremote\fR.  In theory, if machines go up and down and IP addresses
+change in the right way, it could join the wrong database cluster.  To
+avoid this possibility, specify \fB\-\-cid=\fIuuid\fR, where
+\fIuuid\fR is the cluster ID of the cluster to join, as printed by
+\fBovsdb\-tool db\-cid\fR.
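+.IP
+Continuing the hypothetical example from \fBcreate\-cluster\fR above,
+a second server could be prepared to join the same cluster, listening
+on 10.0.0.2 and initially contacting the first server at 10.0.0.1,
+with:
+.IP
+\fBovsdb\-tool join\-cluster /etc/openvswitch/ovnsb_db.db OVN_Southbound tcp:10.0.0.2:6644 tcp:10.0.0.1:6644\fR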
+.
 .SS "Version Management Commands"
 .so ovsdb/ovsdb-schemas.man
 .PP
@@ -102,11 +166,19 @@  example, converting a database from a schema that has a given column
 or table to one that does not will delete all data in that column or
 table.  Back up critical databases before converting them.
 .IP
+This command is for standalone and active-backup databases only.  For
+clustered databases, use \fBovsdb\-client\fR's \fBconvert\fR command
+to convert them online.
+.
 .IP "\fBneeds\-conversion\fI db schema\fR"
 Reads the schema embedded in \fIdb\fR and the JSON schema from
 \fIschema\fR and compares them.  If the schemas are the same, prints
 \fBno\fR on stdout; if they differ, prints \fByes\fR.
 .IP
+This command is for standalone and active-backup databases only.  For
+clustered databases, use \fBovsdb\-client\fR's \fBneeds-conversion\fR
+command instead.
+.
 .IP "\fBdb\-version\fI db\fR"
 .IQ "\fBschema\-version\fI schema\fR"
 Prints the version number in the schema embedded within the database
@@ -115,6 +187,10 @@  If \fIschema\fR or \fIdb\fR was created before schema versioning was
 introduced, then it will not have a version number and this command
 will print a blank line.
 .IP
+The \fBdb\-version\fR command is for standalone and active-backup
+databases only.  For clustered databases, use \fBovsdb\-client\fR's
+\fBschema\-version\fR command instead.
+.
 .IP "\fBdb\-cksum\fI db\fR"
 .IQ "\fBschema\-cksum\fI schema\fR"
 Prints the checksum in the schema embedded within the database
@@ -123,6 +199,10 @@  If \fIschema\fR or \fIdb\fR was created before schema checksums were
 introduced, then it will not have a checksum and this command
 will print a blank line.
 .IP
+The \fBdb\-cksum\fR command is for standalone and active-backup
+databases only.  For clustered databases, use \fBovsdb\-client\fR's
+\fBschema\-cksum\fR command instead.
+.
 .SS "Other Commands"
 .
 .IP "\fBcompact\fI db \fR[\fItarget\fR]"
@@ -136,8 +216,10 @@  database that grows much larger than its minimum size.
 .IP
 This command does not work if \fIdb\fR is currently being served by
 \fBovsdb\-server\fR, or if it is otherwise locked for writing by
-another process.  Instead, send the \fBovsdb\-server/compact\fR
-command to \fBovsdb\-server\fR, via \fBovs\-appctl\fR).
+another process.  This command also does not work with clustered
+databases.  Instead, in either case, send the
+\fBovsdb\-server/compact\fR command to \fBovsdb\-server\fR, e.g. via
+\fBovs\-appctl\fR.
 .
 .IP "[\fB\-\-rbac\-role=\fIrole\fR] \fBquery\fI db transaction\fR"
 Opens \fIdb\fR, executes \fItransaction\fR on it, and prints the
@@ -153,6 +235,10 @@  may specify database modifications, but these will have no effect on
 .IP
 By default, the transaction is executed using the ``superuser'' RBAC
 role.  Use \fB\-\-rbac\-role\fR to specify a different role.
+.IP
+This command does not work with clustered databases.  Instead, use
+\fBovsdb\-client\fR's \fBquery\fR command to send the query to
+\fBovsdb\-server\fR.
 .
 .IP "[\fR\-\-rbac\-role=\fIrole\fR] \fBtransact\fI db transaction\fR"
 Opens \fIdb\fR, executes \fItransaction\fR on it, prints the results,
@@ -162,8 +248,9 @@  JSON array in the format of the \fBparams\fR array for the JSON-RPC
 .IP
 This command does not work if \fIdb\fR is currently being served by
 \fBovsdb\-server\fR, or if it is otherwise locked for writing by
-another process.  Instead, use \fBovsdb\-client\fR's \fBtransact\fR
-command to send the query to \fBovsdb\-server\fR.
+another process.  This command also does not work with clustered
+databases.  Instead, in either case, use \fBovsdb\-client\fR's
+\fBtransact\fR command to send the query to \fBovsdb\-server\fR.
 .IP
 By default, the transaction is executed using the ``superuser'' RBAC
 role.  Use \fB\-\-rbac\-role\fR to specify a different role.
@@ -179,12 +266,36 @@  one or more times to the command line.  With one \fB\-m\fR,
 modified by each transaction.  With two \fB\-m\fRs, \fBshow\-log\fR
 also prints the values of the columns modified by each change to a
 record.
+.IP
+This command works with standalone and active-backup databases and
+with clustered databases, but the output formats are different.
 .
 .IP "\fBdb\-name \fR[\fIdb\fR]"
 .IQ "\fBschema\-name \fR[\fIschema\fR]"
 Prints the name of the schema embedded within the database \fIdb\fR or
 in the JSON schema \fIschema\fR on stdout.
 .
+.IP "\fBdb\-cid\fI db\fR"
+Prints the cluster ID for \fIdb\fR, which is a UUID that identifies
+the cluster.  If \fIdb\fR is a database newly created by
+\fBovsdb\-tool join\-cluster\fR that has not yet successfully joined
+its cluster, and \fB\-\-cid\fR was not specified on the
+\fBjoin\-cluster\fR command line, then this command outputs an error
+and exits with status 2, because the cluster ID is not yet known.
+This command works only with clustered databases.
+.IP
+The all-zeros UUID is not a valid cluster ID.
+.
+.IP "\fBdb\-sid\fI db\fR"
+Prints the server ID for \fIdb\fR, which is a UUID that identifies
+this server within the cluster.  This command works only with
+clustered databases.  It works
+regardless of whether \fIdb\fR has joined the cluster.
+.
+.IP "\fBdb\-local\-address db\fR"
+Prints the address used for database clustering for \fIdb\fR, in the
+same \fIprotocol\fB:\fIip\fB:\fIport\fR form used on
+\fBcreate\-cluster\fR and \fBjoin\-cluster\fR.
+.
 .SH OPTIONS
 .SS "Logging Options"
 .so lib/vlog.man
diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c
index 228bcf1dac54..77ea26076414 100644
--- a/ovsdb/ovsdb-tool.c
+++ b/ovsdb/ovsdb-tool.c
@@ -35,9 +35,13 @@ 
 #include "ovsdb.h"
 #include "ovsdb-data.h"
 #include "ovsdb-error.h"
+#include "ovsdb-parser.h"
+#include "raft.h"
 #include "socket-util.h"
+#include "storage.h"
 #include "table.h"
 #include "timeval.h"
+#include "transaction.h"
 #include "util.h"
 #include "openvswitch/vlog.h"
 
@@ -47,6 +51,9 @@  static int show_log_verbosity;
 /* --role: RBAC role to use for "transact" and "query" commands. */
 static const char *rbac_role;
 
+/* --cid: Cluster ID for "join-cluster" command. */
+static struct uuid cid;
+
 static const struct ovs_cmdl_command *get_all_commands(void);
 
 OVS_NO_RETURN static void usage(void);
@@ -62,6 +69,7 @@  main(int argc, char *argv[])
     set_program_name(argv[0]);
     parse_options(argc, argv);
     fatal_ignore_sigpipe();
+    fatal_signal_init();
     ctx.argc = argc - optind;
     ctx.argv = argv + optind;
     ovs_cmdl_run_command(&ctx, get_all_commands());
@@ -72,11 +80,13 @@  static void
 parse_options(int argc, char *argv[])
 {
     enum {
-        OPT_RBAC_ROLE = UCHAR_MAX + 1
+        OPT_RBAC_ROLE = UCHAR_MAX + 1,
+        OPT_CID
     };
     static const struct option long_options[] = {
         {"more", no_argument, NULL, 'm'},
         {"rbac-role", required_argument, NULL, OPT_RBAC_ROLE},
+        {"cid", required_argument, NULL, OPT_CID},
         {"verbose", optional_argument, NULL, 'v'},
         {"help", no_argument, NULL, 'h'},
         {"option", no_argument, NULL, 'o'},
@@ -102,6 +112,12 @@  parse_options(int argc, char *argv[])
             rbac_role = optarg;
             break;
 
+        case OPT_CID:
+            if (!uuid_from_string(&cid, optarg) || uuid_is_zero(&cid)) {
+                ovs_fatal(0, "%s: not a valid UUID", optarg);
+            }
+            break;
+
         case 'h':
             usage();
 
@@ -133,11 +149,18 @@  usage(void)
     printf("%s: Open vSwitch database management utility\n"
            "usage: %s [OPTIONS] COMMAND [ARG...]\n"
            "  create [DB [SCHEMA]]    create DB with the given SCHEMA\n"
+           "  create-cluster DB CONTENTS LOCAL\n"
+           "    create clustered DB with given CONTENTS and LOCAL address\n"
+           "  [--cid=UUID] join-cluster DB NAME LOCAL REMOTE...\n"
+           "    join clustered DB with given NAME and LOCAL and REMOTE addrs\n"
            "  compact [DB [DST]]      compact DB in-place (or to DST)\n"
            "  convert [DB [SCHEMA [DST]]]   convert DB to SCHEMA (to DST)\n"
            "  db-name [DB]            report name of schema used by DB\n"
            "  db-version [DB]         report version of schema used by DB\n"
            "  db-cksum [DB]           report checksum of schema used by DB\n"
+           "  db-cid DB               report cluster ID of clustered DB\n"
+           "  db-sid DB               report server ID of clustered DB\n"
+           "  db-local-address DB     report local address of clustered DB\n"
            "  schema-name [SCHEMA]    report SCHEMA's name\n"
            "  schema-version [SCHEMA] report SCHEMA's schema version\n"
            "  schema-cksum [SCHEMA]   report SCHEMA's checksum\n"
@@ -203,6 +226,79 @@  check_ovsdb_error(struct ovsdb_error *error)
         ovs_fatal(0, "%s", ovsdb_error_to_string(error));
     }
 }
+
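+/* Opens the standalone-format database 'filename' and returns its storage.
+ * Exits with an error if 'filename' is in the clustered format, since the
+ * command being run (ctx->argv[0]) only makes sense for standalone and
+ * active-backup database files. */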
+static struct ovsdb_storage *
+open_standalone_db(struct ovs_cmdl_context *ctx, const char *filename, bool rw)
+{
+    struct ovsdb_storage *storage;
+    check_ovsdb_error(ovsdb_storage_open(filename, rw, &storage));
+    if (ovsdb_storage_is_clustered(storage)) {
+        ovs_fatal(0, "%s: cannot apply %s to clustered database "
+                  "(use ovsdb-client against online database instead)",
+                  filename, ctx->argv[0]);
+    }
+    return storage;
+}
+
+static struct ovsdb_schema *
+read_schema_from_storage(struct ovsdb_storage *storage)
+{
+    struct json *txn_json;
+    struct ovsdb_schema *schema;
+    check_ovsdb_error(ovsdb_storage_read(storage, &schema, &txn_json, NULL));
+    if (!schema && !txn_json) {
+        ovs_fatal(0, "unexpected end of file reading schema");
+    }
+    ovs_assert(schema && !txn_json);
+
+    return schema;
+}
+
+static struct json *
+read_txn_from_storage(struct ovsdb_storage *storage)
+{
+    struct json *txn_json;
+    struct ovsdb_schema *schema;
+    check_ovsdb_error(ovsdb_storage_read(storage, &schema, &txn_json, NULL));
+    ovs_assert(!schema);
+
+    return txn_json;
+}
+
+static struct ovsdb_schema *
+read_schema(struct ovs_cmdl_context *ctx, const char *filename)
+{
+    struct ovsdb_storage *storage = open_standalone_db(ctx, filename, false);
+    struct ovsdb_schema *schema = read_schema_from_storage(storage);
+    ovsdb_storage_close(storage);
+    return schema;
+}
+
+/* The new database takes ownership of the storage. */
+static struct ovsdb *
+read_ovsdb_txns(struct ovsdb_schema *schema, struct ovsdb_storage *storage,
+                bool converting)
+{
+    struct ovsdb *ovsdb = ovsdb_create(schema, storage);
+    for (;;) {
+        struct json *txn_json = read_txn_from_storage(storage);
+        if (!txn_json) {
+            break;
+        }
+
+        struct ovsdb_txn *txn;
+        check_ovsdb_error(ovsdb_file_txn_from_json(ovsdb, txn_json, converting,
+                                                   &txn));
+        json_destroy(txn_json);
+
+        struct ovsdb_error *error = ovsdb_txn_replay_commit(txn);
+        if (error) {
+            ovsdb_storage_unread(storage);
+            break;
+        }
+    }
+    return ovsdb;
+}
 
 static void
 do_create(struct ovs_cmdl_context *ctx)
@@ -220,7 +316,7 @@  do_create(struct ovs_cmdl_context *ctx)
 
     /* Create database file. */
     check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC,
-                                     OVSDB_LOG_CREATE, -1, &log));
+                                     OVSDB_LOG_CREATE_EXCL, -1, &log));
     check_ovsdb_error(ovsdb_log_write(log, json));
     check_ovsdb_error(ovsdb_log_commit(log));
     ovsdb_log_close(log);
@@ -229,46 +325,138 @@  do_create(struct ovs_cmdl_context *ctx)
 }
 
 static void
-compact_or_convert(const char *src_name_, const char *dst_name_,
-                   const struct ovsdb_schema *new_schema,
-                   const char *comment)
+do_create_cluster(struct ovs_cmdl_context *ctx)
+{
+    const char *db_file_name = ctx->argv[1];
+    const char *src_file_name = ctx->argv[2];
+    const char *local = ctx->argv[3];
+
+    struct ovsdb_schema *schema;
+    struct json *data;
+
+    struct ovsdb_error *error = ovsdb_schema_from_file(src_file_name, &schema);
+    if (!error) {
+        /* It's just a schema file. */
+        data = json_object_create();
+    } else {
+        /* Not a schema file.  Try reading it as a standalone database. */
+        ovsdb_error_destroy(error);
+
+        struct ovsdb_storage *storage
+            = open_standalone_db(ctx, src_file_name, false);
+        schema = read_schema_from_storage(storage);
+        struct ovsdb *ovsdb = read_ovsdb_txns(schema, storage, false);
+        char *comment = xasprintf("created from %s", src_file_name);
+        data = ovsdb_to_txn_json(ovsdb, comment);
+        free(comment);
+        ovsdb_destroy(ovsdb);
+    }
+
+    struct json *schema_json = ovsdb_schema_to_json(schema);
+
+    /* Create database file. */
+    struct json *snapshot = json_array_create_2(schema_json, data);
+    check_ovsdb_error(raft_create_cluster(db_file_name, schema->name,
+                                          local, snapshot));
+    ovsdb_schema_destroy(schema);
+    json_destroy(snapshot);
+}
+
+static void
+do_join_cluster(struct ovs_cmdl_context *ctx)
+{
+    const char *db_file_name = ctx->argv[1];
+    const char *name = ctx->argv[2];
+    const char *local = ctx->argv[3];
+
+    /* Check for a plausible 'name'. */
+    if (!ovsdb_parser_is_id(name)) {
+        ovs_fatal(0, "%s: not a valid schema name (use \"schema-name\" "
+                  "command to find the correct name)", name);
+    }
+
+    /* Create database file. */
+    struct sset remote_addrs = SSET_INITIALIZER(&remote_addrs);
+    for (size_t i = 4; i < ctx->argc; i++) {
+        sset_add(&remote_addrs, ctx->argv[i]);
+    }
+    check_ovsdb_error(raft_join_cluster(db_file_name, name, local,
+                                        &remote_addrs,
+                                        uuid_is_zero(&cid) ? NULL : &cid));
+    sset_destroy(&remote_addrs);
+}
+
+static struct ovsdb_error *
+write_and_free_json(struct ovsdb_log *log, struct json *json)
+{
+    struct ovsdb_error *error = ovsdb_log_write(log, json);
+    json_destroy(json);
+    return error;
+}
+
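+/* Writes a new standalone-format database file 'file_name' containing 'db''s
+ * schema followed by a single transaction holding 'db''s entire contents,
+ * annotated with 'comment'.  On error, deletes the partially written file. */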
+static struct ovsdb_error *
+write_db(const char *file_name, const char *comment, const struct ovsdb *db)
+{
+    struct ovsdb_log *log;
+    struct ovsdb_error *error = ovsdb_log_open(file_name, OVSDB_MAGIC,
+                                               OVSDB_LOG_CREATE, false, &log);
+    if (error) {
+        return error;
+    }
+
+    error = write_and_free_json(log, ovsdb_schema_to_json(db->schema));
+    if (!error) {
+        error = write_and_free_json(log, ovsdb_to_txn_json(db, comment));
+    }
+    ovsdb_log_close(log);
+
+    if (error) {
+        remove(file_name);
+    }
+    return error;
+}
+
+static void
+compact_or_convert(struct ovs_cmdl_context *ctx,
+                   const char *src_name_, const char *dst_name_,
+                   struct ovsdb_schema *new_schema, const char *comment)
 {
-    char *src_name, *dst_name;
-    struct lockfile *src_lock;
-    struct lockfile *dst_lock;
     bool in_place = dst_name_ == NULL;
-    struct ovsdb *db;
-    int retval;
 
     /* Dereference symlinks for source and destination names.  In the in-place
      * case this ensures that, if the source name is a symlink, we replace its
      * target instead of replacing the symlink by a regular file.  In the
      * non-in-place, this has the same effect for the destination name. */
-    src_name = follow_symlinks(src_name_);
-    dst_name = (in_place
-                ? xasprintf("%s.tmp", src_name)
-                : follow_symlinks(dst_name_));
+    char *src_name = follow_symlinks(src_name_);
+    char *dst_name = (in_place
+                      ? xasprintf("%s.tmp", src_name)
+                      : follow_symlinks(dst_name_));
 
     /* Lock the source, if we will be replacing it. */
+    struct lockfile *src_lock = NULL;
     if (in_place) {
-        retval = lockfile_lock(src_name, &src_lock);
+        int retval = lockfile_lock(src_name, &src_lock);
         if (retval) {
             ovs_fatal(retval, "%s: failed to lock lockfile", src_name);
         }
     }
 
     /* Get (temporary) destination and lock it. */
-    retval = lockfile_lock(dst_name, &dst_lock);
+    struct lockfile *dst_lock = NULL;
+    int retval = lockfile_lock(dst_name, &dst_lock);
     if (retval) {
         ovs_fatal(retval, "%s: failed to lock lockfile", dst_name);
     }
 
     /* Save a copy. */
-    check_ovsdb_error(new_schema
-                      ? ovsdb_file_open_as_schema(src_name, new_schema, &db)
-                      : ovsdb_file_open(src_name, true, &db, NULL));
-    check_ovsdb_error(ovsdb_file_save_copy(dst_name, false, comment, db));
-    ovsdb_destroy(db);
+    struct ovsdb_storage *storage = open_standalone_db(ctx, src_name, false);
+    struct ovsdb_schema *old_schema = read_schema_from_storage(storage);
+    struct ovsdb_schema *schema = new_schema ? new_schema : old_schema;
+    struct ovsdb *ovsdb = read_ovsdb_txns(schema, storage, true);
+    ovsdb_storage_close(storage);
+    ovsdb->storage = NULL;
+    check_ovsdb_error(write_db(dst_name, comment, ovsdb));
+    ovsdb_destroy(ovsdb);
 
     /* Replace source. */
     if (in_place) {
@@ -295,7 +483,8 @@  do_compact(struct ovs_cmdl_context *ctx)
     const char *db = ctx->argc >= 2 ? ctx->argv[1] : default_db();
     const char *target = ctx->argc >= 3 ? ctx->argv[2] : NULL;
 
-    compact_or_convert(db, target, NULL, "compacted by ovsdb-tool "VERSION);
+    compact_or_convert(ctx, db, target, NULL,
+                       "compacted by ovsdb-tool "VERSION);
 }
 
 static void
@@ -307,9 +496,8 @@  do_convert(struct ovs_cmdl_context *ctx)
     struct ovsdb_schema *new_schema;
 
     check_ovsdb_error(ovsdb_schema_from_file(schema, &new_schema));
-    compact_or_convert(db, target, new_schema,
+    compact_or_convert(ctx, db, target, new_schema,
                        "converted by ovsdb-tool "VERSION);
-    ovsdb_schema_destroy(new_schema);
 }
 
 static void
@@ -317,9 +505,9 @@  do_needs_conversion(struct ovs_cmdl_context *ctx)
 {
     const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db();
     const char *schema_file_name = ctx->argc >= 3 ? ctx->argv[2] : default_schema();
-    struct ovsdb_schema *schema1, *schema2;
+    struct ovsdb_schema *schema1 = read_schema(ctx, db_file_name);
+    struct ovsdb_schema *schema2;
 
-    check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema1));
     check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema2));
     puts(ovsdb_schema_equal(schema1, schema2) ? "no" : "yes");
     ovsdb_schema_destroy(schema1);
@@ -330,20 +518,29 @@  static void
 do_db_name(struct ovs_cmdl_context *ctx)
 {
     const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db();
-    struct ovsdb_schema *schema;
 
-    check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema));
-    puts(schema->name);
-    ovsdb_schema_destroy(schema);
+    struct ovsdb_storage *storage;
+    check_ovsdb_error(ovsdb_storage_open(db_file_name, false, &storage));
+
+    const char *name = ovsdb_storage_get_name(storage);
+    if (name) {
+        /* Clustered databases. */
+        puts(name);
+    } else {
+        /* Standalone databases. */
+        struct ovsdb_schema *schema = read_schema_from_storage(storage);
+        puts(schema->name);
+        ovsdb_schema_destroy(schema);
+    }
+    ovsdb_storage_close(storage);
 }
 
 static void
 do_db_version(struct ovs_cmdl_context *ctx)
 {
     const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db();
-    struct ovsdb_schema *schema;
+    struct ovsdb_schema *schema = read_schema(ctx, db_file_name);
 
-    check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema));
     puts(schema->version);
     ovsdb_schema_destroy(schema);
 }
@@ -352,74 +549,111 @@  static void
 do_db_cksum(struct ovs_cmdl_context *ctx)
 {
     const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db();
-    struct ovsdb_schema *schema;
-
-    check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema));
+    struct ovsdb_schema *schema = read_schema(ctx, db_file_name);
     puts(schema->cksum);
     ovsdb_schema_destroy(schema);
 }
 
 static void
-do_schema_version(struct ovs_cmdl_context *ctx)
+do_db_cid(struct ovs_cmdl_context *ctx)
+{
+    const char *db_file_name = ctx->argv[1];
+    struct raft_metadata md;
+
+    check_ovsdb_error(raft_read_metadata(db_file_name, &md));
+    if (uuid_is_zero(&md.cid)) {
+        fprintf(stderr, "%s: cluster ID not yet known\n", db_file_name);
+        exit(2);
+    }
+    printf(UUID_FMT"\n", UUID_ARGS(&md.cid));
+    raft_metadata_destroy(&md);
+}
+
+static void
+do_db_sid(struct ovs_cmdl_context *ctx)
+{
+    const char *db_file_name = ctx->argv[1];
+    struct raft_metadata md;
+
+    check_ovsdb_error(raft_read_metadata(db_file_name, &md));
+    printf(UUID_FMT"\n", UUID_ARGS(&md.sid));
+    raft_metadata_destroy(&md);
+}
+
+static void
+do_db_local_address(struct ovs_cmdl_context *ctx)
+{
+    const char *db_file_name = ctx->argv[1];
+    struct raft_metadata md;
+
+    check_ovsdb_error(raft_read_metadata(db_file_name, &md));
+    puts(md.local);
+    raft_metadata_destroy(&md);
+}
+
+static void
+do_schema_name(struct ovs_cmdl_context *ctx)
 {
     const char *schema_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_schema();
     struct ovsdb_schema *schema;
 
     check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema));
-    puts(schema->version);
+    puts(schema->name);
     ovsdb_schema_destroy(schema);
 }
 
 static void
-do_schema_cksum(struct ovs_cmdl_context *ctx)
+do_schema_version(struct ovs_cmdl_context *ctx)
 {
     const char *schema_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_schema();
     struct ovsdb_schema *schema;
 
     check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema));
-    puts(schema->cksum);
+    puts(schema->version);
     ovsdb_schema_destroy(schema);
 }
 
 static void
-do_schema_name(struct ovs_cmdl_context *ctx)
+do_schema_cksum(struct ovs_cmdl_context *ctx)
 {
     const char *schema_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_schema();
     struct ovsdb_schema *schema;
 
     check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema));
-    puts(schema->name);
+    puts(schema->cksum);
     ovsdb_schema_destroy(schema);
 }
 
 static void
-transact(bool read_only, int argc, char *argv[])
+transact(struct ovs_cmdl_context *ctx, bool rw)
 {
-    const char *db_file_name = argc >= 3 ? argv[1] : default_db();
-    const char *transaction = argv[argc - 1];
-    struct json *request, *result;
-    struct ovsdb *db;
+    const char *db_file_name = ctx->argc >= 3 ? ctx->argv[1] : default_db();
+    const char *transaction = ctx->argv[ctx->argc - 1];
 
-    check_ovsdb_error(ovsdb_file_open(db_file_name, read_only, &db, NULL));
+    struct ovsdb_storage *storage = open_standalone_db(ctx, db_file_name, rw);
+    struct ovsdb_schema *schema = read_schema_from_storage(storage);
 
-    request = parse_json(transaction);
-    result = ovsdb_execute(db, NULL, request, false, rbac_role, NULL, 0, NULL);
+    struct ovsdb *ovsdb = read_ovsdb_txns(schema, storage, false);
+
+    struct json *request = parse_json(transaction);
+    struct json *result = ovsdb_execute(ovsdb, NULL, request, false,
+                                        rbac_role, NULL, 0, NULL);
     json_destroy(request);
 
     print_and_free_json(result);
-    ovsdb_destroy(db);
+    ovsdb_destroy(ovsdb);
 }
 
 static void
 do_query(struct ovs_cmdl_context *ctx)
 {
-    transact(true, ctx->argc, ctx->argv);
+    transact(ctx, false);
 }
 
 static void
 do_transact(struct ovs_cmdl_context *ctx)
 {
-    transact(false, ctx->argc, ctx->argv);
+    transact(ctx, true);
 }
 
 static void
@@ -535,16 +769,12 @@  print_db_changes(struct shash *tables, struct shash *names,
 }
 
 static void
-do_show_log(struct ovs_cmdl_context *ctx)
+do_show_log_standalone(struct ovsdb_log *log)
 {
-    const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db();
     struct shash names;
-    struct ovsdb_log *log;
     struct ovsdb_schema *schema;
     unsigned int i;
 
-    check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC,
-                                     OVSDB_LOG_READ_ONLY, -1, &log));
     shash_init(&names);
     schema = NULL;
     for (i = 0; ; i++) {
@@ -592,12 +822,163 @@  do_show_log(struct ovs_cmdl_context *ctx)
         putchar('\n');
     }
 
-    ovsdb_log_close(log);
     ovsdb_schema_destroy(schema);
     /* XXX free 'names'. */
 }
 
 static void
+print_member(const struct shash *object, const char *name)
+{
+    const struct json *value = shash_find_data(object, name);
+    if (!value) {
+        return;
+    }
+
+    char *s = json_to_string(value, JSSF_SORT);
+    printf("\t%s: %s\n", name, s);
+    free(s);
+}
+
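+/* Prints the string member 'name' of 'object' abbreviated to its first 4
+ * characters.  The members printed this way are UUIDs, for which a short
+ * prefix is usually enough to tell servers and entries apart. */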
+static void
+print_uuid(const struct shash *object, const char *name)
+{
+    const struct json *value = shash_find_data(object, name);
+    if (!value) {
+        return;
+    }
+
+    printf("\t%s: ", name);
+    if (value->type == JSON_STRING) {
+        printf("%.4s\n", value->u.string);
+    } else {
+        printf("***invalid*\n");
+    }
+}
+
+static void
+print_servers(const struct shash *object, const char *name)
+{
+    const struct json *value = shash_find_data(object, name);
+    if (!value) {
+        return;
+    }
+
+    printf("\t%s: ", name);
+    if (value->type != JSON_OBJECT) {
+        printf("***invalid %s***\n", name);
+        return;
+    }
+
+    const struct shash_node *node;
+    int i = 0;
+    SHASH_FOR_EACH (node, json_object(value)) {
+        if (i++ > 0) {
+            printf(", ");
+        }
+        printf("%.4s(", node->name);
+
+        const struct json *address = node->data;
+        if (address->type != JSON_STRING) {
+            printf("***invalid***");
+        } else {
+            fputs(address->u.string, stdout);
+        }
+
+        printf(")");
+    }
+    printf("\n");
+}
+
+static void
+print_data(const struct shash *object, const char *name)
+{
+    const struct json *data = shash_find_data(object, name);
+    if (!data) {
+        return;
+    }
+
+    if (data->type != JSON_ARRAY || json_array(data)->n != 2) {
+        printf("\t***invalid data***\n");
+        return;
+    }
+
+    const struct json *schema_json = json_array(data)->elems[0];
+    if (schema_json->type != JSON_NULL) {
+        struct ovsdb_schema *schema;
+
+        check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema));
+        printf("\tschema: \"%s\", version=\"%s\", cksum=\"%s\"\n",
+               schema->name, schema->version, schema->cksum);
+        ovsdb_schema_destroy(schema);
+    }
+
+    char *s = json_to_string(json_array(data)->elems[1], JSSF_SORT);
+    printf("\t%s: %s\n", name, s);
+    free(s);
+}
+
+static void
+do_show_log_cluster(struct ovsdb_log *log)
+{
+    struct shash names;
+    struct ovsdb_schema *schema;
+    unsigned int i;
+
+    shash_init(&names);
+    schema = NULL;
+    for (i = 0; ; i++) {
+        struct json *json;
+        check_ovsdb_error(ovsdb_log_read(log, &json));
+        if (!json) {
+            break;
+        }
+
+        struct shash *object = json_object(json);
+
+        printf("record %u:\n", i);
+        if (i == 0) {
+            print_member(object, "name");
+            print_member(object, "address");
+            print_uuid(object, "server_id");
+            print_uuid(object, "cluster_id");
+
+            print_servers(object, "prev_servers");
+            print_member(object, "prev_term");
+            print_member(object, "prev_index");
+            print_data(object, "prev_data");
+
+            print_member(object, "remotes");
+        } else {
+            print_member(object, "term");
+            print_member(object, "index");
+            print_data(object, "data");
+            print_servers(object, "servers");
+            print_uuid(object, "vote");
+        }
+        json_destroy(json);
+        putchar('\n');
+    }
+
+    ovsdb_schema_destroy(schema);
+    /* XXX free 'names'. */
+}
+
+static void
+do_show_log(struct ovs_cmdl_context *ctx)
+{
+    const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db();
+    struct ovsdb_log *log;
+
+    check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC"|"RAFT_MAGIC,
+                                     OVSDB_LOG_READ_ONLY, -1, &log));
+    if (!strcmp(ovsdb_log_get_magic(log), OVSDB_MAGIC)) {
+        do_show_log_standalone(log);
+    } else {
+        do_show_log_cluster(log);
+    }
+    ovsdb_log_close(log);
+}
+
+static void
 do_help(struct ovs_cmdl_context *ctx OVS_UNUSED)
 {
     usage();
@@ -611,12 +992,18 @@  do_list_commands(struct ovs_cmdl_context *ctx OVS_UNUSED)
 
 static const struct ovs_cmdl_command all_commands[] = {
     { "create", "[db [schema]]", 0, 2, do_create, OVS_RW },
+    { "create-cluster", "db contents local", 3, 3, do_create_cluster, OVS_RW },
+    { "join-cluster", "db name local remote...", 4, INT_MAX, do_join_cluster,
+      OVS_RW },
     { "compact", "[db [dst]]", 0, 2, do_compact, OVS_RW },
     { "convert", "[db [schema [dst]]]", 0, 3, do_convert, OVS_RW },
     { "needs-conversion", NULL, 0, 2, do_needs_conversion, OVS_RO },
     { "db-name", "[db]",  0, 1, do_db_name, OVS_RO },
     { "db-version", "[db]",  0, 1, do_db_version, OVS_RO },
     { "db-cksum", "[db]", 0, 1, do_db_cksum, OVS_RO },
+    { "db-cid", "db", 1, 1, do_db_cid, OVS_RO },
+    { "db-sid", "db", 1, 1, do_db_sid, OVS_RO },
+    { "db-local-address", "db", 1, 1, do_db_local_address, OVS_RO },
     { "schema-name", "[schema]", 0, 1, do_schema_name, OVS_RO },
     { "schema-version", "[schema]", 0, 1, do_schema_version, OVS_RO },
     { "schema-cksum", "[schema]", 0, 1, do_schema_cksum, OVS_RO },
diff --git a/ovsdb/ovsdb.5.xml b/ovsdb/ovsdb.5.xml
index 2f40acea03cb..77ca3035ab93 100644
--- a/ovsdb/ovsdb.5.xml
+++ b/ovsdb/ovsdb.5.xml
@@ -7,7 +7,10 @@ 
   <p>
     OVSDB, the Open vSwitch Database, is a database system whose network
     protocol is specified by RFC 7047.  The RFC does not specify an on-disk
-    storage format. This manpage documents the format used by Open vSwitch.
+    storage format.  The OVSDB implementation in Open vSwitch implements two
+    storage formats: one for standalone (and active-backup) databases, and the
+    other for clustered databases.  This manpage documents both of these
+    formats.
   </p>
 
   <p>
@@ -29,18 +32,22 @@ 
     OVSDB files do not include the values of ephemeral columns.
   </p>
 
+  <h2>Common Features</h2>
+
   <p>
-    Database files are text files encoded in UTF-8 with LF (U+000A) line ends,
-    organized as append-only series of records.  Each record consists of 2
-    lines of text.
+    Standalone and clustered database files share the common structure
+    described here.  They are text files encoded in UTF-8 with LF (U+000A) line
+    ends, organized as an append-only series of records.  Each record
+    consists of 2 lines of text.
   </p>
 
   <p>
-    The first line in each record has the format <code>OVSDB JSON
-    <var>length</var> <var>hash</var></code>, where <var>length</var> is a
-    positive decimal integer and <var>hash</var> is a SHA-1 checksum expressed
-    as 40 hexadecimal digits.  Words in the first line must be separated by
-    exactly one space.
+    The first line in each record has the format <code>OVSDB <var>magic</var>
+    <var>length</var> <var>hash</var></code>, where <var>magic</var> is
+    <code>JSON</code> for standalone databases or <code>CLUSTER</code> for
+    clustered databases, <var>length</var> is a positive decimal integer, and
+    <var>hash</var> is a SHA-1 checksum expressed as 40 hexadecimal digits.
+    Words in the first line must be separated by exactly one space.
   </p>
 
   <p>
@@ -148,4 +155,198 @@ 
     </dd>
   </dl>
 
+  <h2>Clustered Format</h2>
+
+  <p>
+    The clustered format has the following additional notation:
+  </p>
+
+  <dl>
+    <dt>&lt;uint64&gt;</dt>
+    <dd>
+      A JSON integer that represents a 64-bit unsigned integer.  The OVS JSON
+      implementation only supports integers in the range -2**63 through
+      2**63-1, so 64-bit unsigned integer values from 2**63 through 2**64-1 are
+      expressed as negative numbers (for example, 2**64-1 would appear as -1).
+    </dd>
+
+    <dt>&lt;address&gt;</dt>
+    <dd>
+      A JSON string that represents a network address to support clustering, in
+      the <code><var>protocol</var>:<var>ip</var>:<var>port</var></code> syntax
+      described in <code>ovsdb-tool</code>(1).
+    </dd>
+
+    <dt>&lt;servers&gt;</dt>
+    <dd>
+      A JSON object whose names are &lt;raw-uuid&gt;s that identify servers and
+      whose values are &lt;address&gt;es that specify those servers' addresses.
+    </dd>
+
+    <dt>&lt;cluster-txn&gt;</dt>
+    <dd>
+      <p>
+        A JSON array with two elements:
+      </p>
+
+      <ul>
+        <li>
+          The first element is either a &lt;database-schema&gt; or null.  It is
+          always present in the first of a clustered database to indicate the
+          database's initial schema.  If it is present in a later record, it
+          indicates a change of schema for the database.
+        </li>
+
+        <li>
+          The second element is either a transaction record in the format
+          described under ``Transaction Records'' above, or null.
+        </li>
+      </ul>
+
+      <p>
+        When a schema is present, the transaction record is relative to an
+        empty database.  That is, a schema change effectively resets the
+        database to empty and the transaction record represents the full
+        database contents.  This allows readers to be ignorant of the full
+        semantics of schema change.
+      </p>
+    </dd>
+  </dl>
+
+  <p>
+    The first record in a clustered database contains the following members,
+    all of which are required:
+  </p>
+
+  <dl>
+    <dt>"server_id": &lt;raw-uuid&gt;</dt>
+    <dd>
+      The server's own UUID, which must be unique within the cluster.
+    </dd>
+
+    <dt>"local_address": &lt;address&gt;</dt>
+    <dd>
+      The address on which the server listens for connections from other
+      servers in the cluster.
+    </dd>
+
+    <dt>"name": &lt;id&gt;</dt>
+    <dd>
+      The database schema name.  It is only important when a server is in the
+      process of joining a cluster: a server will only join a cluster if the
+      name matches.  (If the database schema name were unique, then we would
+      not also need a cluster ID.)
+    </dd>
+
+    <dt>"cluster_id": &lt;raw-uuid&gt;</dt>
+    <dd>
+      The cluster's UUID.  The all-zeros UUID is not a valid cluster ID.
+    </dd>
+
+    <dt>"prev_term": &lt;uint64&gt;</dt>
+    <dt>"prev_index": &lt;uint64&gt;</dt>
+    <dd>
+      The Raft term and index just before the beginning of the log.
+    </dd>
+
+    <dt>"prev_servers": &lt;servers&gt;</dt>
+    <dd>
+      The set of one or more servers in the cluster at index "prev_index" and
+      term "prev_term".  It might not include this server, if it was not the
+      initial server in the cluster.
+    </dd>
+
+    <dt>"prev_data": &lt;json-value&gt;</dt>
+    <dt>"prev_eid": &lt;raw-uuid&gt;</dt>
+    <dd>
+      A snapshot of the data in the database at index "prev_index" and term
+      "prev_term", and the entry ID for that data.  The snapshot must contain a
+      schema.
+    </dd>
+  </dl>
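+
+  <p>
+    As an illustration only (the UUIDs, addresses, and data here are
+    invented), the JSON object in such a first record might look like
+    <code>{"server_id": "&lt;uuid1&gt;", "local_address":
+    "tcp:10.0.0.1:6644", "name": "OVN_Southbound", "cluster_id":
+    "&lt;uuid2&gt;", "prev_term": 1, "prev_index": 2, "prev_servers":
+    {"&lt;uuid1&gt;": "tcp:10.0.0.1:6644"}, "prev_eid": "&lt;uuid3&gt;",
+    "prev_data": [...]}</code>, where each &lt;uuid&gt; stands for a full
+    36-character UUID.
+  </p>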
+
+  <p>
+    The second and subsequent records, if present, in a clustered database
+    represent changes to the database, to the cluster state, or both.  They
+    have the following members:
+  </p>
+
+  <dl>
+    <dt>"term": &lt;uint64&gt;</dt>
+    <dd>
+      Always present.
+    </dd>
+
+    <dt>"index": &lt;uint64&gt;</dt>
+    <dd>
+      Optional.  If this member is present, exactly one of <code>data</code> or
+      <code>servers</code> is also present, and <code>vote</code> must be
+      omitted.
+    </dd>
+
+    <dt>"data": &lt;json-value&gt;</dt>
+    <dt>"eid": &lt;raw-uuid&gt;</dt>
+    <dd>
+      Optional, but if either is present then both must be.  If these members
+      are present, <code>index</code> must also be present.
+    </dd>
+
+    <dt>"servers": &lt;servers&gt;</dt>
+    <dd>
+      Optional.  If this member is present, <code>index</code> must also be
+      present.
+    </dd>
+
+    <dt>"vote": &lt;raw-uuid&gt;</dt>
+    <dd>
+      Optional.  If this member is present, <code>index</code>,
+      <code>data</code>, <code>eid</code>, and <code>servers</code> must be
+      omitted.
+    </dd>
+  </dl>
+
+  <p>
+    When a server completes leaving its cluster (e.g. following the
+    <code>cluster/leave</code> command to <code>ovsdb-server</code>), it writes
+    a final record to its database that has the single member <code>"left":
+    true</code>.  This lets subsequent readers know that the server is not part
+    of the cluster and should not attempt to connect to it.
+  </p>
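+
+  <p>
+    That is, the final record's JSON is simply <code>{"left": true}</code>.
+  </p>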
+
+  <h3>Joining a Cluster</h3>
+
+  <p>
+    In addition to the general format for a clustered database, there is
+    also a special case for a database file created by <code>ovsdb-tool
+    join-cluster</code>.  Such a file contains exactly one record, which
+    conveys the information passed to the <code>join-cluster</code> command.
+    It has the following members:
+  </p>
+
+  <dl>
+    <dt>"server_id": &lt;raw-uuid&gt;</dt>
+    <dt>"local_address": &lt;address&gt;</dt>
+    <dt>"name": &lt;id&gt;</dt>
+    <dd>
+      These have the same semantics described above in the general description
+      of the format.
+    </dd>
+
+    <dt>"cluster_id": &lt;raw-uuid&gt;</dt>
+    <dd>
+      This is provided only if the user gave the <code>--cid</code> option to
+      <code>join-cluster</code>.  It has the same semantics described above.
+    </dd>
+
+    <dt>"remote_addresses"; [&lt;address&gt;*]</dt>
+    <dd>
+      One or more remote servers to contact for joining the cluster.
+    </dd>
+  </dl>
+
+  <p>
+    When the server successfully joins the cluster, the database file is
+    replaced by one in the general format described earlier.
+  </p>
+
 </manpage>
diff --git a/ovsdb/ovsdb.7.xml b/ovsdb/ovsdb.7.xml
index 8169120c88f2..8f0699641ee9 100644
--- a/ovsdb/ovsdb.7.xml
+++ b/ovsdb/ovsdb.7.xml
@@ -124,9 +124,14 @@ 
   <h1>Service Models</h1>
 
   <p>
-    OVSDB supports two service models for databases: <dfn>standalone</dfn>, and
-    <dfn>active-backup</dfn>.  The service models provide different compromises
-    among consistency and availability.
+    OVSDB supports three service models for databases: <dfn>standalone</dfn>,
+    <dfn>active-backup</dfn>, and <dfn>clustered</dfn>.  The service models
+    provide different compromises among consistency, availability, and
+    partition tolerance.  They also differ in the number of servers required
+    and in performance.  The standalone and active-backup database
+    service models share one on-disk format, and clustered databases use a
+    different format, but the OVSDB programs work with both formats.
+    <code>ovsdb</code>(5) documents these file formats.
   </p>
 
   <p>
@@ -212,13 +217,95 @@ 
   <p>
     Compared to a standalone server, the active-backup service model
     somewhat increases availability, at a risk of split-brain.  It adds
-    generally insignificant performance overhead.
+    generally insignificant performance overhead.  On the other hand, the
+    clustered service model, discussed below, requires at least 3 servers
+    and has greater performance overhead, but it avoids the need for
+    external management software and eliminates the possibility of
+    split-brain.
   </p>
 
   <p>
     Open vSwitch 2.6 introduced support for the active-backup service model.
   </p>
 
+  <h3>Clustered Database Service Model</h3>
+
+  <p>
+    A <dfn>clustered</dfn> database runs across 3 or 5 database servers (the
+    <dfn>cluster</dfn>) on different hosts.  Servers in a cluster automatically
+    synchronize writes within the cluster.  A 3-server cluster can remain
+    available in the face of at most 1 server failure; a 5-server cluster
+    tolerates up to 2 failures.
+  </p>
+
+  <p>
+    Clusters larger than 5 servers will also work, with every 2 added
+    servers allowing the cluster to tolerate 1 more failure, but
+    performance decreases, especially write performance.  The number of
+    servers should be odd: a 4- or 6-server cluster cannot tolerate more
+    failures than a 3- or 5-server cluster, respectively.
+  </p>
+
+  <p>
+    To set up a clustered database, first initialize it on a single node by
+    running <code>ovsdb-tool create-cluster</code>, then start
+    <code>ovsdb-server</code>.  Depending on its arguments, the
+    <code>create-cluster</code> command can create an empty database or copy a
+    standalone database's contents into the new database.
+  </p>
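+
+  <p>
+    For example, assuming the argument order documented in
+    <code>ovsdb-tool</code>(1), initializing a cluster from an existing
+    standalone database might look like <code>ovsdb-tool create-cluster
+    sb.db old-sb.db tcp:10.0.0.1:6644</code>, naming the new clustered
+    database file, the standalone database (or schema) to copy, and the
+    local address to use for the cluster management protocol.
+  </p>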
+
+  <p>
+    To add a server to a cluster, run <code>ovsdb-tool join-cluster</code> on
+    the new server and start <code>ovsdb-server</code>.  To remove a running
+    server from a cluster, use <code>ovs-appctl</code> to invoke the
+    <code>cluster/leave</code> command.  When a server fails and cannot be
+    recovered, e.g. because its hard disk crashed, or to otherwise remove a
+    server that is down from a cluster, use <code>ovs-appctl</code> to invoke
+    <code>cluster/kick</code> to make the remaining servers kick it out of the
+    cluster.
+  </p>
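+
+  <p>
+    Again assuming the syntax documented in <code>ovsdb-tool</code>(1),
+    adding a second server might look like <code>ovsdb-tool join-cluster
+    sb.db OVN_Southbound tcp:10.0.0.2:6644 tcp:10.0.0.1:6644</code>, naming
+    the new server's database file, the schema name, its local address, and
+    one or more remote addresses of servers already in the cluster; the
+    server could later be removed with an invocation along the lines of
+    <code>ovs-appctl cluster/leave OVN_Southbound</code> (see
+    <code>ovsdb-server</code>(1) for the exact arguments).
+  </p>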
+
+  <p>
+    The above methods for adding and removing servers only work for healthy
+    clusters, that is, for clusters with no more failures than their maximum
+    tolerance.  For example, in a 3-server cluster, the failure of 2 servers
+    prevents servers joining or leaving the cluster (as well as database
+    access).  To prevent data loss or inconsistency, the preferred solution to
+    this problem is to bring up enough of the failed servers to make the
+    cluster healthy again, then if necessary remove any remaining failed
+    servers and add new ones.  If this cannot be done, though, use
+    <code>ovs-appctl</code> to invoke <code>cluster/leave --force</code> on a
+    running server.  This command forces the server to which it is directed to
+    leave its cluster and form a new single-node cluster that contains only
+    itself.  The data in the new cluster may be inconsistent with the former
+    cluster: transactions not yet replicated to the server will be lost, and
+    transactions not yet applied to the cluster may be committed.
+    Afterward, any servers in its former cluster will regard the server as
+    having failed.
+  </p>
+
+  <p>
+    The servers in a cluster synchronize data over a cluster management
+    protocol that is specific to Open vSwitch; it is not the same as the OVSDB
+    protocol specified in RFC 7047.  For this purpose, a server in a cluster is
+    tied to a particular IP address and TCP port, which is specified in the
+    <code>ovsdb-tool</code> command that creates or joins the cluster.  The TCP
+    port used for clustering must be different from that used for OVSDB
+    clients.  To change the port or address of a server in a cluster, first
+    remove it from the cluster, then add it back with the new address.
+  </p>
+
+  <p>
+    To upgrade the <code>ovsdb-server</code> processes in a cluster from one
+    version of Open vSwitch to another, upgrade them one at a time; this
+    keeps the cluster healthy during the upgrade process.  (This is
+    different from upgrading a database schema, which is covered later under
+    <ref section="Upgrading or Downgrading a Database"/>.)
+  </p>
+
+  <p>
+    Open vSwitch 2.9 introduced support for the clustered service model.
+  </p>
+
   <h2>Database Replication</h2>
 
   <p>
@@ -299,6 +386,25 @@ 
         domain socket.
       </p>
     </dd>
+
+    <dt><var>method1</var><code>, </code><var>method2</var><code>, </code>...<code>, </code><var>methodN</var></dt>
+    <dd>
+      <p>
+        For a clustered database service to be highly available, a client must
+        be able to connect to any of the servers in the cluster.  To do so,
+        specify connection methods for each of the servers separated by commas
+        (and optional spaces).
+      </p>
+
+      <p>
+        In theory, if machines go up and down and IP addresses change in the
+        right way, a client could talk to the wrong instance of a database.
+        To avoid this possibility, add <code>cid:<var>uuid</var></code> to
+        the list of connection methods, where <var>uuid</var> is the cluster
+        ID of the desired database cluster, as printed by <code>ovsdb-tool
+        get-cid</code>.  This feature is optional.  (An example follows this
+        list.)
+      </p>
+    </dd>
   </dl>
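+
+  <p>
+    For example (with invented addresses and cluster ID), a client might be
+    pointed at a 3-server cluster with the connection methods
+    <code>tcp:10.0.0.1:6641, tcp:10.0.0.2:6641, tcp:10.0.0.3:6641</code> or,
+    to pin it to a particular cluster, <code>cid:&lt;uuid&gt;,
+    tcp:10.0.0.1:6641, tcp:10.0.0.2:6641, tcp:10.0.0.3:6641</code>, where
+    &lt;uuid&gt; is the cluster ID printed by <code>ovsdb-tool
+    get-cid</code>.
+  </p>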
 
   <p>
@@ -400,20 +506,23 @@ 
     For the standalone and active-backup service models, making a copy of the
     database file, e.g. using <code>cp</code>, effectively makes a snapshot,
     and because OVSDB database files are append-only, it works even if the
-    database is being modified when the snapshot takes place.
+    database is being modified when the snapshot takes place.  This approach
+    does not work for clustered databases.
   </p>
 
   <p>
-    Another way to make a backup is to use <code>ovsdb-client backup</code>,
-    which connects to a running database server and outputs an atomic snapshot
-    of its schema and content, in the same format used for on-disk databases.
+    Another way to make a backup, which works with all OVSDB service models, is
+    to use <code>ovsdb-client backup</code>, which connects to a running
+    database server and outputs an atomic snapshot of its schema and content,
+    in the same format used for standalone and active-backup databases.
   </p>
 
   <p>
     Multiple options are also available when the time comes to restore a
-    database from a backup.  One option is to stop the database server or
-    servers, overwrite the database file with the backup (e.g. with
-    <code>cp</code>), and then restart the servers.  Another way is to use
+    database from a backup.  For the standalone and active-backup service
+    models, one option is to stop the database server or servers, overwrite the
+    database file with the backup (e.g. with <code>cp</code>), and then restart
+    the servers.  Another way, which works with any service model, is to use
     <code>ovsdb-client restore</code>, which connects to a running database
     server and replaces the data in one of its databases by a provided
     snapshot.  Using <code>ovsdb-client restore</code> has the disadvantage
@@ -429,6 +538,21 @@ 
     when a database server restarts.
   </p>
 
+  <p>
+    Clustering and backup serve different purposes.  Clustering increases
+    availability, but it does not protect against data loss if, for example, a
+    malicious or malfunctioning OVSDB client deletes or tampers with data.
+  </p>
+
+  <h2>Changing Database Service Model</h2>
+
+  <p>
+    Use <code>ovsdb-tool create-cluster</code> to create a clustered database
+    from the contents of a standalone database.  Use <code>ovsdb-tool
+    backup</code> to create a standalone database from the contents of a
+    clustered database.
+  </p>
+
   <h2>Upgrading or Downgrading a Database</h2>
 
   <p>
@@ -520,8 +644,8 @@ 
     process).  To compact any database that is currently being served by
     <code>ovsdb-server</code>, use <code>ovs-appctl</code> to send the
     <code>ovsdb-server/compact</code> command.  Each server in an active-backup
-    database maintains its database file independently, so to compact all of
-    them, issue this command separately on each server.
+    or clustered database maintains its database file independently, so to
+    compact all of them, issue this command separately on each server.
   </p>
 
   <h3>Viewing History</h3>
@@ -540,9 +664,9 @@ 
 
   <p>
     The <code>show-log</code> command works with both OVSDB file formats, but
-    the details of the output format differ.  For active-backup databases, the
-    sequence of transactions in each server's log will differ, even at points
-    when they reflect the same data.
+    the details of the output format differ.  For active-backup and clustered
+    databases, the sequence of transactions in each server's log will differ,
+    even at points when they reflect the same data.
   </p>
 
   <h3>Truncating History</h3>
@@ -578,7 +702,8 @@ 
   <p>
     The first record in a standalone or active-backup database file specifies
     the schema.  <code>ovsdb-server</code> will refuse to work with such a
-    database.  Delete and recreate such a database, or restore it from a
+    database, or with a clustered database file with corruption in the first
+    few records.  Delete and recreate such a database, or restore it from a
     backup.
   </p>
 
diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c
index 89f530bcccfb..dd8b9d1499ae 100644
--- a/ovsdb/ovsdb.c
+++ b/ovsdb/ovsdb.c
@@ -1,4 +1,4 @@ 
-/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,10 +25,14 @@ 
 #include "ovsdb-parser.h"
 #include "ovsdb-types.h"
 #include "simap.h"
+#include "storage.h"
 #include "table.h"
 #include "transaction.h"
 #include "trigger.h"
 
+#include "openvswitch/vlog.h"
+VLOG_DEFINE_THIS_MODULE(ovsdb);
+
 struct ovsdb_schema *
 ovsdb_schema_create(const char *name, const char *version, const char *cksum)
 {
@@ -323,35 +327,41 @@  ovsdb_set_ref_table(const struct shash *tables,
     }
 }
 
+/* XXX add prereq parameter? */
 struct ovsdb *
-ovsdb_create(struct ovsdb_schema *schema)
+ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage)
 {
     struct shash_node *node;
     struct ovsdb *db;
 
-    db = xmalloc(sizeof *db);
+    db = xzalloc(sizeof *db);
+    db->name = xstrdup(schema
+                       ? schema->name
+                       : ovsdb_storage_get_name(storage));
     db->schema = schema;
-    db->file = NULL;
+    db->storage = storage;
     ovs_list_init(&db->monitors);
     ovs_list_init(&db->triggers);
     db->run_triggers = false;
 
     shash_init(&db->tables);
-    SHASH_FOR_EACH (node, &schema->tables) {
-        struct ovsdb_table_schema *ts = node->data;
-        shash_add(&db->tables, node->name, ovsdb_table_create(ts));
-    }
+    if (schema) {
+        SHASH_FOR_EACH (node, &schema->tables) {
+            struct ovsdb_table_schema *ts = node->data;
+            shash_add(&db->tables, node->name, ovsdb_table_create(ts));
+        }
 
-    /* Set all the refTables. */
-    SHASH_FOR_EACH (node, &schema->tables) {
-        struct ovsdb_table_schema *table = node->data;
-        struct shash_node *node2;
+        /* Set all the refTables. */
+        SHASH_FOR_EACH (node, &schema->tables) {
+            struct ovsdb_table_schema *table = node->data;
+            struct shash_node *node2;
 
-        SHASH_FOR_EACH (node2, &table->columns) {
-            struct ovsdb_column *column = node2->data;
+            SHASH_FOR_EACH (node2, &table->columns) {
+                struct ovsdb_column *column = node2->data;
 
-            ovsdb_set_ref_table(&db->tables, &column->type.key);
-            ovsdb_set_ref_table(&db->tables, &column->type.value);
+                ovsdb_set_ref_table(&db->tables, &column->type.key);
+                ovsdb_set_ref_table(&db->tables, &column->type.value);
+            }
         }
     }
 
@@ -362,38 +372,13 @@  ovsdb_create(struct ovsdb_schema *schema)
 }
 
 void
-ovsdb_replace(struct ovsdb *dst, struct ovsdb *src)
-{
-    /* Cancel monitors. */
-    ovsdb_monitor_prereplace_db(dst);
-
-    /* Cancel triggers. */
-    struct ovsdb_trigger *trigger, *next;
-    LIST_FOR_EACH_SAFE (trigger, next, node, &dst->triggers) {
-        ovsdb_trigger_prereplace_db(trigger);
-    }
-
-    struct ovsdb_schema *tmp_schema = dst->schema;
-    dst->schema = src->schema;
-    src->schema = tmp_schema;
-
-    shash_swap(&dst->tables, &src->tables);
-
-    dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role");
-
-    ovsdb_destroy(src);
-}
-
-void
 ovsdb_destroy(struct ovsdb *db)
 {
     if (db) {
         struct shash_node *node;
 
         /* Close the log. */
-        if (db->file) {
-            ovsdb_file_destroy(db->file);
-        }
+        ovsdb_storage_close(db->storage);
 
         /* Remove all the monitors. */
         ovsdb_monitors_remove(db);
@@ -408,9 +393,11 @@  ovsdb_destroy(struct ovsdb *db)
         /* The schemas, but not the table that points to them, were deleted in
          * the previous step, so we need to clear out the table.  We can't
          * destroy the table, because ovsdb_schema_destroy() will do that. */
-        shash_clear(&db->schema->tables);
+        if (db->schema) {
+            shash_clear(&db->schema->tables);
+            ovsdb_schema_destroy(db->schema);
+        }
 
-        ovsdb_schema_destroy(db->schema);
         free(db);
     }
 }
@@ -420,6 +407,10 @@  ovsdb_destroy(struct ovsdb *db)
 void
 ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage)
 {
+    if (!db->schema) {
+        return;
+    }
+
     const struct shash_node *node;
     unsigned int cells = 0;
 
@@ -439,3 +430,42 @@  ovsdb_get_table(const struct ovsdb *db, const char *name)
 {
     return shash_find_data(&db->tables, name);
 }
+
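+/* Stores a snapshot of 'db''s current schema and contents in its storage
+ * (see ovsdb_storage_store_snapshot()), e.g. when compacting the database
+ * online.  A no-op for a database without storage.  Returns NULL on
+ * success, otherwise an error. */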
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_snapshot(struct ovsdb *db)
+{
+    if (!db->storage) {
+        return NULL;
+    }
+
+    struct json *schema = ovsdb_schema_to_json(db->schema);
+    struct json *data = ovsdb_to_txn_json(db, "compacting database online");
+    struct ovsdb_error *error = ovsdb_storage_store_snapshot(db->storage,
+                                                             schema, data);
+    json_destroy(schema);
+    json_destroy(data);
+    return error;
+}
+
+void
+ovsdb_replace(struct ovsdb *dst, struct ovsdb *src)
+{
+    /* Cancel monitors. */
+    ovsdb_monitor_prereplace_db(dst);
+
+    /* Cancel triggers. */
+    struct ovsdb_trigger *trigger, *next;
+    LIST_FOR_EACH_SAFE (trigger, next, node, &dst->triggers) {
+        ovsdb_trigger_prereplace_db(trigger);
+    }
+
+    struct ovsdb_schema *tmp_schema = dst->schema;
+    dst->schema = src->schema;
+    src->schema = tmp_schema;
+
+    shash_swap(&dst->tables, &src->tables);
+
+    dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role");
+
+    ovsdb_destroy(src);
+}
diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h
index c3e8f2091e35..1e005a2c09ba 100644
--- a/ovsdb/ovsdb.h
+++ b/ovsdb/ovsdb.h
@@ -1,4 +1,4 @@ 
-/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,13 +20,13 @@ 
 #include "openvswitch/hmap.h"
 #include "openvswitch/list.h"
 #include "openvswitch/shash.h"
+#include "openvswitch/uuid.h"
 
 struct json;
 struct ovsdb_log;
 struct ovsdb_session;
 struct ovsdb_txn;
 struct simap;
-struct uuid;
 
 /* Database schema. */
 struct ovsdb_schema {
@@ -54,9 +54,16 @@  bool ovsdb_schema_equal(const struct ovsdb_schema *,
                         const struct ovsdb_schema *);
 
 /* Database. */
+enum ovsdb_state {
+    OVSDB_LOADING,
+    OVSDB_RUNNING
+};
+
 struct ovsdb {
+    char *name;
     struct ovsdb_schema *schema;
-    struct ovsdb_file *file;    /* If nonnull, log for transactions. */
+    struct ovsdb_storage *storage; /* If nonnull, log for transactions. */
+    struct uuid prereq;
     struct ovs_list monitors;   /* Contains "struct ovsdb_monitor"s. */
     struct shash tables;        /* Contains "struct ovsdb_table *"s. */
 
@@ -67,18 +74,27 @@  struct ovsdb {
     struct ovsdb_table *rbac_role;
 };
 
-struct ovsdb *ovsdb_create(struct ovsdb_schema *);
-void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src);
+struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *);
 void ovsdb_destroy(struct ovsdb *);
 
 void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage);
 
 struct ovsdb_table *ovsdb_get_table(const struct ovsdb *, const char *);
 
+struct ovsdb_txn *ovsdb_execute_compose(
+    struct ovsdb *, const struct ovsdb_session *, const struct json *params,
+    bool read_only, const char *role, const char *id,
+    long long int elapsed_msec, long long int *timeout_msec,
+    bool *durable, struct json **);
+
 struct json *ovsdb_execute(struct ovsdb *, const struct ovsdb_session *,
                            const struct json *params, bool read_only,
                            const char *role, const char *id,
                            long long int elapsed_msec,
                            long long int *timeout_msec);
 
+struct ovsdb_error *ovsdb_snapshot(struct ovsdb *) OVS_WARN_UNUSED_RESULT;
+
+void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src);
+
 #endif /* ovsdb/ovsdb.h */
diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c
new file mode 100644
index 000000000000..3eb4d54c9252
--- /dev/null
+++ b/ovsdb/raft-private.c
@@ -0,0 +1,358 @@ 
+/*
+ * Copyright (c) 2014, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "raft-private.h"
+
+#include "openvswitch/dynamic-string.h"
+#include "ovsdb-error.h"
+#include "ovsdb-parser.h"
+#include "socket-util.h"
+#include "sset.h"
+
+/* Addresses of Raft servers. */
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_address_validate(const char *address)
+{
+    if (!strncmp(address, "unix:", 5)) {
+        return NULL;
+    } else if (!strncmp(address, "ssl:", 4) || !strncmp(address, "tcp:", 4)) {
+        struct sockaddr_storage ss;
+        if (!inet_parse_active(address + 4, 0, &ss)) {
+            return ovsdb_error(NULL, "%s: syntax error in address", address);
+        }
+        return NULL;
+    } else {
+        return ovsdb_error(NULL, "%s: expected \"tcp\" or \"ssl\" address",
+                           address);
+    }
+}
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_address_validate_json(const struct json *address)
+{
+    if (address->type != JSON_STRING) {
+        return ovsdb_syntax_error(address, NULL,
+                                  "server address is not string");
+    }
+    return raft_address_validate(json_string(address));
+}
+
+/* Sets of Raft server addresses. */
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_addresses_from_json(const struct json *json, struct sset *addresses)
+{
+    sset_init(addresses);
+
+    const struct json_array *array = json_array(json);
+    if (!array->n) {
+        return ovsdb_syntax_error(json, NULL,
+                                  "at least one remote address is required");
+    }
+    for (size_t i = 0; i < array->n; i++) {
+        const struct json *address = array->elems[i];
+        struct ovsdb_error *error = raft_address_validate_json(address);
+        if (error) {
+            return error;
+        }
+        sset_add(addresses, json_string(address));
+    }
+    return NULL;
+}
+
+struct json *
+raft_addresses_to_json(const struct sset *sset)
+{
+    struct json *array;
+    const char *s;
+
+    array = json_array_create_empty();
+    SSET_FOR_EACH (s, sset) {
+        json_array_add(array, json_string_create(s));
+    }
+    return array;
+}
+
+/* raft_server. */
+
+const char *
+raft_server_phase_to_string(enum raft_server_phase phase)
+{
+    switch (phase) {
+    case RAFT_PHASE_STABLE: return "stable";
+    case RAFT_PHASE_CATCHUP: return "adding: catchup";
+    case RAFT_PHASE_CAUGHT_UP: return "adding: caught up";
+    case RAFT_PHASE_COMMITTING: return "adding: committing";
+    case RAFT_PHASE_REMOVE: return "removing";
+    default: return "<error>";
+    }
+}
+
+void
+raft_server_destroy(struct raft_server *s)
+{
+    if (s) {
+        free(s->address);
+        free(s);
+    }
+}
+
+void
+raft_servers_destroy(struct hmap *servers)
+{
+    struct raft_server *s, *next;
+    HMAP_FOR_EACH_SAFE (s, next, hmap_node, servers) {
+        hmap_remove(servers, &s->hmap_node);
+        raft_server_destroy(s);
+    }
+    hmap_destroy(servers);
+}
+
+struct raft_server *
+raft_server_add(struct hmap *servers, const struct uuid *sid,
+                const char *address)
+{
+    struct raft_server *s = xzalloc(sizeof *s);
+    s->sid = *sid;
+    s->address = xstrdup(address);
+    hmap_insert(servers, &s->hmap_node, uuid_hash(sid));
+    return s;
+}
+
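+/* Helper for raft_servers_from_json() and raft_servers_validate_json().  On
+ * error, any servers already added to 'servers' are left in place for the
+ * caller to destroy. */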
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_servers_from_json__(const struct json *json, struct hmap *servers)
+{
+    if (!json || json->type != JSON_OBJECT) {
+        return ovsdb_syntax_error(json, NULL, "servers must be JSON object");
+    } else if (shash_is_empty(json_object(json))) {
+        return ovsdb_syntax_error(json, NULL, "must have at least one server");
+    }
+
+    /* Parse new servers. */
+    struct shash_node *node;
+    SHASH_FOR_EACH (node, json_object(json)) {
+        /* Parse server UUID. */
+        struct uuid sid;
+        if (!uuid_from_string(&sid, node->name)) {
+            return ovsdb_syntax_error(json, NULL, "%s is a not a UUID",
+                                      node->name);
+        }
+
+        const struct json *address = node->data;
+        struct ovsdb_error *error = raft_address_validate_json(address);
+        if (error) {
+            return error;
+        }
+
+        raft_server_add(servers, &sid, json_string(address));
+    }
+
+    return NULL;
+}
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_servers_from_json(const struct json *json, struct hmap *servers)
+{
+    hmap_init(servers);
+    struct ovsdb_error *error = raft_servers_from_json__(json, servers);
+    if (error) {
+        raft_servers_destroy(servers);
+    }
+    return error;
+}
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_servers_validate_json(const struct json *json)
+{
+    struct hmap servers = HMAP_INITIALIZER(&servers);
+    struct ovsdb_error *error = raft_servers_from_json__(json, &servers);
+    raft_servers_destroy(&servers);
+    return error;
+}
+
+struct json *
+raft_servers_to_json(const struct hmap *servers)
+{
+    struct json *json = json_object_create();
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, servers) {
+        char sid_s[UUID_LEN + 1];
+        sprintf(sid_s, UUID_FMT, UUID_ARGS(&s->sid));
+        json_object_put_string(json, sid_s, s->address);
+    }
+    return json;
+}
+
+void
+raft_servers_format(const struct hmap *servers, struct ds *ds)
+{
+    int i = 0;
+    const struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, servers) {
+        if (i++) {
+            ds_put_cstr(ds, ", ");
+        }
+        ds_put_format(ds, SID_FMT"(%s)", SID_ARGS(&s->sid), s->address);
+    }
+}
+
+/* Raft log entries. */
+
+void
+raft_entry_destroy(struct raft_entry *e)
+{
+    if (e) {
+        json_destroy(e->data);
+        json_destroy(e->servers);
+    }
+}
+
+struct json *
+raft_entry_to_json(const struct raft_entry *e)
+{
+    struct json *json = json_object_create();
+    raft_put_uint64(json, "term", e->term);
+    if (e->data) {
+        json_object_put(json, "data", json_clone(e->data));
+        json_object_put_format(json, "eid", UUID_FMT, UUID_ARGS(&e->eid));
+    }
+    if (e->servers) {
+        json_object_put(json, "servers", json_clone(e->servers));
+    }
+    return json;
+}
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_entry_from_json(struct json *json, struct raft_entry *e)
+{
+    memset(e, 0, sizeof *e);
+
+    struct ovsdb_parser p;
+    ovsdb_parser_init(&p, json, "raft log entry");
+    e->term = raft_parse_uint64(&p, "term");
+    e->data = json_nullable_clone(
+        ovsdb_parser_member(&p, "data", OP_OBJECT | OP_ARRAY | OP_OPTIONAL));
+    e->eid = e->data ? raft_parse_required_uuid(&p, "eid") : UUID_ZERO;
+    e->servers = json_nullable_clone(
+        ovsdb_parser_member(&p, "servers", OP_OBJECT | OP_OPTIONAL));
+    if (e->servers) {
+        ovsdb_parser_put_error(&p, raft_servers_validate_json(e->servers));
+    }
+
+    struct ovsdb_error *error = ovsdb_parser_finish(&p);
+    if (error) {
+        raft_entry_destroy(e);
+    }
+    return error;
+}
+
+/* Puts 'integer' into JSON 'object' with the given 'name'.
+ *
+ * The OVS JSON implementation only supports integers in the range
+ * INT64_MIN...INT64_MAX, which causes trouble for values from INT64_MAX+1 to
+ * UINT64_MAX.  We map those into the negative range. */
+void
+raft_put_uint64(struct json *object, const char *name, uint64_t integer)
+{
+    json_object_put(object, name, json_integer_create(integer));
+}
+
+/* Parses an integer from parser 'p' with the given 'name'.
+ *
+ * The OVS JSON implementation only supports integers in the range
+ * INT64_MIN...INT64_MAX, which causes trouble for values from INT64_MAX+1 to
+ * UINT64_MAX.  We map the negative range back into positive numbers. */
+uint64_t
+raft_parse_uint64(struct ovsdb_parser *p, const char *name)
+{
+    const struct json *json = ovsdb_parser_member(p, name, OP_INTEGER);
+    return json ? json_integer(json) : 0;
+}
+
+static int
+raft_parse_boolean__(struct ovsdb_parser *p, const char *name, bool optional)
+{
+    enum ovsdb_parser_types types = OP_BOOLEAN | (optional ? OP_OPTIONAL : 0);
+    const struct json *json = ovsdb_parser_member(p, name, types);
+    return json ? json_boolean(json) : -1;
+}
+
+bool
+raft_parse_required_boolean(struct ovsdb_parser *p, const char *name)
+{
+    return raft_parse_boolean__(p, name, false);
+}
+
+/* Returns true or false if present, -1 if absent. */
+int
+raft_parse_optional_boolean(struct ovsdb_parser *p, const char *name)
+{
+    return raft_parse_boolean__(p, name, true);
+}
+
+static const char *
+raft_parse_string__(struct ovsdb_parser *p, const char *name, bool optional)
+{
+    enum ovsdb_parser_types types = OP_STRING | (optional ? OP_OPTIONAL : 0);
+    const struct json *json = ovsdb_parser_member(p, name, types);
+    return json ? json_string(json) : NULL;
+}
+
+const char *
+raft_parse_required_string(struct ovsdb_parser *p, const char *name)
+{
+    return raft_parse_string__(p, name, false);
+}
+
+const char *
+raft_parse_optional_string(struct ovsdb_parser *p, const char *name)
+{
+    return raft_parse_string__(p, name, true);
+}
+
+bool
+raft_parse_uuid__(struct ovsdb_parser *p, const char *name, bool optional,
+                  struct uuid *uuid)
+{
+    const char *s = raft_parse_string__(p, name, optional);
+    if (s) {
+        if (uuid_from_string(uuid, s)) {
+            return true;
+        }
+        ovsdb_parser_raise_error(p, "%s is not a valid UUID", name);
+    }
+    *uuid = UUID_ZERO;
+    return false;
+}
+
+struct uuid
+raft_parse_required_uuid(struct ovsdb_parser *p, const char *name)
+{
+    struct uuid uuid;
+    raft_parse_uuid__(p, name, false, &uuid);
+    return uuid;
+}
+
+bool
+raft_parse_optional_uuid(struct ovsdb_parser *p, const char *name,
+                         struct uuid *uuid)
+{
+    return raft_parse_uuid__(p, name, true, uuid);
+}
+
diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h
new file mode 100644
index 000000000000..81cf6d224a6b
--- /dev/null
+++ b/ovsdb/raft-private.h
@@ -0,0 +1,123 @@ 
+/*
+ * Copyright (c) 2014, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RAFT_PRIVATE_H
+#define RAFT_PRIVATE_H 1
+
+/* Data structures for use internally within the Raft implementation. */
+
+#include "raft.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/uuid.h"
+#include <stdint.h>
+
+struct ds;
+struct ovsdb_parser;
+struct sset;
+
+/* Formatting server IDs and cluster IDs for use in human-readable logs.  Do
+ * not use these in cases where the whole server or cluster ID is needed; use
+ * UUID_FMT and UUID_ARGS in that case. */
+
+#define SID_FMT "%04x"
+#define SID_ARGS(SID) uuid_prefix(SID, 4)
+
+#define CID_FMT "%04x"
+#define CID_ARGS(CID) uuid_prefix(CID, 4)
+
+struct ovsdb_error *raft_address_validate(const char *address)
+    OVS_WARN_UNUSED_RESULT;
+struct ovsdb_error *raft_address_validate_json(const struct json *address)
+    OVS_WARN_UNUSED_RESULT;
+
+struct ovsdb_error *raft_addresses_from_json(const struct json *,
+                                             struct sset *addresses)
+    OVS_WARN_UNUSED_RESULT;
+struct json *raft_addresses_to_json(const struct sset *addresses);
+
+enum raft_server_phase {
+    RAFT_PHASE_STABLE,          /* Not being changed. */
+
+    /* Phases for servers being added. */
+    RAFT_PHASE_CATCHUP,         /* Populating new server's log. */
+    RAFT_PHASE_CAUGHT_UP,       /* Waiting for prev configuration to commit. */
+    RAFT_PHASE_COMMITTING,      /* Waiting for new configuration to commit. */
+
+    /* Phases for servers to be removed. */
+    RAFT_PHASE_REMOVE,          /* To be removed. */
+};
+
+const char *raft_server_phase_to_string(enum raft_server_phase);
+
+struct raft_server {
+    struct hmap_node hmap_node; /* Hashed based on 'sid'. */
+
+    struct uuid sid;            /* Server ID. */
+    char *address;              /* "(tcp|ssl):1.2.3.4:5678" */
+
+    /* Volatile state on candidates.  Reinitialized at start of election. */
+    struct uuid vote;           /* Server ID of vote, or all-zeros. */
+
+    /* Volatile state on leaders.  Reinitialized after election. */
+    uint64_t next_index;     /* Index of next log entry to send this server. */
+    uint64_t match_index;    /* Index of max log entry server known to have. */
+    enum raft_server_phase phase;
+    /* For use in adding and removing servers: */
+    struct uuid requester_sid;  /* Nonzero if requested via RPC. */
+    struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */
+};
+
+void raft_server_destroy(struct raft_server *);
+void raft_servers_destroy(struct hmap *servers);
+struct raft_server *raft_server_add(struct hmap *servers,
+                                    const struct uuid *sid,
+                                    const char *address);
+struct ovsdb_error *raft_servers_from_json(const struct json *,
+                                           struct hmap *servers)
+    OVS_WARN_UNUSED_RESULT;
+struct ovsdb_error *raft_servers_validate_json(const struct json *)
+    OVS_WARN_UNUSED_RESULT;
+struct json *raft_servers_to_json(const struct hmap *servers);
+void raft_servers_format(const struct hmap *servers, struct ds *ds);
+
+struct raft_entry {
+    uint64_t term;
+    struct json *data;
+    struct uuid eid;
+    struct json *servers;
+};
+
+void raft_entry_destroy(struct raft_entry *);
+struct json *raft_entry_to_json(const struct raft_entry *);
+struct ovsdb_error *raft_entry_from_json(struct json *, struct raft_entry *)
+    OVS_WARN_UNUSED_RESULT;
+
+void raft_put_uint64(struct json *object, const char *name, uint64_t integer);
+uint64_t raft_parse_uint64(struct ovsdb_parser *, const char *name);
+
+bool raft_parse_required_boolean(struct ovsdb_parser *, const char *name);
+int raft_parse_optional_boolean(struct ovsdb_parser *, const char *name);
+const char *raft_parse_required_string(struct ovsdb_parser *,
+                                       const char *name);
+const char *raft_parse_optional_string(struct ovsdb_parser *,
+                                       const char *name);
+bool raft_parse_uuid__(struct ovsdb_parser *, const char *name, bool optional,
+                       struct uuid *);
+struct uuid raft_parse_required_uuid(struct ovsdb_parser *, const char *name);
+bool raft_parse_optional_uuid(struct ovsdb_parser *, const char *name,
+                              struct uuid *);
+
+#endif /* raft-private.h */
diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
new file mode 100644
index 000000000000..4b97c2bbde87
--- /dev/null
+++ b/ovsdb/raft-rpc.c
@@ -0,0 +1,788 @@ 
+/*
+ * Copyright (c) 2014, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "raft-rpc.h"
+#include <stdlib.h>
+#include <string.h>
+#include "compiler.h"
+#include "jsonrpc.h"
+#include "ovsdb-error.h"
+#include "ovsdb-parser.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/json.h"
+#include "openvswitch/vlog.h"
+#include "sset.h"
+
+VLOG_DEFINE_THIS_MODULE(raft_rpc);
+
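+/* This x-macro expands RAFT_RPC_TYPES to declare, for each RPC type, the
+ * static helpers defined later in this file: raft_<name>_destroy(),
+ * raft_<name>_to_jsonrpc(), raft_<name>_from_jsonrpc(), and
+ * raft_format_<name>(). */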
+#define RAFT_RPC(ENUM, NAME)                                            \
+    static void raft_##NAME##_destroy(struct raft_##NAME *);            \
+    static void raft_##NAME##_to_jsonrpc(const struct raft_##NAME *,    \
+                                         struct json *);                \
+    static void raft_##NAME##_from_jsonrpc(struct ovsdb_parser *,       \
+                                           struct raft_##NAME *);       \
+    static void raft_format_##NAME(const struct raft_##NAME *, struct ds *);
+RAFT_RPC_TYPES
+#undef RAFT_RPC
+
+/* raft_rpc_type. */
+const char *
+raft_rpc_type_to_string(enum raft_rpc_type status)
+{
+    switch (status) {
+#define RAFT_RPC(ENUM, NAME) case ENUM: return #NAME;
+        RAFT_RPC_TYPES
+#undef RAFT_RPC
+    }
+    return "<unknown>";
+}
+
+bool
+raft_rpc_type_from_string(const char *s, enum raft_rpc_type *status)
+{
+#define RAFT_RPC(ENUM, NAME)                    \
+    if (!strcmp(s, #NAME)) {                    \
+        *status = ENUM;                         \
+        return true;                            \
+    }
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+    return false;
+}
+
+/* raft_hello_request. */
+static void
+raft_hello_request_destroy(struct raft_hello_request *rq OVS_UNUSED)
+{
+}
+
+static void
+raft_hello_request_to_jsonrpc(const struct raft_hello_request *rq OVS_UNUSED,
+                              struct json *args OVS_UNUSED)
+{
+}
+
+static void
+raft_hello_request_from_jsonrpc(struct ovsdb_parser *p OVS_UNUSED,
+                                struct raft_hello_request *rq OVS_UNUSED)
+{
+}
+
+static void
+raft_format_hello_request(const struct raft_hello_request *hello OVS_UNUSED,
+                          struct ds *s OVS_UNUSED)
+{
+}
+
+/* raft_append_request. */
+
+static void
+raft_append_request_destroy(struct raft_append_request *rq)
+{
+    for (size_t i = 0; i < rq->n_entries; i++) {
+        json_destroy(rq->entries[i].data);
+    }
+    free(rq->entries);
+}
+
+static void
+raft_append_request_to_jsonrpc(const struct raft_append_request *rq,
+                               struct json *args)
+{
+    raft_put_uint64(args, "term", rq->term);
+    raft_put_uint64(args, "prev_log_index", rq->prev_log_index);
+    raft_put_uint64(args, "prev_log_term", rq->prev_log_term);
+    raft_put_uint64(args, "leader_commit", rq->leader_commit);
+
+    struct json **entries = xmalloc(rq->n_entries * sizeof *entries);
+    for (size_t i = 0; i < rq->n_entries; i++) {
+        entries[i] = raft_entry_to_json(&rq->entries[i]);
+    }
+    json_object_put(args, "log", json_array_create(entries, rq->n_entries));
+}
+
+static void
+raft_append_request_from_jsonrpc(struct ovsdb_parser *p,
+                                 struct raft_append_request *rq)
+{
+    rq->term = raft_parse_uint64(p, "term");
+    rq->prev_log_index = raft_parse_uint64(p, "prev_log_index");
+    rq->prev_log_term = raft_parse_uint64(p, "prev_log_term");
+    rq->leader_commit = raft_parse_uint64(p, "leader_commit");
+
+    const struct json *log = ovsdb_parser_member(p, "log", OP_ARRAY);
+    if (!log) {
+        return;
+    }
+    const struct json_array *entries = json_array(log);
+    rq->entries = xmalloc(entries->n * sizeof *rq->entries);
+    rq->n_entries = 0;
+    for (size_t i = 0; i < entries->n; i++) {
+        struct ovsdb_error *error = raft_entry_from_json(entries->elems[i],
+                                                         &rq->entries[i]);
+        if (error) {
+            ovsdb_parser_put_error(p, error);
+            break;
+        }
+        rq->n_entries++;
+    }
+}
+
+static void
+raft_format_append_request(const struct raft_append_request *rq,
+                           struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rq->term);
+    ds_put_format(s, " prev_log_index=%"PRIu64, rq->prev_log_index);
+    ds_put_format(s, " prev_log_term=%"PRIu64, rq->prev_log_term);
+    ds_put_format(s, " leader_commit=%"PRIu64, rq->leader_commit);
+    ds_put_format(s, " n_entries=%u", rq->n_entries);
+}
+
+/* raft_append_reply. */
+
+const char *
+raft_append_result_to_string(enum raft_append_result result)
+{
+    switch (result) {
+    case RAFT_APPEND_OK:
+        return "OK";
+    case RAFT_APPEND_INCONSISTENCY:
+        return "inconsistency";
+    case RAFT_APPEND_IO_ERROR:
+        return "I/O error";
+    default:
+        return NULL;
+    }
+}
+
+bool
+raft_append_result_from_string(const char *s, enum raft_append_result *resultp)
+{
+    for (enum raft_append_result result = 0; ; result++) {
+        const char *s2 = raft_append_result_to_string(result);
+        if (!s2) {
+            *resultp = 0;
+            return false;
+        } else if (!strcmp(s, s2)) {
+            *resultp = result;
+            return true;
+        }
+    }
+}
+
+static void
+raft_append_reply_destroy(struct raft_append_reply *rpy OVS_UNUSED)
+{
+}
+
+static void
+raft_append_reply_to_jsonrpc(const struct raft_append_reply *rpy,
+                             struct json *args)
+{
+    raft_put_uint64(args, "term", rpy->term);
+    raft_put_uint64(args, "log_end", rpy->log_end);
+    raft_put_uint64(args, "prev_log_index", rpy->prev_log_index);
+    raft_put_uint64(args, "prev_log_term", rpy->prev_log_term);
+    raft_put_uint64(args, "n_entries", rpy->n_entries);
+    json_object_put_string(args, "result",
+                           raft_append_result_to_string(rpy->result));
+}
+
+static void
+raft_append_reply_from_jsonrpc(struct ovsdb_parser *p,
+                               struct raft_append_reply *rpy)
+{
+    rpy->term = raft_parse_uint64(p, "term");
+    rpy->log_end = raft_parse_uint64(p, "log_end");
+    rpy->prev_log_index = raft_parse_uint64(p, "prev_log_index");
+    rpy->prev_log_term = raft_parse_uint64(p, "prev_log_term");
+    rpy->n_entries = raft_parse_uint64(p, "n_entries");
+
+    const char *result = raft_parse_required_string(p, "result");
+    if (result && !raft_append_result_from_string(result, &rpy->result)) {
+        ovsdb_parser_raise_error(p, "unknown result \"%s\"", result);
+    }
+}
+
+static void
+raft_format_append_reply(const struct raft_append_reply *rpy, struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rpy->term);
+    ds_put_format(s, " log_end=%"PRIu64, rpy->log_end);
+    ds_put_format(s, " result=\"%s\"",
+                  raft_append_result_to_string(rpy->result));
+}
+
+/* raft_vote_request. */
+
+static void
+raft_vote_request_destroy(struct raft_vote_request *rq OVS_UNUSED)
+{
+}
+
+static void
+raft_vote_request_to_jsonrpc(const struct raft_vote_request *rq,
+                             struct json *args)
+{
+    raft_put_uint64(args, "term", rq->term);
+    raft_put_uint64(args, "last_log_index", rq->last_log_index);
+    raft_put_uint64(args, "last_log_term", rq->last_log_term);
+    if (rq->leadership_transfer) {
+        json_object_put(args, "leadership_transfer",
+                        json_boolean_create(true));
+    }
+}
+
+static void
+raft_vote_request_from_jsonrpc(struct ovsdb_parser *p,
+                               struct raft_vote_request *rq)
+{
+    rq->term = raft_parse_uint64(p, "term");
+    rq->last_log_index = raft_parse_uint64(p, "last_log_index");
+    rq->last_log_term = raft_parse_uint64(p, "last_log_term");
+    rq->leadership_transfer
+        = raft_parse_optional_boolean(p, "leadership_transfer") == 1;
+}
+
+static void
+raft_format_vote_request(const struct raft_vote_request *rq, struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rq->term);
+    ds_put_format(s, " last_log_index=%"PRIu64, rq->last_log_index);
+    ds_put_format(s, " last_log_term=%"PRIu64, rq->last_log_term);
+}
+
+/* raft_vote_reply. */
+
+static void
+raft_vote_reply_destroy(struct raft_vote_reply *rpy OVS_UNUSED)
+{
+}
+
+static void
+raft_vote_reply_to_jsonrpc(const struct raft_vote_reply *rpy,
+                           struct json *args)
+{
+    raft_put_uint64(args, "term", rpy->term);
+    json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote));
+}
+
+static void
+raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p,
+                             struct raft_vote_reply *rpy)
+{
+    rpy->term = raft_parse_uint64(p, "term");
+    rpy->vote = raft_parse_required_uuid(p, "vote");
+}
+
+static void
+raft_format_vote_reply(const struct raft_vote_reply *rpy, struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rpy->term);
+    ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote));
+}
+
+/* raft_add_server_request */
+
+static void
+raft_add_server_request_destroy(struct raft_add_server_request *rq)
+{
+    free(rq->address);
+}
+
+static void
+raft_add_server_request_to_jsonrpc(const struct raft_add_server_request *rq,
+                                   struct json *args)
+{
+    json_object_put_string(args, "address", rq->address);
+}
+
+static void
+raft_add_server_request_from_jsonrpc(struct ovsdb_parser *p,
+                                     struct raft_add_server_request *rq)
+{
+    rq->address = nullable_xstrdup(raft_parse_required_string(p, "address"));
+}
+
+static void
+raft_format_add_server_request(const struct raft_add_server_request *rq,
+                               struct ds *s)
+{
+    ds_put_format(s, " address=\"%s\"", rq->address);
+}
+
+/* raft_add_server_reply. */
+
+static void
+raft_add_server_reply_destroy(struct raft_add_server_reply *rpy)
+{
+    sset_destroy(&rpy->remote_addresses);
+}
+
+static void
+raft_add_server_reply_to_jsonrpc(const struct raft_add_server_reply *rpy,
+                                 struct json *args)
+{
+    json_object_put(args, "success", json_boolean_create(rpy->success));
+    if (!sset_is_empty(&rpy->remote_addresses)) {
+        json_object_put(args, "remote_addresses",
+                        raft_addresses_to_json(&rpy->remote_addresses));
+    }
+}
+
+static void
+raft_add_server_reply_from_jsonrpc(struct ovsdb_parser *p,
+                                   struct raft_add_server_reply *rpy)
+{
+    rpy->success = raft_parse_required_boolean(p, "success");
+
+    const struct json *json = ovsdb_parser_member(p, "remote_addresses",
+                                                  OP_ARRAY | OP_OPTIONAL);
+    if (json) {
+        ovsdb_parser_put_error(p, raft_addresses_from_json(
+                                   json, &rpy->remote_addresses));
+    } else {
+        sset_init(&rpy->remote_addresses);
+    }
+}
+
+static void
+raft_format_add_server_reply(const struct raft_add_server_reply *rpy,
+                             struct ds *s)
+{
+    ds_put_format(s, " success=%s", rpy->success ? "true" : "false");
+    if (!sset_is_empty(&rpy->remote_addresses)) {
+        ds_put_cstr(s, " remote_addresses=[");
+
+        const char *address;
+        int i = 0;
+        SSET_FOR_EACH (address, &rpy->remote_addresses) {
+            if (i++ > 0) {
+                ds_put_cstr(s, ", ");
+            }
+            ds_put_cstr(s, address);
+        }
+        ds_put_char(s, ']');
+    }
+}
+
+/* raft_remove_server_reply. */
+
+static void
+raft_remove_server_reply_destroy(
+    struct raft_remove_server_reply *rpy OVS_UNUSED)
+{
+}
+
+static void
+raft_remove_server_reply_to_jsonrpc(const struct raft_remove_server_reply *rpy,
+                                    struct json *args)
+{
+    json_object_put(args, "success", json_boolean_create(rpy->success));
+}
+
+static void
+raft_remove_server_reply_from_jsonrpc(struct ovsdb_parser *p,
+                                      struct raft_remove_server_reply *rpy)
+{
+    rpy->success = raft_parse_required_boolean(p, "success");
+}
+
+static void
+raft_format_remove_server_reply(const struct raft_remove_server_reply *rpy,
+                                struct ds *s)
+{
+    ds_put_format(s, " success=%s", rpy->success ? "true" : "false");
+}
+
+/* raft_install_snapshot_request. */
+
+static void
+raft_install_snapshot_request_destroy(
+    struct raft_install_snapshot_request *rq)
+{
+    json_destroy(rq->last_servers);
+    json_destroy(rq->data);
+}
+
+static void
+raft_install_snapshot_request_to_jsonrpc(
+    const struct raft_install_snapshot_request *rq, struct json *args)
+{
+    raft_put_uint64(args, "term", rq->term);
+    raft_put_uint64(args, "last_index", rq->last_index);
+    raft_put_uint64(args, "last_term", rq->last_term);
+    json_object_put(args, "last_servers", json_clone(rq->last_servers));
+    json_object_put_format(args, "last_eid",
+                           UUID_FMT, UUID_ARGS(&rq->last_eid));
+
+    json_object_put(args, "data", json_clone(rq->data));
+}
+
+static void
+raft_install_snapshot_request_from_jsonrpc(
+    struct ovsdb_parser *p, struct raft_install_snapshot_request *rq)
+{
+    rq->last_servers = json_nullable_clone(
+        ovsdb_parser_member(p, "last_servers", OP_OBJECT));
+    ovsdb_parser_put_error(p, raft_servers_validate_json(rq->last_servers));
+
+    rq->term = raft_parse_uint64(p, "term");
+    rq->last_index = raft_parse_uint64(p, "last_index");
+    rq->last_term = raft_parse_uint64(p, "last_term");
+    rq->last_eid = raft_parse_required_uuid(p, "last_eid");
+
+    rq->data = json_nullable_clone(
+        ovsdb_parser_member(p, "data", OP_OBJECT | OP_ARRAY));
+}
+
+static void
+raft_format_install_snapshot_request(
+    const struct raft_install_snapshot_request *rq, struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rq->term);
+    ds_put_format(s, " last_index=%"PRIu64, rq->last_index);
+    ds_put_format(s, " last_term=%"PRIu64, rq->last_term);
+    ds_put_format(s, " last_eid="UUID_FMT, UUID_ARGS(&rq->last_eid));
+    ds_put_cstr(s, " last_servers=");
+
+    struct hmap servers;
+    struct ovsdb_error *error =
+        raft_servers_from_json(rq->last_servers, &servers);
+    if (!error) {
+        raft_servers_format(&servers, s);
+        raft_servers_destroy(&servers);
+    } else {
+        ds_put_cstr(s, "***error***");
+        ovsdb_error_destroy(error);
+    }
+}
+
+/* raft_install_snapshot_reply. */
+
+static void
+raft_install_snapshot_reply_destroy(
+    struct raft_install_snapshot_reply *rpy OVS_UNUSED)
+{
+}
+
+static void
+raft_install_snapshot_reply_to_jsonrpc(
+    const struct raft_install_snapshot_reply *rpy, struct json *args)
+{
+    raft_put_uint64(args, "term", rpy->term);
+    raft_put_uint64(args, "last_index", rpy->last_index);
+    raft_put_uint64(args, "last_term", rpy->last_term);
+}
+
+static void
+raft_install_snapshot_reply_from_jsonrpc(
+    struct ovsdb_parser *p,
+    struct raft_install_snapshot_reply *rpy)
+{
+    rpy->term = raft_parse_uint64(p, "term");
+    rpy->last_index = raft_parse_uint64(p, "last_index");
+    rpy->last_term = raft_parse_uint64(p, "last_term");
+}
+
+static void
+raft_format_install_snapshot_reply(
+    const struct raft_install_snapshot_reply *rpy, struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rpy->term);
+}
+
+/* raft_remove_server_request. */
+
+static void
+raft_remove_server_request_destroy(
+    struct raft_remove_server_request *rq OVS_UNUSED)
+{
+}
+
+static void
+raft_remove_server_request_to_jsonrpc(
+    const struct raft_remove_server_request *rq, struct json *args)
+{
+    json_object_put_format(args, "server_id", SID_FMT, SID_ARGS(&rq->sid));
+}
+
+static void
+raft_remove_server_request_from_jsonrpc(struct ovsdb_parser *p,
+                                        struct raft_remove_server_request *rq)
+{
+    rq->sid = raft_parse_required_uuid(p, "server_id");
+}
+
+static void
+raft_format_remove_server_request(const struct raft_remove_server_request *rq,
+                                  struct ds *s)
+{
+    ds_put_format(s, " server="SID_FMT, SID_ARGS(&rq->sid));
+}
+
+/* raft_become_leader. */
+
+static void
+raft_become_leader_destroy(struct raft_become_leader *rpc OVS_UNUSED)
+{
+}
+
+static void
+raft_become_leader_to_jsonrpc(const struct raft_become_leader *rpc,
+                              struct json *args)
+{
+    raft_put_uint64(args, "term", rpc->term);
+}
+
+static void
+raft_become_leader_from_jsonrpc(struct ovsdb_parser *p,
+                                struct raft_become_leader *rpc)
+{
+    rpc->term = raft_parse_uint64(p, "term");
+}
+
+static void
+raft_format_become_leader(const struct raft_become_leader *rq, struct ds *s)
+{
+    ds_put_format(s, " term=%"PRIu64, rq->term);
+}
+
+/* raft_execute_command_request. */
+
+static void
+raft_execute_command_request_destroy(
+    struct raft_execute_command_request *rq)
+{
+    json_destroy(rq->data);
+}
+
+static void
+raft_execute_command_request_to_jsonrpc(
+    const struct raft_execute_command_request *rq, struct json *args)
+{
+    json_object_put(args, "data", json_clone(rq->data));
+    json_object_put_format(args, "prereq", UUID_FMT, UUID_ARGS(&rq->prereq));
+    json_object_put_format(args, "result", UUID_FMT, UUID_ARGS(&rq->result));
+}
+
+static void
+raft_execute_command_request_from_jsonrpc(
+    struct ovsdb_parser *p, struct raft_execute_command_request *rq)
+{
+    rq->data = json_nullable_clone(ovsdb_parser_member(p, "data",
+                                                       OP_OBJECT | OP_ARRAY));
+    rq->prereq = raft_parse_required_uuid(p, "prereq");
+    rq->result = raft_parse_required_uuid(p, "result");
+}
+
+static void
+raft_format_execute_command_request(
+    const struct raft_execute_command_request *rq, struct ds *s)
+{
+    ds_put_format(s, " prereq="UUID_FMT, UUID_ARGS(&rq->prereq));
+    ds_put_format(s, " result="UUID_FMT, UUID_ARGS(&rq->result));
+    ds_put_format(s, " data=");
+    json_to_ds(rq->data, JSSF_SORT, s);
+}
+
+/* raft_execute_command_reply. */
+
+static void
+raft_execute_command_reply_destroy(
+    struct raft_execute_command_reply *rpy OVS_UNUSED)
+{
+}
+
+static void
+raft_execute_command_reply_to_jsonrpc(
+    const struct raft_execute_command_reply *rpy, struct json *args)
+{
+    json_object_put_format(args, "result", UUID_FMT, UUID_ARGS(&rpy->result));
+    json_object_put_string(args, "status",
+                           raft_command_status_to_string(rpy->status));
+}
+
+static void
+raft_execute_command_reply_from_jsonrpc(
+    struct ovsdb_parser *p, struct raft_execute_command_reply *rpy)
+{
+    rpy->result = raft_parse_required_uuid(p, "result");
+
+    const char *status = raft_parse_required_string(p, "status");
+    if (status && !raft_command_status_from_string(status, &rpy->status)) {
+        ovsdb_parser_raise_error(p, "unknown status \"%s\"", status);
+    }
+}
+
+static void
+raft_format_execute_command_reply(
+    const struct raft_execute_command_reply *rpy, struct ds *s)
+{
+    ds_put_format(s, " result="UUID_FMT, UUID_ARGS(&rpy->result));
+    ds_put_format(s, " status=\"%s\"",
+                  raft_command_status_to_string(rpy->status));
+}
+
+void
+raft_rpc_destroy(union raft_rpc *rpc)
+{
+    if (!rpc) {
+        return;
+    }
+
+    free(rpc->common.comment);
+
+    switch (rpc->common.type) {
+#define RAFT_RPC(ENUM, NAME)                    \
+        case ENUM:                              \
+            raft_##NAME##_destroy(&rpc->NAME);  \
+            break;
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+    }
+}
+
+struct jsonrpc_msg *
+raft_rpc_to_jsonrpc(const struct uuid *cid,
+                    const struct uuid *sid,
+                    const union raft_rpc *rpc)
+{
+    struct json *args = json_object_create();
+    if (!uuid_is_zero(cid)) {
+        json_object_put_format(args, "cluster", UUID_FMT, UUID_ARGS(cid));
+    }
+    if (!uuid_is_zero(&rpc->common.sid)) {
+        json_object_put_format(args, "to", UUID_FMT,
+                               UUID_ARGS(&rpc->common.sid));
+    }
+    json_object_put_format(args, "from", UUID_FMT, UUID_ARGS(sid));
+    if (rpc->common.comment) {
+        json_object_put_string(args, "comment", rpc->common.comment);
+    }
+
+    switch (rpc->common.type) {
+#define RAFT_RPC(ENUM, NAME)                        \
+    case ENUM:                                      \
+        raft_##NAME##_to_jsonrpc(&rpc->NAME, args); \
+        break;
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+    default:
+        OVS_NOT_REACHED();
+    }
+
+    return jsonrpc_create_notify(raft_rpc_type_to_string(rpc->common.type),
+                                 json_array_create_1(args));
+}
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_rpc_from_jsonrpc(struct uuid *cidp,
+                      const struct uuid *sid,
+                      const struct jsonrpc_msg *msg,
+                      union raft_rpc *rpc)
+{
+    memset(rpc, 0, sizeof *rpc);
+    if (msg->type != JSONRPC_NOTIFY) {
+        return ovsdb_error(NULL, "expecting notify RPC but received %s",
+                           jsonrpc_msg_type_to_string(msg->type));
+    }
+
+    if (!raft_rpc_type_from_string(msg->method, &rpc->common.type)) {
+        return ovsdb_error(NULL, "unknown method %s", msg->method);
+    }
+
+    if (json_array(msg->params)->n != 1) {
+        return ovsdb_error(NULL,
+                           "%s RPC has %"PRIuSIZE" parameters (expected 1)",
+                           msg->method, json_array(msg->params)->n);
+    }
+
+    struct ovsdb_parser p;
+    ovsdb_parser_init(&p, json_array(msg->params)->elems[0],
+                      "raft %s RPC", msg->method);
+
+    bool is_hello = rpc->common.type == RAFT_RPC_HELLO_REQUEST;
+    bool is_add = rpc->common.type == RAFT_RPC_ADD_SERVER_REQUEST;
+
+    struct uuid cid;
+    if (raft_parse_uuid__(&p, "cluster", is_add, &cid)
+        && !uuid_equals(&cid, cidp)) {
+        if (uuid_is_zero(cidp)) {
+            *cidp = cid;
+            VLOG_INFO("learned cluster ID "CID_FMT, CID_ARGS(&cid));
+        } else {
+            ovsdb_parser_raise_error(&p, "wrong cluster "CID_FMT" "
+                                     "(expected "CID_FMT")",
+                                     CID_ARGS(&cid), CID_ARGS(cidp));
+        }
+    }
+
+    struct uuid to_sid;
+    if (raft_parse_uuid__(&p, "to", is_add || is_hello, &to_sid)
+        && !uuid_equals(&to_sid, sid)) {
+        ovsdb_parser_raise_error(&p, "misrouted message (addressed to "
+                                 SID_FMT" but we're "SID_FMT")",
+                                 SID_ARGS(&to_sid), SID_ARGS(sid));
+    }
+
+    rpc->common.sid = raft_parse_required_uuid(&p, "from");
+    rpc->common.comment = nullable_xstrdup(
+        raft_parse_optional_string(&p, "comment"));
+
+    switch (rpc->common.type) {
+#define RAFT_RPC(ENUM, NAME)                            \
+        case ENUM:                                      \
+            raft_##NAME##_from_jsonrpc(&p, &rpc->NAME); \
+            break;
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+
+    default:
+        OVS_NOT_REACHED();
+    }
+
+    struct ovsdb_error *error = ovsdb_parser_finish(&p);
+    if (error) {
+        raft_rpc_destroy(rpc);
+    }
+    return error;
+}
+
+void
+raft_rpc_format(const union raft_rpc *rpc, struct ds *s)
+{
+    ds_put_format(s, SID_FMT" %s", SID_ARGS(&rpc->common.sid),
+                  raft_rpc_type_to_string(rpc->common.type));
+    if (rpc->common.comment) {
+        ds_put_format(s, " \"%s\"", rpc->common.comment);
+    }
+    ds_put_char(s, ':');
+
+    switch (rpc->common.type) {
+#define RAFT_RPC(ENUM, NAME)                    \
+    case ENUM:                                  \
+        raft_format_##NAME(&rpc->NAME, s);      \
+        break;
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+    default:
+        OVS_NOT_REACHED();
+    }
+}
diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h
new file mode 100644
index 000000000000..a3ba0c8495d0
--- /dev/null
+++ b/ovsdb/raft-rpc.h
@@ -0,0 +1,271 @@ 
+/*
+ * Copyright (c) 2014, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RAFT_RPC_H
+#define RAFT_RPC_H 1
+
+/* Data structures used internally by Raft implementation for JSON-RPC. */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "openvswitch/uuid.h"
+#include "raft.h"
+#include "raft-private.h"
+#include "sset.h"
+
+struct ds;
+
+#define RAFT_RPC_TYPES                                                  \
+    /* Hello RPC. */                                                    \
+    RAFT_RPC(RAFT_RPC_HELLO_REQUEST, hello_request)                     \
+                                                                        \
+    /* AppendEntries RPC. */                                            \
+    RAFT_RPC(RAFT_RPC_APPEND_REQUEST, append_request)                   \
+    RAFT_RPC(RAFT_RPC_APPEND_REPLY, append_reply)                       \
+                                                                        \
+    /* RequestVote RPC. */                                              \
+    RAFT_RPC(RAFT_RPC_VOTE_REQUEST, vote_request)                       \
+    RAFT_RPC(RAFT_RPC_VOTE_REPLY, vote_reply)                           \
+                                                                        \
+    /* AddServer RPC. */                                                \
+    RAFT_RPC(RAFT_RPC_ADD_SERVER_REQUEST, add_server_request)           \
+    RAFT_RPC(RAFT_RPC_ADD_SERVER_REPLY, add_server_reply)               \
+                                                                        \
+    /* RemoveServer RPC. */                                             \
+    RAFT_RPC(RAFT_RPC_REMOVE_SERVER_REQUEST, remove_server_request)     \
+    RAFT_RPC(RAFT_RPC_REMOVE_SERVER_REPLY, remove_server_reply)         \
+                                                                        \
+    /* InstallSnapshot RPC. */                                          \
+    RAFT_RPC(RAFT_RPC_INSTALL_SNAPSHOT_REQUEST, install_snapshot_request) \
+    RAFT_RPC(RAFT_RPC_INSTALL_SNAPSHOT_REPLY, install_snapshot_reply)   \
+                                                                        \
+    /* BecomeLeader RPC. */                                             \
+    RAFT_RPC(RAFT_RPC_BECOME_LEADER, become_leader)                     \
+                                                                        \
+    /* ExecuteCommand RPC. */                                           \
+    RAFT_RPC(RAFT_RPC_EXECUTE_COMMAND_REQUEST, execute_command_request) \
+    RAFT_RPC(RAFT_RPC_EXECUTE_COMMAND_REPLY, execute_command_reply)
+
+enum raft_rpc_type {
+#define RAFT_RPC(ENUM, NAME) ENUM,
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+};
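+
+/* Illustrative note (not part of the original patch): with the X-macro
+ * above, this enum expands to one enumerator per RPC type, in the order the
+ * types are listed in RAFT_RPC_TYPES, e.g.:
+ *
+ *     enum raft_rpc_type {
+ *         RAFT_RPC_HELLO_REQUEST,
+ *         RAFT_RPC_APPEND_REQUEST,
+ *         RAFT_RPC_APPEND_REPLY,
+ *         ...
+ *         RAFT_RPC_EXECUTE_COMMAND_REPLY,
+ *     };
+ *
+ * raft-rpc.c re-expands RAFT_RPC_TYPES with different RAFT_RPC() definitions
+ * to generate the matching destroy/to_jsonrpc/from_jsonrpc/format dispatch
+ * switches. */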
+
+const char *raft_rpc_type_to_string(enum raft_rpc_type);
+bool raft_rpc_type_from_string(const char *, enum raft_rpc_type *);
+
+struct raft_rpc_common {
+    enum raft_rpc_type type;    /* One of RAFT_RPC_*. */
+    struct uuid sid;            /* SID of peer server. */
+    char *comment;
+};
+
+struct raft_hello_request {
+    struct raft_rpc_common common;
+};
+
+struct raft_append_request {
+    struct raft_rpc_common common;
+    uint64_t term;              /* Leader's term. */
+    uint64_t prev_log_index;    /* Log entry just before new ones. */
+    uint64_t prev_log_term;     /* Term of prev_log_index entry. */
+    uint64_t leader_commit;     /* Leader's commit_index. */
+
+    /* The append request includes 0 or more log entries.  entries[0] is for
+     * log entry 'prev_log_index + 1', and so on.
+     *
+     * A heartbeat append_request has no entries. */
+    struct raft_entry *entries;
+    unsigned int n_entries;
+};
+
+enum raft_append_result {
+    RAFT_APPEND_OK,             /* Success. */
+    RAFT_APPEND_INCONSISTENCY,  /* Failure due to log inconsistency. */
+    RAFT_APPEND_IO_ERROR,       /* Failure due to I/O error. */
+};
+
+const char *raft_append_result_to_string(enum raft_append_result);
+bool raft_append_result_from_string(const char *, enum raft_append_result *);
+
+struct raft_append_reply {
+    struct raft_rpc_common common;
+
+    /* Copied from the state machine of the reply's sender. */
+    uint64_t term;             /* Current term, for leader to update itself. */
+    uint64_t log_end;          /* To allow capping next_index, see 4.2.1. */
+
+    /* Copied from request. */
+    uint64_t prev_log_index;   /* Log entry just before new ones. */
+    uint64_t prev_log_term;    /* Term of prev_log_index entry. */
+    unsigned int n_entries;
+
+    /* Result. */
+    enum raft_append_result result;
+};
+
+struct raft_vote_request {
+    struct raft_rpc_common common;
+    uint64_t term;           /* Candidate's term. */
+    uint64_t last_log_index; /* Index of candidate's last log entry. */
+    uint64_t last_log_term;  /* Term of candidate's last log entry. */
+    bool leadership_transfer;  /* True to override minimum election timeout. */
+};
+
+struct raft_vote_reply {
+    struct raft_rpc_common common;
+    uint64_t term;          /* Current term, for candidate to update itself. */
+    struct uuid vote;       /* Server ID of vote. */
+};
+
+struct raft_add_server_request {
+    struct raft_rpc_common common;
+    char *address;              /* Address of new server. */
+};
+
+struct raft_remove_server_request {
+    struct raft_rpc_common common;
+    struct uuid sid;            /* Server to remove. */
+
+    /* Nonnull if request was received via unixctl. */
+    struct unixctl_conn *requester_conn;
+};
+
+/* The operation could not be initiated because this server is not the current
+ * leader.  Only the leader can add or remove servers. */
+#define RAFT_SERVER_NOT_LEADER "not leader"
+
+/* An operation to add a server succeeded without any change because the server
+ * was already part of the cluster. */
+#define RAFT_SERVER_ALREADY_PRESENT "already in cluster"
+
+/* An operation to remove a server succeeded without any change because the
+ * server was not part of the cluster. */
+#define RAFT_SERVER_ALREADY_GONE "already not in cluster"
+
+/* The operation could not be initiated because an identical
+ * operation was already in progress. */
+#define RAFT_SERVER_IN_PROGRESS "in progress"
+
+/* Adding a server failed because of a timeout.  This could mean that the
+ * server was entirely unreachable, or that it became unreachable partway
+ * through populating it with an initial copy of the log.  In the latter case,
+ * retrying the operation should resume where it left off. */
+#define RAFT_SERVER_TIMEOUT "timeout"
+
+/* The operation was initiated but it later failed because this server lost
+ * cluster leadership.  The operation may be retried against the new cluster
+ * leader.  For adding a server, if the log was already partially copied to the
+ * new server, retrying the operation should resume where it left off. */
+#define RAFT_SERVER_LOST_LEADERSHIP "lost leadership"
+
+/* Adding a server was canceled by submission of an operation to remove the
+ * same server, or removing a server was canceled by submission of an operation
+ * to add the same server. */
+#define RAFT_SERVER_CANCELED "canceled"
+
+/* Adding or removing a server could not be initiated because the operation to
+ * remove or add the server, respectively, has been logged but not committed.
+ * The new operation may be retried once the former operation commits. */
+#define RAFT_SERVER_COMMITTING "committing"
+
+/* Adding or removing a server was canceled because the leader shut down. */
+#define RAFT_SERVER_SHUTDOWN "shutdown"
+
+/* Removing a server could not be initiated because, taken together with any
+ * other scheduled server removals, the cluster would be empty.  (This
+ * calculation ignores scheduled or uncommitted add server operations because
+ * of the possibility that they could fail.)  */
+#define RAFT_SERVER_EMPTY "empty"
+
+struct raft_add_server_reply {
+    struct raft_rpc_common common;
+    bool success;
+    struct sset remote_addresses;
+};
+
+struct raft_remove_server_reply {
+    struct raft_rpc_common common;
+    bool success;
+};
+
+struct raft_install_snapshot_request {
+    struct raft_rpc_common common;
+
+    uint64_t term;              /* Leader's term. */
+
+    uint64_t last_index;        /* Replaces everything up to this index. */
+    uint64_t last_term;         /* Term of last_index. */
+    struct uuid last_eid;       /* Last entry ID. */
+    struct json *last_servers;
+
+    /* Data. */
+    struct json *data;
+};
+
+struct raft_install_snapshot_reply {
+    struct raft_rpc_common common;
+
+    uint64_t term;              /* For leader to update itself. */
+
+    /* Repeated from the install_snapshot request. */
+    uint64_t last_index;
+    uint64_t last_term;
+};
+
+struct raft_become_leader {
+    struct raft_rpc_common common;
+
+    uint64_t term;              /* Leader's term. */
+};
+
+struct raft_execute_command_request {
+    struct raft_rpc_common common;
+
+    struct json *data;
+    struct uuid prereq;
+    struct uuid result;
+};
+
+struct raft_execute_command_reply {
+    struct raft_rpc_common common;
+
+    struct uuid result;
+    enum raft_command_status status;
+};
+
+union raft_rpc {
+    struct raft_rpc_common common;
+#define RAFT_RPC(ENUM, NAME) struct raft_##NAME NAME;
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+};
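+
+/* Illustrative sketch (not part of the original patch): because every member
+ * begins with 'struct raft_rpc_common', a sender can fill in one union member
+ * with designated initializers and hand it to a generic send path, as raft.c
+ * does internally.  Here 'dst', 'vote', 'current_term', and 'raft' are
+ * placeholders, and raft_send() is raft.c's internal helper:
+ *
+ *     union raft_rpc rpc = {
+ *         .vote_reply = {
+ *             .common = {
+ *                 .type = RAFT_RPC_VOTE_REPLY,
+ *                 .sid = *dst,
+ *             },
+ *             .term = current_term,
+ *             .vote = *vote,
+ *         },
+ *     };
+ *     raft_send(raft, &rpc);
+ */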
+
+void raft_rpc_destroy(union raft_rpc *);
+
+struct jsonrpc_msg *raft_rpc_to_jsonrpc(const struct uuid *cid,
+                                        const struct uuid *sid,
+                                        const union raft_rpc *);
+struct ovsdb_error *raft_rpc_from_jsonrpc(struct uuid *cid,
+                                          const struct uuid *sid,
+                                          const struct jsonrpc_msg *,
+                                          union raft_rpc *)
+    OVS_WARN_UNUSED_RESULT;
+
+void raft_rpc_format(const union raft_rpc *, struct ds *);
+
+#endif /* ovsdb/raft-rpc.h */
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
new file mode 100644
index 000000000000..58bd7b001a63
--- /dev/null
+++ b/ovsdb/raft.c
@@ -0,0 +1,4105 @@ 
+/*
+ * Copyright (c) 2014, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "raft.h"
+#include "raft-private.h"
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "hash.h"
+#include "jsonrpc.h"
+#include "lockfile.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/json.h"
+#include "openvswitch/list.h"
+#include "openvswitch/vlog.h"
+#include "ovs-rcu.h"
+#include "ovs-thread.h"
+#include "ovsdb-error.h"
+#include "ovsdb-parser.h"
+#include "ovsdb/log.h"
+#include "poll-loop.h"
+#include "raft-rpc.h"
+#include "random.h"
+#include "seq.h"
+#include "socket-util.h"
+#include "stream.h"
+#include "timeval.h"
+#include "unicode.h"
+#include "unixctl.h"
+#include "util.h"
+#include "uuid.h"
+
+VLOG_DEFINE_THIS_MODULE(raft);
+
+static void raft_run_reconfigure(struct raft *);
+
+struct raft;
+
+enum raft_role {
+    RAFT_FOLLOWER,
+    RAFT_CANDIDATE,
+    RAFT_LEADER
+};
+
+enum raft_timer {
+    RAFT_FAST,
+    RAFT_SLOW
+};
+
+static void raft_send_append_reply(struct raft *,
+                                   const struct raft_append_request *,
+                                   enum raft_append_result,
+                                   const char *comment);
+static void raft_update_match_index(struct raft *, struct raft_server *,
+                                    uint64_t min_index);
+
+static void raft_send_remove_server_reply__(
+    struct raft *, const struct uuid *target_sid,
+    const struct uuid *requester_sid, struct unixctl_conn *requester_conn,
+    bool success, const char *comment);
+
+static void raft_send_vote_reply(struct raft *, const struct uuid *dst,
+                                 const struct uuid *vote);
+
+static void raft_set_servers(struct raft *, const struct hmap *new_servers,
+                             enum vlog_level);
+static void raft_server_init_leader(struct raft *, struct raft_server *);
+
+struct raft_conn {
+    struct ovs_list list_node;
+    struct jsonrpc_session *js;
+    struct uuid sid;
+    bool incoming;              /* True if incoming, false if outgoing. */
+
+    /* Join. */
+    unsigned int js_seqno;
+};
+
+struct raft_command {
+    struct hmap_node hmap_node; /* In struct raft's 'commands' hmap. */
+    uint64_t index;             /* Index in log. */
+
+    unsigned int n_refs;
+    enum raft_command_status status;
+    struct uuid eid;
+    long long int timestamp;
+    struct uuid sid;
+};
+
+static void raft_command_complete(struct raft *, struct raft_command *,
+                                  enum raft_command_status);
+
+static void raft_complete_all_commands(struct raft *,
+                                       enum raft_command_status);
+static struct raft_command *raft_find_command_by_index(struct raft *,
+                                                       uint64_t index);
+static struct raft_command *raft_find_command_by_eid(struct raft *,
+                                                     const struct uuid *);
+
+enum raft_waiter_type {
+    RAFT_W_COMMAND,
+    RAFT_W_APPEND,
+    RAFT_W_VOTE
+};
+
+struct raft_waiter {
+    struct ovs_list list_node;
+    uint64_t fsync_seqno;
+    enum raft_waiter_type type;
+    union {
+        /* RAFT_W_COMMAND. */
+        struct {
+            struct raft_command *cmd;
+            uint64_t index;
+        } command;
+
+        /* RAFT_W_APPEND. */
+        struct {
+            struct raft_append_request *rq; /* Does not include 'entries'. */
+        } append;
+    };
+};
+
+static struct raft_waiter *raft_waiter_create(struct raft *,
+                                              enum raft_waiter_type);
+
+/* The Raft state machine. */
+struct raft {
+    struct hmap_node hmap_node; /* In 'all_rafts'. */
+    struct ovsdb_log *log;
+
+/* Persistent derived state.
+ *
+ * This must be updated on stable storage before responding to RPCs, but it can
+ * be derived from the header, snapshot, and log in 'log'. */
+
+    struct uuid cid;            /* Cluster ID (immutable for the cluster). */
+    struct uuid sid;            /* Server ID (immutable for the server). */
+    char *local_address;        /* Local address (immutable for the server). */
+    char *name;                 /* Cluster name (immutable for the cluster). */
+
+    struct hmap servers;        /* Contains "struct raft_server"s. */
+    struct raft_server *me;     /* This server (points into 'servers'). */
+
+/* Persistent state on all servers.
+ *
+ * Must be updated on stable storage before responding to RPCs. */
+
+    uint64_t current_term;      /* Initialized to 0 and only increases. */
+    struct uuid voted_for;      /* In current term, or all-zeros if none. */
+
+    /* The log.
+     *
+     * A log entry with index 1 never really exists; the initial snapshot for a
+     * Raft is considered to include this index.  The first real log entry has
+     * index 2.
+     *
+     * XXX should we start at a slightly higher index to make unsigned
+     * arithmetic safer? or use signed arithmetic?
+     *
+     * A new Raft instance contains an empty log:  log_start=2, log_end=2.
+     * Over time, the log grows:                   log_start=2, log_end=N.
+     * At some point, the server takes a snapshot: log_start=N, log_end=N.
+     * The log continues to grow:                  log_start=N, log_end=N+1...
+     *
+     * Must be updated on stable storage before responding to RPCs. */
+    struct raft_entry *entries; /* Log entry i is in log[i - log_start]. */
+    uint64_t log_start;         /* Index of first entry in log. */
+    uint64_t log_end;           /* Index of last entry in log, plus 1. */
+    size_t allocated_log;       /* Allocated entries in 'log'. */
+
+    /* Snapshot state (see Figure 5.1)
+     *
+     * This is the state of the cluster as of the last discarded log entry,
+     * that is, at log index 'log_start - 1' (called prevIndex in Figure 5.1).
+     * Only committed log entries can be included in a snapshot.
+     *
+     * XXX would be a little cleaner in a few places for this to be log[-1]. */
+    struct raft_entry snap;
+
+/* Volatile state.
+ *
+ * The snapshot is always committed, but the rest of the log might not be yet.
+ * 'last_applied' tracks what entries have been passed to the client.  If the
+ * client hasn't yet read the latest snapshot, then even the snapshot isn't
+ * applied yet.  Thus, the invariants are different for these members:
+ *
+ *     log_start - 2 <= last_applied <= commit_index < log_end.
+ *     log_start - 1                 <= commit_index < log_end.
+ */
+
+    enum raft_role role;        /* Current role. */
+    uint64_t commit_index;      /* Max log index known to be committed. */
+    uint64_t last_applied;      /* Max log index applied to state machine. */
+    struct uuid leader_sid;     /* Server ID of leader (zero, if unknown). */
+
+#define ELECTION_BASE_MSEC 1024
+#define ELECTION_RANGE_MSEC 1024
+    long long int election_base;
+    long long int election_timeout;
+
+#define PING_TIME_MSEC (ELECTION_BASE_MSEC / 3)
+    long long int ping_timeout;
+
+    /* Used for joining a cluster. */
+    bool joining;                 /* Attempting to join the cluster? */
+    struct sset remote_addresses;
+    long long int join_timeout;
+
+    /* Used for leaving a cluster. */
+    bool leaving;
+    bool left;
+    long long int leave_timeout;
+
+    /* File synchronization. */
+    bool fsync_thread_running;
+    pthread_t fsync_thread;
+    struct ovs_mutex fsync_mutex;
+    uint64_t fsync_next OVS_GUARDED;
+    uint64_t fsync_cur OVS_GUARDED;
+    struct seq *fsync_request;
+    struct seq *fsync_complete;
+    struct ovs_list waiters;
+
+    /* Network connections. */
+    struct pstream *listener;
+    long long int listen_backoff;
+    struct ovs_list conns;
+
+    /* Leaders only.  Reinitialized after becoming leader. */
+    struct hmap add_servers;    /* Contains "struct raft_server"s to add. */
+    struct raft_server *remove_server; /* Server being removed. */
+    struct hmap commands;              /* Contains "struct raft_command"s. */
+
+    /* Candidates only.  Reinitialized at start of election. */
+    int n_votes;                /* Number of votes for me. */
+};
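+
+/* Worked example of the log indexing above (illustrative only): suppose
+ * log_start=10 and log_end=13.  Then entries[0], entries[1], and entries[2]
+ * hold log indices 10, 11, and 12; 'snap' describes the state as of index 9
+ * (log_start - 1); last_applied may be as low as 8 (log_start - 2);
+ * commit_index is at least 9 (log_start - 1); and both stay below
+ * log_end = 13. */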
+
+static struct hmap all_rafts = HMAP_INITIALIZER(&all_rafts);
+
+static void raft_init(void);
+
+static struct ovsdb_error *raft_read_header(struct raft *)
+    OVS_WARN_UNUSED_RESULT;
+
+static void *
+raft_fsync_thread(void *raft_)
+{
+    struct raft *raft = raft_;
+    for (;;) {
+        ovsrcu_quiesce_start();
+
+        uint64_t request_seq = seq_read(raft->fsync_request);
+
+        ovs_mutex_lock(&raft->fsync_mutex);
+        uint64_t next = raft->fsync_next;
+        uint64_t cur = raft->fsync_cur;
+        ovs_mutex_unlock(&raft->fsync_mutex);
+
+        if (next == UINT64_MAX) {
+            break;
+        }
+
+        if (cur != next) {
+            /* XXX following has really questionable thread-safety. */
+            struct ovsdb_error *error = ovsdb_log_commit(raft->log);
+            if (!error) {
+                ovs_mutex_lock(&raft->fsync_mutex);
+                raft->fsync_cur = next;
+                ovs_mutex_unlock(&raft->fsync_mutex);
+
+                seq_change(raft->fsync_complete);
+            } else {
+                char *error_string = ovsdb_error_to_string_free(error);
+                VLOG_WARN("%s", error_string);
+                free(error_string);
+            }
+        }
+
+        seq_wait(raft->fsync_request, request_seq);
+        poll_block();
+    }
+    return NULL;
+}
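+
+/* Illustrative summary (not part of the original patch) of the handshake the
+ * fsync thread participates in: a caller bumps 'fsync_next' under
+ * 'fsync_mutex' and signals 'fsync_request'; the thread notices that
+ * fsync_cur != fsync_next, calls ovsdb_log_commit(), advances 'fsync_cur',
+ * and signals 'fsync_complete' so that pending waiters can be re-checked.
+ * Setting 'fsync_next' to UINT64_MAX, as raft_close__() does, tells the
+ * thread to exit. */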
+
+static void raft_send_execute_command_reply(struct raft *,
+                                            const struct uuid *sid,
+                                            const struct uuid *eid,
+                                            enum raft_command_status);
+
+static bool raft_rpc_is_heartbeat(const union raft_rpc *);
+
+static void raft_handle_rpc(struct raft *, const union raft_rpc *);
+static bool raft_send(struct raft *, const union raft_rpc *);
+static bool raft_send__(struct raft *, const union raft_rpc *,
+                        struct jsonrpc_session *);
+static void raft_send_append_request(struct raft *,
+                                     struct raft_server *, unsigned int n,
+                                     const char *comment);
+static bool raft_receive_rpc(struct raft *, struct jsonrpc_session *,
+                             struct uuid *sid, union raft_rpc *);
+static void raft_run_session(struct raft *, struct jsonrpc_session *,
+                             unsigned int *seqno, struct uuid *sid);
+static void raft_wait_session(struct jsonrpc_session *);
+
+static void raft_become_leader(struct raft *);
+static void raft_become_follower(struct raft *);
+static void raft_reset_timer(struct raft *);
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_write_snapshot(struct raft *, struct ovsdb_log *,
+                    uint64_t new_log_start, const struct json *snapshot);
+static void raft_send_heartbeats(struct raft *);
+static void raft_start_election(struct raft *, bool leadership_transfer);
+static bool raft_truncate(struct raft *, uint64_t new_end);
+static void raft_get_servers_from_log(struct raft *);
+
+static void raft_consider_updating_commit_index(struct raft *);
+
+static struct raft_server *
+raft_find_server__(const struct hmap *servers, const struct uuid *sid)
+{
+    struct raft_server *s;
+    HMAP_FOR_EACH_IN_BUCKET (s, hmap_node, uuid_hash(sid), servers) {
+        if (uuid_equals(sid, &s->sid)) {
+            return s;
+        }
+    }
+    return NULL;
+}
+
+static struct raft_server *
+raft_find_server(const struct raft *raft, const struct uuid *sid)
+{
+    return raft_find_server__(&raft->servers, sid);
+}
+
+static char *
+raft_make_address_passive(const char *address_)
+{
+    if (!strncmp(address_, "unix:", 5)) {
+        return xasprintf("p%s", address_);
+    } else {
+        char *address = xstrdup(address_);
+        char *p = strchr(address, ':') + 1;
+        char *host = inet_parse_token(&p);
+        char *port = inet_parse_token(&p);
+
+        struct ds paddr = DS_EMPTY_INITIALIZER;
+        ds_put_format(&paddr, "p%.3s:%s:", address, port);
+        if (strchr(host, ':')) {
+            ds_put_format(&paddr, "[%s]", host);
+        } else {
+            ds_put_cstr(&paddr, host);
+        }
+        free(address);
+        return ds_steal_cstr(&paddr);
+    }
+}
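+
+/* Examples of the passive-address transformation above (illustrative only):
+ *
+ *     "unix:db.sock"          -> "punix:db.sock"
+ *     "tcp:192.168.0.1:6642"  -> "ptcp:6642:192.168.0.1"
+ *     "ssl:[::1]:6642"        -> "pssl:6642:[::1]"
+ *
+ * that is, the scheme gains a "p" prefix and, for TCP/SSL, the port moves in
+ * front of the host so the result is a valid passive (listening) address. */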
+
+static struct raft *
+raft_alloc(void)
+{
+    raft_init();
+
+    struct raft *raft = xzalloc(sizeof *raft);
+    hmap_node_nullify(&raft->hmap_node);
+    hmap_init(&raft->servers);
+    raft->log_start = raft->log_end = 1;
+    raft->role = RAFT_FOLLOWER;
+    sset_init(&raft->remote_addresses);
+    raft->join_timeout = LLONG_MAX;
+    ovs_mutex_init(&raft->fsync_mutex);
+    raft->fsync_request = seq_create();
+    raft->fsync_complete = seq_create();
+    ovs_list_init(&raft->waiters);
+    raft->listen_backoff = LLONG_MIN;
+    ovs_list_init(&raft->conns);
+    hmap_init(&raft->add_servers);
+    hmap_init(&raft->commands);
+
+    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
+    raft_reset_timer(raft);
+
+    return raft;
+}
+
+/* Creates an on-disk file that represents a new Raft cluster and initializes
+ * it to consist of a single server, the one on which this function is called.
+ *
+ * Creates the local copy of the cluster's log in 'file_name', which must not
+ * already exist.  Gives it the name 'name', which should be the database
+ * schema name and which is used only to match up this database with servers
+ * added to the cluster later if the cluster ID is unavailable.
+ *
+ * The new server is located at 'local_address', which must take one of the
+ * forms "tcp:IP[:PORT]" or "ssl:IP[:PORT]", where IP is an IPv4 address or a
+ * square bracket enclosed IPv6 address.  PORT, if present, is a port number
+ * that defaults to RAFT_PORT.
+ *
+ * This only creates the on-disk file.  Use raft_open() to start operating the
+ * new server.
+ *
+ * Returns null if successful, otherwise an ovsdb_error describing the
+ * problem. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_create_cluster(const char *file_name, const char *name,
+                    const char *local_address, const struct json *data)
+{
+    /* Parse and verify validity of the local address. */
+    struct ovsdb_error *error = raft_address_validate(local_address);
+    if (error) {
+        return error;
+    }
+
+    /* Create log file. */
+    struct ovsdb_log *log;
+    error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_CREATE_EXCL,
+                           -1, &log);
+    if (error) {
+        return error;
+    }
+
+    /* Write log file. */
+    struct uuid sid, cid, eid;
+    uuid_generate(&sid);
+    uuid_generate(&cid);
+    uuid_generate(&eid);
+
+    char sid_s[UUID_LEN + 1];
+    sprintf(sid_s, UUID_FMT, UUID_ARGS(&sid));
+
+    struct json *snapshot = json_object_create();
+    json_object_put_string(snapshot, "server_id", sid_s);
+    json_object_put_string(snapshot, "local_address", local_address);
+    json_object_put_string(snapshot, "name", name);
+
+    struct json *prev_servers = json_object_create();
+    json_object_put_string(prev_servers, sid_s, local_address);
+    json_object_put(snapshot, "prev_servers", prev_servers);
+
+    json_object_put_format(snapshot, "cluster_id", UUID_FMT, UUID_ARGS(&cid));
+    json_object_put(snapshot, "prev_term", json_integer_create(0));
+    json_object_put(snapshot, "prev_index", json_integer_create(1));
+    json_object_put(snapshot, "prev_data", json_clone(data));
+    json_object_put_format(snapshot, "prev_eid", UUID_FMT, UUID_ARGS(&eid));
+
+    error = ovsdb_log_write(log, snapshot);
+    json_destroy(snapshot);
+    if (!error) {
+        error = ovsdb_log_commit(log);
+    }
+    ovsdb_log_close(log);
+
+    return error;
+}
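+
+/* For illustration (not part of the original patch), the single record that
+ * raft_create_cluster() writes is a JSON object along these lines, where the
+ * UUIDs are freshly generated and the name is just an example:
+ *
+ *     {"server_id": "<sid>",
+ *      "local_address": "tcp:1.2.3.4",
+ *      "name": "OVN_Southbound",
+ *      "prev_servers": {"<sid>": "tcp:1.2.3.4"},
+ *      "cluster_id": "<cid>",
+ *      "prev_term": 0,
+ *      "prev_index": 1,
+ *      "prev_data": <the JSON passed in as 'data'>,
+ *      "prev_eid": "<eid>"}
+ */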
+
+/* Creates a database file that represents a new server in an existing Raft
+ * cluster.
+ *
+ * Creates the local copy of the cluster's log in 'file_name', which must not
+ * already exist.  Gives it the name 'name', which must be the same name
+ * passed in to raft_create_cluster() earlier.
+ *
+ * 'cid' is optional.  If specified, the new server will join only the cluster
+ * with the given cluster ID.
+ *
+ * The new server is located at 'local_address', which must take one of the
+ * forms "tcp:IP[:PORT]" or "ssl:IP[:PORT]", where IP is an IPv4 address or a
+ * square bracket enclosed IPv6 address.  PORT, if present, is a port number
+ * that defaults to RAFT_PORT.
+ *
+ * Joining the cluster requires contacting it.  Thus, 'remote_addresses'
+ * specifies the addresses of existing servers in the cluster.  One server out
+ * of the existing cluster is sufficient, as long as that server is reachable
+ * and not partitioned from the current cluster leader.  If multiple servers
+ * from the cluster are specified, then it is sufficient for any of them to
+ * meet this criterion.
+ *
+ * This only creates the on-disk file and does no network access.  Use
+ * raft_open() to start operating the new server.  (Until this happens, the
+ * new server has not joined the cluster.)
+ *
+ * Returns null if successful, otherwise an ovsdb_error describing the
+ * problem. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_join_cluster(const char *file_name,
+                  const char *name, const char *local_address,
+                  const struct sset *remote_addresses,
+                  const struct uuid *cid)
+{
+    ovs_assert(!sset_is_empty(remote_addresses));
+
+    /* Parse and verify validity of the addresses. */
+    struct ovsdb_error *error = raft_address_validate(local_address);
+    if (error) {
+        return error;
+    }
+    const char *addr;
+    SSET_FOR_EACH (addr, remote_addresses) {
+        error = raft_address_validate(addr);
+        if (error) {
+            return error;
+        }
+        if (!strcmp(addr, local_address)) {
+            return ovsdb_error(NULL, "remote addresses cannot be the same "
+                               "as the local address");
+        }
+    }
+
+    /* Verify validity of the cluster ID (if provided). */
+    if (cid && uuid_is_zero(cid)) {
+        return ovsdb_error(NULL, "all-zero UUID is not valid cluster ID");
+    }
+
+    /* Create log file. */
+    struct ovsdb_log *log;
+    error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_CREATE_EXCL,
+                           -1, &log);
+    if (error) {
+        return error;
+    }
+
+    /* Write log file. */
+    struct uuid sid;
+    uuid_generate(&sid);
+
+    struct json *snapshot = json_object_create();
+    json_object_put_format(snapshot, "server_id", UUID_FMT, UUID_ARGS(&sid));
+    json_object_put_string(snapshot, "local_address", local_address);
+    json_object_put_string(snapshot, "name", name);
+    json_object_put(snapshot, "remote_addresses",
+                    raft_addresses_to_json(remote_addresses));
+    if (cid) {
+        json_object_put_format(snapshot, "cluster_id",
+                               UUID_FMT, UUID_ARGS(cid));
+    }
+
+    error = ovsdb_log_write(log, snapshot);
+    json_destroy(snapshot);
+    if (!error) {
+        error = ovsdb_log_commit(log);
+    }
+    ovsdb_log_close(log);
+
+    return error;
+}
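+
+/* For illustration (not part of the original patch), the header record that
+ * raft_join_cluster() writes for a not-yet-joined server looks roughly like:
+ *
+ *     {"server_id": "<new sid>",
+ *      "local_address": "tcp:5.6.7.8",
+ *      "name": "OVN_Southbound",
+ *      "remote_addresses": ["tcp:1.2.3.4"],
+ *      "cluster_id": "<cid, only if one was provided>"}
+ *
+ * There is no snapshot or log yet; raft_read_header() recognizes the
+ * "remote_addresses" key and puts the server into the 'joining' state. */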
+
+struct ovsdb_error *
+raft_read_metadata(const char *file_name, struct raft_metadata *md)
+{
+    struct raft *raft = raft_alloc();
+    struct ovsdb_error *error = ovsdb_log_open(file_name, RAFT_MAGIC,
+                                               OVSDB_LOG_READ_ONLY, -1,
+                                               &raft->log);
+    if (error) {
+        goto exit;
+    }
+
+    error = raft_read_header(raft);
+    if (error) {
+        goto exit;
+    }
+
+    md->sid = raft->sid;
+    md->name = xstrdup(raft->name);
+    md->local = xstrdup(raft->local_address);
+    md->cid = raft->cid;
+
+exit:
+    if (error) {
+        memset(md, 0, sizeof *md);
+    }
+    raft_close(raft);
+    return error;
+}
+
+void
+raft_metadata_destroy(struct raft_metadata *md)
+{
+    if (md) {
+        free(md->name);
+        free(md->local);
+    }
+}
+
+static struct json *
+raft_entry_to_json_with_index(const struct raft *raft, uint64_t index)
+{
+    ovs_assert(index >= raft->log_start && index < raft->log_end);
+    struct json *json = raft_entry_to_json(&raft->entries[index
+                                                          - raft->log_start]);
+    raft_put_uint64(json, "index", index);
+    return json;
+}
+
+static const struct raft_entry *
+raft_get_entry(const struct raft *raft, uint64_t index)
+{
+    ovs_assert(index >= raft->log_start);
+    ovs_assert(index < raft->log_end);
+    return &raft->entries[index - raft->log_start];
+}
+
+static uint64_t
+raft_get_term(const struct raft *raft, uint64_t index)
+{
+    return (index == raft->log_start - 1
+            ? raft->snap.term
+            : raft_get_entry(raft, index)->term);
+}
+
+static const struct uuid *
+raft_get_eid(const struct raft *raft, uint64_t index)
+{
+    return (index == raft->log_start - 1
+            ? &raft->snap.eid
+            : &raft_get_entry(raft, index)->eid);
+}
+
+static struct json *
+raft_servers_for_index(const struct raft *raft, uint64_t index)
+{
+    ovs_assert(index >= raft->log_start - 1);
+    ovs_assert(index < raft->log_end);
+
+    const struct json *servers = raft->snap.servers;
+    for (uint64_t i = raft->log_start; i <= index; i++) {
+        const struct raft_entry *e = raft_get_entry(raft, i);
+        if (e->servers) {
+            servers = e->servers;
+        }
+    }
+    return json_clone(servers);
+}
+
+static void
+raft_set_servers(struct raft *raft, const struct hmap *new_servers,
+                 enum vlog_level level)
+{
+    struct raft_server *s, *next;
+    HMAP_FOR_EACH_SAFE (s, next, hmap_node, &raft->servers) {
+        if (!raft_find_server__(new_servers, &s->sid)) {
+            if (raft->me == s) {
+                raft->me = NULL;
+                /* XXX */
+            }
+            /* XXX raft->leader */
+            /* XXX raft->remove_server */
+            hmap_remove(&raft->servers, &s->hmap_node);
+            VLOG(level, "server "SID_FMT" removed from configuration",
+                 SID_ARGS(&s->sid));
+            raft_server_destroy(s);
+        }
+    }
+
+    HMAP_FOR_EACH_SAFE (s, next, hmap_node, new_servers) {
+        if (!raft_find_server__(&raft->servers, &s->sid)) {
+            VLOG(level, "server "SID_FMT" added to configuration",
+                 SID_ARGS(&s->sid));
+
+            struct raft_server *new = xzalloc(sizeof *new);
+            new->sid = s->sid;
+            new->address = xstrdup(s->address);
+            new->vote = UUID_ZERO;  /* XXX conservative */
+            raft_server_init_leader(raft, new);
+            hmap_insert(&raft->servers, &new->hmap_node, uuid_hash(&new->sid));
+
+            if (uuid_equals(&raft->sid, &new->sid)) {
+                raft->me = new;
+            }
+        }
+    }
+}
+
+static struct raft_entry *
+raft_add_entry(struct raft *raft,
+               uint64_t term, struct json *data, const struct uuid *eid,
+               struct json *servers)
+{
+    if (raft->log_end - raft->log_start >= raft->allocated_log) {
+        raft->entries = x2nrealloc(raft->entries, &raft->allocated_log,
+                                   sizeof *raft->entries);
+    }
+
+    struct raft_entry *entry
+        = &raft->entries[raft->log_end++ - raft->log_start];
+    entry->term = term;
+    entry->data = data;
+    entry->eid = eid ? *eid : UUID_ZERO;
+    entry->servers = servers;
+    return entry;
+}
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_write_entry(struct raft *raft, uint64_t term, struct json *data,
+                 const struct uuid *eid, struct json *servers)
+{
+    /* XXX  when one write fails we need to make all subsequent writes fail (or
+     * just not attempt them) since omitting some writes is fatal */
+
+    raft_add_entry(raft, term, data, eid, servers);
+    struct json *json = raft_entry_to_json_with_index(raft, raft->log_end - 1);
+    struct ovsdb_error *error = ovsdb_log_write(raft->log, json);
+    json_destroy(json);
+
+    if (error) {
+        /* XXX? */
+        raft_entry_destroy(&raft->entries[--raft->log_end]);
+    }
+
+    return error;
+}
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_write_state(struct ovsdb_log *log,
+                 uint64_t term, const struct uuid *vote)
+{
+    struct json *json = json_object_create();
+    raft_put_uint64(json, "term", term);
+    if (vote && !uuid_is_zero(vote)) {
+        json_object_put_format(json, "vote", UUID_FMT, UUID_ARGS(vote));
+    }
+    struct ovsdb_error *error = ovsdb_log_write(log, json);
+    json_destroy(json);
+
+    return error;
+}
+
+static void
+raft_parse_log_record__(struct raft *raft, struct ovsdb_parser *p)
+{
+    if (raft_parse_optional_boolean(p, "left") == 1) {
+        ovsdb_parser_raise_error(
+            p, "server has left the cluster and cannot be added back; use "
+            "\"ovsdb-tool join-cluster\" to add a new server");
+        return;
+    }
+
+    /* All log records include "term", plus at most one of:
+     *
+     *     - "index" plus zero or more of "data" and "servers".
+     *
+     *     - "vote".
+     */
+
+    /* Parse "term".
+     *
+     * A Raft leader can replicate entries from previous terms to the other
+     * servers in the cluster, retaining the original terms on those entries
+     * (see section 3.6.2 "Committing entries from previous terms" for more
+     * information), so it's OK for the term in a log record to precede the
+     * current term. */
+    uint64_t term = raft_parse_uint64(p, "term");
+    if (term > raft->current_term) {
+        raft->current_term = term;
+        raft->voted_for = UUID_ZERO;
+    }
+
+    /* Parse "vote". */
+    struct uuid vote;
+    if (raft_parse_optional_uuid(p, "vote", &vote)) {
+        if (uuid_is_zero(&raft->voted_for)) {
+            raft->voted_for = vote;
+        } else if (!uuid_equals(&raft->voted_for, &vote)) {
+            ovsdb_parser_raise_error(p, "log entry term %"PRIu64 " votes for "
+                                     "both "SID_FMT" and "SID_FMT, term,
+                                     SID_ARGS(&raft->voted_for),
+                                     SID_ARGS(&vote));
+        }
+        return;
+    }
+
+    /* Parse "index". */
+    const struct json *index_json = ovsdb_parser_member(
+        p, "index", OP_INTEGER | OP_OPTIONAL);
+    if (!index_json) {
+        return;
+    }
+    uint64_t index = json_integer(index_json);
+    if (index < raft->log_end) {
+        /* XXX log that the log gets truncated? */
+        raft_truncate(raft, index);
+    } else if (index > raft->log_end) {
+        ovsdb_parser_raise_error(p, "log entry index %"PRIu64" skips past "
+                                 "expected %"PRIu64, index, raft->log_end);
+    }
+
+    /* This log record includes a Raft log entry, as opposed to just advancing
+     * the term or marking a vote.  Therefore, the term must not precede the
+     * term of the previous log entry. */
+    uint64_t prev_term = (raft->log_end > raft->log_start
+                          ? raft->entries[raft->log_end
+                                          - raft->log_start - 1].term
+                          : raft->snap.term);
+    if (term < prev_term) {
+        ovsdb_parser_raise_error(p, "log entry index %"PRIu64" term "
+                                 "%"PRIu64" precedes previous entry's term "
+                                 "%"PRIu64, index, term, prev_term);
+    }
+
+    /* Parse "servers", if present.*/
+    const struct json *servers = ovsdb_parser_member(
+        p, "servers", OP_OBJECT | OP_OPTIONAL);
+    if (servers) {
+        ovsdb_parser_put_error(p, raft_servers_validate_json(servers));
+    }
+
+    /* Parse "data", if present. */
+    const struct json *data = ovsdb_parser_member(
+        p, "data", OP_OBJECT | OP_ARRAY | OP_OPTIONAL);
+    struct uuid eid = data ? raft_parse_required_uuid(p, "eid") : UUID_ZERO;
+
+    /* Add log entry. */
+    if (!ovsdb_parser_has_error(p)) {
+        raft_add_entry(raft, term,
+                       data ? json_clone(data) : NULL, &eid,
+                       servers ? json_clone(servers) : NULL);
+    }
+}
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_parse_log_record(struct raft *raft, const struct json *entry)
+{
+    struct ovsdb_parser p;
+    ovsdb_parser_init(&p, entry, "raft log entry");
+    raft_parse_log_record__(raft, &p);
+    return ovsdb_parser_finish(&p);
+}
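+
+/* Illustrative examples (not from the original patch) of the log records
+ * accepted above.  A record that only advances the term and/or records a
+ * vote looks like:
+ *
+ *     {"term": 7, "vote": "<sid voted for>"}
+ *
+ * while a record that appends a real log entry looks like:
+ *
+ *     {"term": 7, "index": 42, "data": {...}, "eid": "<entry uuid>",
+ *      "servers": {"<sid>": "<address>", ...}}
+ *
+ * where "data" and "servers" are each optional and "eid" is required exactly
+ * when "data" is present. */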
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_read_header(struct raft *raft)
+{
+    /* Read header record. */
+    struct json *header;
+    struct ovsdb_error *error = ovsdb_log_read(raft->log, &header);
+    if (error || !header) {
+        /* Report error or end-of-file. */
+        return error;
+    }
+    ovsdb_log_mark_base(raft->log);
+
+    struct ovsdb_parser p;
+    ovsdb_parser_init(&p, header, "raft header");
+
+    /* Parse always-required fields. */
+    raft->sid = raft_parse_required_uuid(&p, "server_id");
+    raft->name = nullable_xstrdup(raft_parse_required_string(&p, "name"));
+    raft->local_address = nullable_xstrdup(
+        raft_parse_required_string(&p, "local_address"));
+
+    /* Parse "remotes", if present.
+     *
+     * If this is present, then this database file is for the special case of a
+     * server that was created with "ovsdb-tool join-cluster" and has not yet
+ * joined its cluster. */
+    const struct json *remote_addresses
+        = ovsdb_parser_member(&p, "remote_addresses", OP_ARRAY | OP_OPTIONAL);
+    if (remote_addresses) {
+        raft->joining = true;
+        error = raft_addresses_from_json(remote_addresses,
+                                         &raft->remote_addresses);
+        if (!error
+            && sset_find_and_delete(&raft->remote_addresses,
+                                    raft->local_address)
+            && sset_is_empty(&raft->remote_addresses)) {
+            error = ovsdb_error(
+                NULL, "at least one remote address (other than the "
+                "local address) is required");
+        }
+    } else {
+        /* Parse required set of servers. */
+        const struct json *servers = ovsdb_parser_member(
+            &p, "prev_servers", OP_OBJECT);
+        error = raft_servers_validate_json(servers);
+        ovsdb_parser_put_error(&p, error);
+        if (!error) {
+            raft->snap.servers = json_clone(servers);
+        }
+
+        /* Parse term, index, and snapshot.  If any of these is present, all of
+         * them must be. */
+        const struct json *snapshot = ovsdb_parser_member(&p, "prev_data",
+                                                          OP_ANY | OP_OPTIONAL);
+        if (snapshot) {
+            raft->snap.eid = raft_parse_required_uuid(&p, "prev_eid");
+            raft->snap.term = raft_parse_uint64(&p, "prev_term");
+            raft->log_start = raft->log_end
+                = raft_parse_uint64(&p, "prev_index") + 1;
+            raft->commit_index = raft->log_start - 1;
+            raft->last_applied = raft->log_start - 2;
+            raft->snap.data = json_clone(snapshot);
+        }
+    }
+
+    /* Parse cluster ID.  If we're joining a cluster, this is optional,
+     * otherwise it is mandatory. */
+    raft_parse_uuid__(&p, "cluster_id", raft->joining, &raft->cid);
+
+    error = ovsdb_parser_finish(&p);
+    json_destroy(header);
+    return error;
+}
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_read_log(struct raft *raft)
+{
+    for (;;) {
+        struct json *entry;
+        struct ovsdb_error *error = ovsdb_log_read(raft->log, &entry);
+        if (!entry) {
+            if (error) {
+                /* We assume that the error is due to a partial write while
+                 * appending to the file before a crash, so log it and
+                 * continue. */
+                char *error_string = ovsdb_error_to_string_free(error);
+                VLOG_WARN("%s", error_string);
+                free(error_string);
+                error = NULL;
+            }
+            break;
+        }
+
+        error = raft_parse_log_record(raft, entry);
+        if (error) {
+            return error;
+        }
+    }
+
+    /* Set the most recent servers. */
+    raft_get_servers_from_log(raft);
+
+    return NULL;
+}
+
+static void
+raft_reset_timer(struct raft *raft)
+{
+    unsigned int duration = (ELECTION_BASE_MSEC
+                             + random_range(ELECTION_RANGE_MSEC));
+    raft->election_base = time_msec();
+    raft->election_timeout = raft->election_base + duration;
+}
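+
+/* For example (illustrative only): with ELECTION_BASE_MSEC and
+ * ELECTION_RANGE_MSEC both 1024, each call above schedules the next election
+ * at a uniformly random point between 1024 and 2047 ms from now, so that
+ * followers are unlikely to time out and become candidates simultaneously. */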
+
+static void
+raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
+              const struct uuid *sid, bool incoming)
+{
+    struct raft_conn *conn = xzalloc(sizeof *conn);
+    ovs_list_push_back(&raft->conns, &conn->list_node);
+    conn->js = js;
+    if (sid) {
+        conn->sid = *sid;
+    }
+    conn->incoming = incoming;
+    conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
+}
+
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_open(const char *file_name, struct raft **raftp)
+{
+    struct ovsdb_log *log;
+    struct ovsdb_error *error;
+
+    *raftp = NULL;
+    error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_READ_WRITE,
+                           -1, &log);
+    return error ? error : raft_open__(log, raftp);
+}
+
+/* Starts the local server in an existing Raft cluster, using the local copy of
+ * the cluster's log in 'file_name'.  Takes ownership of 'log', whether
+ * successful or not. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_open__(struct ovsdb_log *log, struct raft **raftp)
+{
+    struct raft *raft = raft_alloc();
+    raft->log = log;
+
+    raft->fsync_thread_running = true;
+    raft->fsync_thread = ovs_thread_create("raft_fsync",
+                                           raft_fsync_thread, raft);
+
+    struct ovsdb_error *error = raft_read_header(raft);
+    if (error) {
+        goto error;
+    }
+
+    if (!raft->joining) {
+        error = raft_read_log(raft);
+        if (error) {
+            goto error;
+        }
+
+        /* Find our own server.
+         *
+         * XXX It seems that this could fail if the server is restarted during
+         * the process of removing it but before removal is committed, what to
+         * do about that? */
+        raft->me = raft_find_server__(&raft->servers, &raft->sid);
+        if (!raft->me) {
+            error = ovsdb_error(NULL, "server does not belong to cluster");
+            goto error;
+        }
+    } else {
+        raft->join_timeout = time_msec() + 1000;
+    }
+
+    *raftp = raft;
+    hmap_insert(&all_rafts, &raft->hmap_node, hash_string(raft->name, 0));
+    return NULL;
+
+error:
+    raft_close(raft);
+    *raftp = NULL;
+    return error;
+}
+
+const char *
+raft_get_name(const struct raft *raft)
+{
+    return raft->name;
+}
+
+const struct uuid *
+raft_get_cid(const struct raft *raft)
+{
+    return &raft->cid;
+}
+
+const struct uuid *
+raft_get_sid(const struct raft *raft)
+{
+    return &raft->sid;
+}
+
+bool
+raft_is_connected(const struct raft *raft)
+{
+    return (raft->role != RAFT_CANDIDATE
+            && !raft->joining
+            && !raft->leaving
+            && !raft->left);
+}
+
+bool
+raft_is_leader(const struct raft *raft)
+{
+    return raft->role == RAFT_LEADER;
+}
+
+bool
+raft_is_joining(const struct raft *raft)
+{
+    return raft->joining;
+}
+
+static struct raft_conn *
+raft_find_conn_by_sid(struct raft *raft, const struct uuid *sid)
+{
+    if (!uuid_is_zero(sid)) {
+        struct raft_conn *conn;
+        LIST_FOR_EACH (conn, list_node, &raft->conns) {
+            if (uuid_equals(sid, &conn->sid)) {
+                return conn;
+            }
+        }
+    }
+    return NULL;
+}
+
+static struct raft_conn *
+raft_find_conn_by_address(struct raft *raft, const char *address)
+{
+    struct raft_conn *conn;
+    LIST_FOR_EACH (conn, list_node, &raft->conns) {
+        if (!strcmp(jsonrpc_session_get_name(conn->js), address)) {
+            return conn;
+        }
+    }
+    return NULL;
+}
+
+/* If we're leader, try to transfer leadership to another server. */
+void
+raft_transfer_leadership(struct raft *raft)
+{
+    if (raft->role != RAFT_LEADER) {
+        return;
+    }
+
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (!uuid_equals(&raft->sid, &s->sid)
+            && s->phase == RAFT_PHASE_STABLE) {
+            struct raft_conn *conn = raft_find_conn_by_sid(raft, &s->sid);
+            if (!conn || !jsonrpc_session_is_connected(conn->js)) {
+                continue;
+            }
+
+            union raft_rpc rpc = {
+                .become_leader = {
+                    .common = {
+                        .type = RAFT_RPC_BECOME_LEADER,
+                        .sid = s->sid,
+                    },
+                    .term = raft->current_term,
+                }
+            };
+            raft_send__(raft, &rpc, conn->js);
+            break;
+        }
+    }
+}
+
+/* Send a RemoveServerRequest to the rest of the servers in the cluster.
+ *
+ * If we know which server is the leader, we can just send the request to it.
+ * However, we might not know which server is the leader, and we might never
+ * find out if the remove request was actually previously committed by a
+ * majority of the servers (because in that case the new leader will not send
+ * AppendRequests or heartbeats to us).  Therefore, we instead send
+ * RemoveServerRequests to every server.  This theoretically has the same
+ * problem, if the current cluster leader was not previously a member of the
+ * cluster, but
+ * it seems likely to be more robust in practice.  */
+static void
+raft_send_remove_server_requests(struct raft *raft)
+{
+    VLOG_INFO("sending remove request (joining=%s, leaving=%s)",
+              raft->joining ? "true" : "false",
+              raft->leaving ? "true" : "false");
+    const struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (s != raft->me) {
+            union raft_rpc rpc = (union raft_rpc) {
+                .remove_server_request = {
+                    .common = {
+                        .type = RAFT_RPC_REMOVE_SERVER_REQUEST,
+                        .sid = s->sid,
+                    },
+                    .sid = raft->sid,
+                },
+            };
+            raft_send(raft, &rpc);
+        }
+    }
+
+    raft->leave_timeout = time_msec() + ELECTION_BASE_MSEC;
+}
+
+void
+raft_leave(struct raft *raft)
+{
+    ovs_assert(!raft->joining);
+    if (raft->leaving) {
+        return;
+    }
+
+    VLOG_INFO(SID_FMT": starting to leave cluster "CID_FMT,
+              SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
+    raft->leaving = true;
+    raft_transfer_leadership(raft);
+    raft_become_follower(raft);
+    raft_send_remove_server_requests(raft);
+    raft->leave_timeout = time_msec() + ELECTION_BASE_MSEC;
+}
+
+bool
+raft_is_leaving(const struct raft *raft)
+{
+    return raft->leaving;
+}
+
+bool
+raft_left(const struct raft *raft)
+{
+    return raft->left;
+}
+
+void
+raft_take_leadership(struct raft *raft)
+{
+    if (raft->role != RAFT_LEADER) {
+        raft_start_election(raft, true);
+    }
+}
+
+static void
+raft_close__(struct raft *raft)
+{
+    if (!hmap_node_is_null(&raft->hmap_node)) {
+        hmap_remove(&all_rafts, &raft->hmap_node);
+        hmap_node_nullify(&raft->hmap_node);
+    }
+
+    raft_complete_all_commands(raft, RAFT_CMD_SHUTDOWN);
+
+    struct raft_server *rs = raft->remove_server;
+    if (rs) {
+        raft_send_remove_server_reply__(raft, &rs->sid, &rs->requester_sid,
+                                        rs->requester_conn, false,
+                                        RAFT_SERVER_SHUTDOWN);
+        raft_server_destroy(raft->remove_server);
+    }
+
+    ovs_mutex_lock(&raft->fsync_mutex);
+    raft->fsync_next = UINT64_MAX;
+    ovs_mutex_unlock(&raft->fsync_mutex);
+    seq_change(raft->fsync_request);
+    if (raft->fsync_thread_running) {
+        xpthread_join(raft->fsync_thread, NULL);
+        raft->fsync_thread_running = false;
+    }
+
+    struct raft_conn *conn, *next;
+    LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) {
+        jsonrpc_session_close(conn->js);
+        ovs_list_remove(&conn->list_node);
+        free(conn);
+    }
+}
+
+void
+raft_close(struct raft *raft)
+{
+    if (!raft) {
+        return;
+    }
+
+    raft_transfer_leadership(raft);
+
+    raft_close__(raft);
+
+    ovsdb_log_close(raft->log);
+
+    raft_servers_destroy(&raft->servers);
+
+    for (uint64_t index = raft->log_start; index < raft->log_end; index++) {
+        struct raft_entry *e = &raft->entries[index - raft->log_start];
+        raft_entry_destroy(e);
+    }
+    free(raft->entries);
+
+    raft_entry_destroy(&raft->snap);
+
+    raft_servers_destroy(&raft->add_servers);
+
+    sset_destroy(&raft->remote_addresses);
+    free(raft->local_address);
+    free(raft->name);
+
+    free(raft);
+}
+
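+
+/* Attempts to receive one RPC for 'raft' from 'js'.  If successful, parses it
+ * into '*rpc', learns or sanity-checks the remote server ID in '*sid', and
+ * returns true.  Returns false if no message is available or if it cannot be
+ * parsed as a Raft RPC. */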
+static bool
+raft_receive_rpc(struct raft *raft, struct jsonrpc_session *js,
+                 struct uuid *sid, union raft_rpc *rpc)
+{
+    struct jsonrpc_msg *msg = jsonrpc_session_recv(js);
+    if (!msg) {
+        return false;
+    }
+
+    struct ovsdb_error *error = raft_rpc_from_jsonrpc(&raft->cid, &raft->sid,
+                                                      msg, rpc);
+    if (error) {
+        char *s = ovsdb_error_to_string_free(error);
+        VLOG_INFO("%s: %s", jsonrpc_session_get_name(js), s);
+        free(s);
+        return false;
+    }
+
+    if (uuid_is_zero(sid)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        *sid = rpc->common.sid;
+        VLOG_INFO_RL(&rl, "%s: learned server ID "SID_FMT,
+                     jsonrpc_session_get_name(js), SID_ARGS(sid));
+    } else if (!uuid_equals(sid, &rpc->common.sid)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+        VLOG_WARN_RL(&rl,
+                     "%s: remote server ID changed from "SID_FMT" to "SID_FMT,
+                     jsonrpc_session_get_name(js),
+                     SID_ARGS(sid), SID_ARGS(&rpc->common.sid));
+    }
+
+    return true;
+}
+
+static void
+raft_send_add_server_request(struct raft *raft, struct jsonrpc_session *js)
+{
+    union raft_rpc rq = {
+        .add_server_request = {
+            .common = {
+                .type = RAFT_RPC_ADD_SERVER_REQUEST,
+                .sid = UUID_ZERO,
+                .comment = NULL,
+            },
+            .address = raft->local_address,
+        },
+    };
+    raft_send__(raft, &rq, js);
+}
+
+static void
+log_rpc(const struct raft *raft, const union raft_rpc *rpc,
+        const char *direction)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(600, 600);
+    if (!raft_rpc_is_heartbeat(rpc) && !VLOG_DROP_DBG(&rl)) {
+        struct ds s = DS_EMPTY_INITIALIZER;
+        raft_rpc_format(rpc, &s);
+        VLOG_DBG(SID_FMT"%s%s", SID_ARGS(&raft->sid), direction,
+                 ds_cstr(&s));
+        ds_destroy(&s);
+    }
+}
+
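+
+/* Runs one round of processing for session 'js': (re)sends an
+ * AddServerRequest while joining, greets a newly connected peer (or asks it
+ * to remove us if we are leaving), and then receives and dispatches a bounded
+ * batch of incoming RPCs. */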
+static void
+raft_run_session(struct raft *raft,
+                 struct jsonrpc_session *js, unsigned int *seqno,
+                 struct uuid *sid)
+{
+    jsonrpc_session_run(js);
+
+    bool just_connected = false;
+    if (seqno) {
+        unsigned int new_seqno = jsonrpc_session_get_seqno(js);
+        if (new_seqno != *seqno && jsonrpc_session_is_connected(js)) {
+            *seqno = new_seqno;
+            just_connected = true;
+        }
+    }
+
+    if (raft->joining
+        && (just_connected || time_msec() >= raft->join_timeout)) {
+        raft_send_add_server_request(raft, js);
+    } else if (just_connected) {
+        if (raft->leaving) {
+            union raft_rpc rq = {
+                .remove_server_request = {
+                    .common = {
+                        .type = RAFT_RPC_REMOVE_SERVER_REQUEST,
+                        .sid = *sid,
+                    },
+                    .sid = raft->sid,
+                },
+            };
+            raft_send__(raft, &rq, js);
+        } else {
+            union raft_rpc rq = (union raft_rpc) {
+                .common = {
+                    .type = RAFT_RPC_HELLO_REQUEST,
+                    .sid = *sid,
+                },
+            };
+            raft_send__(raft, &rq, js);
+        }
+    }
+
+    for (size_t i = 0; i < 50; i++) {
+        union raft_rpc rpc;
+        if (!raft_receive_rpc(raft, js, sid, &rpc)) {
+            break;
+        }
+
+        log_rpc(raft, &rpc, "<--");
+        raft_handle_rpc(raft, &rpc);
+        raft_rpc_destroy(&rpc);
+    }
+}
+
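+
+/* Invoked when the log write associated with waiter 'w' has reached disk.
+ * Depending on the waiter type, this acknowledges our own entry as leader,
+ * sends the deferred AppendEntries reply, or sends the deferred vote reply. */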
+static void
+raft_waiter_complete(struct raft *raft, struct raft_waiter *w)
+{
+    switch (w->type) {
+    case RAFT_W_COMMAND:
+        if (raft->role == RAFT_LEADER) {
+            raft_update_match_index(raft, raft->me, w->command.index);
+        }
+        break;
+
+    case RAFT_W_APPEND:
+        raft_send_append_reply(raft, w->append.rq, RAFT_APPEND_OK,
+                               "log updated");
+        break;
+
+    case RAFT_W_VOTE:
+        if (!uuid_is_zero(&raft->voted_for)
+            && !uuid_equals(&raft->voted_for, &raft->sid)) {
+            raft_send_vote_reply(raft, &raft->voted_for, &raft->voted_for);
+        }
+        break;
+    }
+}
+
+static void
+raft_waiter_destroy(struct raft_waiter *w)
+{
+    if (!w) {
+        return;
+    }
+
+    switch (w->type) {
+    case RAFT_W_COMMAND:
+        raft_command_unref(w->command.cmd);
+        break;
+
+    case RAFT_W_APPEND:
+        free(w->append.rq);
+        break;
+
+    case RAFT_W_VOTE:
+        break;
+    }
+    free(w);
+}
+
+static void
+raft_waiters_run(struct raft *raft)
+{
+    if (ovs_list_is_empty(&raft->waiters)) {
+        return;
+    }
+
+    ovs_mutex_lock(&raft->fsync_mutex);
+    uint64_t cur = raft->fsync_cur;
+    ovs_mutex_unlock(&raft->fsync_mutex);
+
+    struct raft_waiter *w, *next;
+    LIST_FOR_EACH_SAFE (w, next, list_node, &raft->waiters) {
+        if (cur < w->fsync_seqno) {
+            break;
+        }
+        raft_waiter_complete(raft, w);
+        ovs_list_remove(&w->list_node);
+        raft_waiter_destroy(w);
+    }
+}
+
+static void
+raft_waiters_wait(struct raft *raft)
+{
+    if (ovs_list_is_empty(&raft->waiters)) {
+        return;
+    }
+
+    uint64_t complete = seq_read(raft->fsync_complete);
+
+    ovs_mutex_lock(&raft->fsync_mutex);
+    uint64_t cur = raft->fsync_cur;
+    ovs_mutex_unlock(&raft->fsync_mutex);
+
+    struct raft_waiter *w, *next;
+    LIST_FOR_EACH_SAFE (w, next, list_node, &raft->waiters) {
+        if (cur < w->fsync_seqno) {
+            seq_wait(raft->fsync_complete, complete);
+        } else {
+            poll_immediate_wake();
+        }
+        break;
+    }
+}
+
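+
+/* Updates the current term to 'term' and the current vote to 'vote' (or to no
+ * vote, if 'vote' is NULL), writing the new state to the log. */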
+static void
+raft_set_term(struct raft *raft, uint64_t term, const struct uuid *vote)
+{
+    struct ovsdb_error *error = raft_write_state(raft->log, term, vote);
+    if (error) {
+        /* XXX */
+    }
+    /* XXX need to commit before replying */
+    raft->current_term = term;
+    raft->voted_for = vote ? *vote : UUID_ZERO;
+}
+
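+
+/* Records that server 's' cast its vote for 'vote'.  If the vote is for this
+ * server and a majority of the servers have now voted for it, this server
+ * becomes leader. */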
+static void
+raft_accept_vote(struct raft *raft, struct raft_server *s,
+                 const struct uuid *vote)
+{
+    if (uuid_equals(&s->vote, vote)) {
+        return;
+    }
+    if (!uuid_is_zero(&s->vote)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+        VLOG_WARN_RL(&rl, "server "SID_FMT" changed its vote from "SID_FMT
+                     " to "SID_FMT, SID_ARGS(&s->sid),
+                     SID_ARGS(&s->vote), SID_ARGS(vote));
+    }
+    s->vote = *vote;
+    if (uuid_equals(vote, &raft->sid)
+        && ++raft->n_votes > hmap_count(&raft->servers) / 2) {
+        raft_become_leader(raft);
+    }
+}
+
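+
+/* Becomes a candidate: starts a new term, votes for itself, and sends a
+ * VoteRequest to every other server.  'leadership_transfer' is carried in the
+ * VoteRequests so that receivers can distinguish an election initiated by a
+ * leadership transfer from one caused by an election timeout. */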
+static void
+raft_start_election(struct raft *raft, bool leadership_transfer)
+{
+    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
+
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+    ovs_assert(raft->role != RAFT_LEADER);
+    ovs_assert(hmap_is_empty(&raft->commands));
+    raft->role = RAFT_CANDIDATE;
+
+    /* XXX what if we're not part of the server set? */
+
+    raft_set_term(raft, raft->current_term + 1, &raft->sid);
+    raft->n_votes = 0;
+
+    if (!VLOG_DROP_INFO(&rl)) {
+        long long int now = time_msec();
+        if (now >= raft->election_timeout) {
+            VLOG_INFO("term %"PRIu64": %lld ms timeout expired, "
+                      "starting election",
+                      raft->current_term, now - raft->election_base);
+        } else {
+            VLOG_INFO("term %"PRIu64": starting election", raft->current_term);
+        }
+    }
+    raft_reset_timer(raft);
+
+    struct raft_server *peer;
+    HMAP_FOR_EACH (peer, hmap_node, &raft->servers) {
+        peer->vote = UUID_ZERO;
+        if (peer == raft->me) {
+            continue;
+        }
+
+        union raft_rpc rq = {
+            .vote_request = {
+                .common = {
+                    .type = RAFT_RPC_VOTE_REQUEST,
+                    .sid = peer->sid,
+                },
+                .term = raft->current_term,
+                .last_log_index = raft->log_end - 1,
+                .last_log_term = (
+                    raft->log_end > raft->log_start
+                    ? raft->entries[raft->log_end - raft->log_start - 1].term
+                    : raft->snap.term),
+                .leadership_transfer = leadership_transfer,
+            },
+        };
+        raft_send(raft, &rq);
+    }
+
+    /* Vote for ourselves.
+     * XXX only if we're not being removed? */
+    raft_accept_vote(raft, raft->me, &raft->sid);
+
+    /* XXX how do we handle outstanding waiters? */
+}
+
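+
+/* Opens an outgoing connection to 'address', expected to belong to the server
+ * with ID 'sid' (if nonnull), unless 'address' is our own address or a
+ * connection to it already exists. */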
+static void
+raft_open_conn(struct raft *raft, const char *address, const struct uuid *sid)
+{
+    if (strcmp(address, raft->local_address)
+        && !raft_find_conn_by_address(raft, address)) {
+        raft_add_conn(raft, jsonrpc_session_open(address, true), sid, false);
+    }
+}
+
+void
+raft_run(struct raft *raft)
+{
+    if (raft->left) {
+        return;
+    }
+
+    raft_waiters_run(raft);
+
+    if (!raft->listener && time_msec() >= raft->listen_backoff) {
+        char *paddr = raft_make_address_passive(raft->local_address);
+        int error = pstream_open(paddr, &raft->listener, DSCP_DEFAULT);
+        if (error) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+            VLOG_WARN_RL(&rl, "%s: listen failed (%s)",
+                         paddr, ovs_strerror(error));
+            raft->listen_backoff = time_msec() + 1000;
+        }
+        free(paddr);
+    }
+
+    if (raft->listener) {
+        struct stream *stream;
+        int error = pstream_accept(raft->listener, &stream);
+        if (!error) {
+            raft_add_conn(raft, jsonrpc_session_open_unreliably(
+                              jsonrpc_open(stream), DSCP_DEFAULT), NULL,
+                          true);
+        } else if (error != EAGAIN) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+            VLOG_WARN_RL(&rl, "%s: accept failed: %s",
+                         pstream_get_name(raft->listener),
+                         ovs_strerror(error));
+        }
+    }
+
+    /* Run RPCs for all open sessions. */
+    struct raft_conn *conn;
+    LIST_FOR_EACH (conn, list_node, &raft->conns) {
+        raft_run_session(raft, conn->js, &conn->js_seqno, &conn->sid);
+    }
+
+    /* Close unneeded sessions. */
+    struct raft_conn *next;
+    LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) {
+        /* Keep all incoming sessions (that are still alive) and outgoing
+         * sessions to a server in the current set or, if we're joining, to one
+         * of the remote addresses. */
+        if (jsonrpc_session_is_alive(conn->js)
+            && (conn->incoming
+                || raft_find_server(raft, &conn->sid)
+                || (raft->joining
+                    && sset_contains(&raft->remote_addresses,
+                                     jsonrpc_session_get_name(conn->js))))) {
+            continue;
+        }
+
+        jsonrpc_session_close(conn->js);
+        ovs_list_remove(&conn->list_node);
+        free(conn);
+    }
+
+    /* Open needed sessions. */
+    struct raft_server *server;
+    HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
+        raft_open_conn(raft, server->address, &server->sid);
+    }
+    if (raft->joining) {
+        const char *address;
+        SSET_FOR_EACH (address, &raft->remote_addresses) {
+            raft_open_conn(raft, address, NULL);
+        }
+    }
+
+    if (!raft->joining && time_msec() >= raft->election_timeout) {
+        raft_start_election(raft, false);
+    }
+
+    if (raft->leaving && time_msec() >= raft->leave_timeout) {
+        raft_send_remove_server_requests(raft);
+    }
+
+    if (raft->joining && time_msec() >= raft->join_timeout) {
+        raft->join_timeout = time_msec() + 1000;
+    }
+
+    if (time_msec() >= raft->ping_timeout) {
+        if (raft->role == RAFT_LEADER) {
+            /* XXX send only if idle */
+            raft_send_heartbeats(raft);
+        } else {
+            long long int now = time_msec();
+            struct raft_command *cmd, *next;
+            HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) {
+                if (cmd->timestamp
+                    && now - cmd->timestamp > ELECTION_BASE_MSEC) {
+                    raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
+                }
+            }
+        }
+        raft->ping_timeout = time_msec() + PING_TIME_MSEC;
+    }
+
+    /* Do this only at the end; if we did it as soon as we set raft->left in
+     * handling the RemoveServerReply, then it could easily cause references to
+     * freed memory in RPC sessions, etc. */
+    if (raft->left) {
+        raft_close__(raft);
+    }
+}
+
+static void
+raft_wait_session(struct jsonrpc_session *js)
+{
+    if (js) {
+        jsonrpc_session_wait(js);
+        jsonrpc_session_recv_wait(js);
+    }
+}
+
+void
+raft_wait(struct raft *raft)
+{
+    if (raft->left) {
+        return;
+    }
+
+    raft_waiters_wait(raft);
+
+    if (raft->listener) {
+        pstream_wait(raft->listener);
+    } else {
+        poll_timer_wait_until(raft->listen_backoff);
+    }
+
+    struct raft_conn *conn;
+    LIST_FOR_EACH (conn, list_node, &raft->conns) {
+        raft_wait_session(conn->js);
+    }
+
+    if (!raft->joining) {
+        poll_timer_wait_until(raft->election_timeout);
+    } else {
+        poll_timer_wait_until(raft->join_timeout);
+    }
+    if (raft->leaving) {
+        poll_timer_wait_until(raft->leave_timeout);
+    }
+    if (raft->role == RAFT_LEADER || !hmap_is_empty(&raft->commands)) {
+        poll_timer_wait_until(raft->ping_timeout);
+    }
+}
+
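+
+/* Creates and returns a new waiter of the given 'type', queued to complete
+ * once the fsync thread reports that the log write issued along with it has
+ * been committed to disk. */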
+static struct raft_waiter *
+raft_waiter_create(struct raft *raft, enum raft_waiter_type type)
+{
+    ovs_mutex_lock(&raft->fsync_mutex);
+    uint64_t seqno = ++raft->fsync_next;
+    ovs_mutex_unlock(&raft->fsync_mutex);
+
+    seq_change(raft->fsync_request);
+
+    struct raft_waiter *w = xzalloc(sizeof *w);
+    ovs_list_push_back(&raft->waiters, &w->list_node);
+    w->fsync_seqno = seqno;
+    w->type = type;
+    return w;
+}
+
+const char *
+raft_command_status_to_string(enum raft_command_status status)
+{
+    switch (status) {
+    case RAFT_CMD_INCOMPLETE:
+        return "operation still in progress";
+    case RAFT_CMD_SUCCESS:
+        return "success";
+    case RAFT_CMD_NOT_LEADER:
+        return "not leader";
+    case RAFT_CMD_BAD_PREREQ:
+        return "prerequisite check failed";
+    case RAFT_CMD_LOST_LEADERSHIP:
+        return "lost leadership";
+    case RAFT_CMD_SHUTDOWN:
+        return "server shutdown";
+    case RAFT_CMD_IO_ERROR:
+        return "I/O error";
+    case RAFT_CMD_TIMEOUT:
+        return "timeout";
+    default:
+        return NULL;
+    }
+}
+
+bool
+raft_command_status_from_string(const char *s,
+                                enum raft_command_status *statusp)
+{
+    for (enum raft_command_status status = 0; ; status++) {
+        const char *s2 = raft_command_status_to_string(status);
+        if (!s2) {
+            *statusp = 0;
+            return false;
+        } else if (!strcmp(s, s2)) {
+            *statusp = status;
+            return true;
+        }
+    }
+}
+
+static const struct uuid *
+raft_current_eid(const struct raft *raft)
+{
+    for (uint64_t index = raft->log_end - 1; index >= raft->log_start;
+         index--) {
+        struct raft_entry *e = &raft->entries[index - raft->log_start];
+        if (e->data) {
+            return &e->eid;
+        }
+    }
+    return &raft->snap.eid;
+}
+
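+
+/* Returns a new command that is already complete with the given terminal
+ * 'status', for reporting an immediate result to the caller. */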
+static struct raft_command *
+raft_command_create_completed(enum raft_command_status status)
+{
+    ovs_assert(status != RAFT_CMD_INCOMPLETE);
+
+    struct raft_command *cmd = xzalloc(sizeof *cmd);
+    cmd->n_refs = 1;
+    cmd->status = status;
+    return cmd;
+}
+
+static struct raft_command *
+raft_command_create_incomplete(struct raft *raft, uint64_t index)
+{
+    struct raft_command *cmd = xzalloc(sizeof *cmd);
+    cmd->n_refs = 2;            /* One for client, one for raft->commands. */
+    cmd->index = index;
+    cmd->status = RAFT_CMD_INCOMPLETE;
+    hmap_insert(&raft->commands, &cmd->hmap_node, cmd->index);
+    return cmd;
+}
+
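+
+/* Leader-side entry point for appending a new log entry: writes 'data',
+ * 'servers', and 'eid' to the local log and sends AppendEntries requests to
+ * the other servers whose logs are already caught up to this index.  Returns
+ * a command that the caller can use to track the entry's outcome. */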
+static struct raft_command * OVS_WARN_UNUSED_RESULT
+raft_command_initiate(struct raft *raft,
+                      const struct json *data, const struct json *servers,
+                      const struct uuid *eid)
+{
+    /* Write to local log. */
+    uint64_t index = raft->log_end;
+    struct ovsdb_error *error = raft_write_entry(
+        raft, raft->current_term, json_nullable_clone(data), eid,
+        json_nullable_clone(servers));
+    if (error) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        char *s = ovsdb_error_to_string_free(error);
+        VLOG_WARN_RL(&rl, "%s", s);
+        free(s);
+
+        if (hmap_count(&raft->servers) < 3) {
+            return raft_command_create_completed(RAFT_CMD_IO_ERROR);
+        }
+    }
+
+    struct raft_command *cmd = raft_command_create_incomplete(raft, index);
+    cmd->eid = *eid;
+
+    if (!error) {
+        raft_waiter_create(raft, RAFT_W_COMMAND)->command.index = cmd->index;
+    }
+
+    /* Write to remote logs. */
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (s != raft->me && s->next_index == index) {
+            raft_send_append_request(raft, s, 1, "execute command");
+            s->next_index++;    /* XXX Is this a valid way to pipeline? */
+        }
+    }
+
+    return cmd;
+}
+
+static struct raft_command * OVS_WARN_UNUSED_RESULT
+raft_command_execute__(struct raft *raft,
+                       const struct json *data, const struct json *servers,
+                       const struct uuid *prereq, struct uuid *result)
+{
+    if (raft->role != RAFT_LEADER) {
+        /* Consider proxying the command to the leader.  We can only do that if
+         * we know the leader and the command does not change the set of
+         * servers.  We do not proxy commands without prerequisites, even
+         * though we could, because in an OVSDB context a log entry doesn't
+         * make sense without context. */
+        if (servers || !data
+            || raft->role != RAFT_FOLLOWER || uuid_is_zero(&raft->leader_sid)
+            || !prereq) {
+            return raft_command_create_completed(RAFT_CMD_NOT_LEADER);
+        }
+    }
+
+    struct uuid eid = data ? uuid_random() : UUID_ZERO;
+    if (result) {
+        *result = eid;
+    }
+
+    if (raft->role != RAFT_LEADER) {
+        const union raft_rpc rpc = {
+            .execute_command_request = {
+                .common = {
+                    .type = RAFT_RPC_EXECUTE_COMMAND_REQUEST,
+                    .sid = raft->leader_sid,
+                },
+                .data = CONST_CAST(struct json *, data),
+                .prereq = *prereq,
+                .result = eid,
+            }
+        };
+        if (!raft_send(raft, &rpc)) {
+            /* Couldn't send command, so it definitely failed. */
+            return raft_command_create_completed(RAFT_CMD_NOT_LEADER);
+        }
+
+        struct raft_command *cmd = raft_command_create_incomplete(raft, 0);
+        cmd->timestamp = time_msec();
+        cmd->eid = eid;
+        return cmd;
+    }
+
+    const struct uuid *current_eid = raft_current_eid(raft);
+    if (prereq && !uuid_equals(prereq, current_eid)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        VLOG_INFO_RL(&rl, "current entry eid "UUID_FMT" does not match "
+                     "prerequisite "UUID_FMT,
+                     UUID_ARGS(current_eid), UUID_ARGS(prereq));
+        return raft_command_create_completed(RAFT_CMD_BAD_PREREQ);
+    }
+
+    return raft_command_initiate(raft, data, servers, &eid);
+}
+
+struct raft_command * OVS_WARN_UNUSED_RESULT
+raft_command_execute(struct raft *raft, const struct json *data,
+                     const struct uuid *prereq, struct uuid *result)
+{
+    return raft_command_execute__(raft, data, NULL, prereq, result);
+}
+
+enum raft_command_status
+raft_command_get_status(const struct raft_command *cmd)
+{
+    ovs_assert(cmd->n_refs > 0);
+    return cmd->status;
+}
+
+void
+raft_command_unref(struct raft_command *cmd)
+{
+    if (cmd) {
+        ovs_assert(cmd->n_refs > 0);
+        if (!--cmd->n_refs) {
+            free(cmd);
+        }
+    }
+}
+
+void
+raft_command_wait(const struct raft_command *cmd)
+{
+    if (cmd->status != RAFT_CMD_INCOMPLETE) {
+        poll_immediate_wake();
+    }
+}
+
+static void
+raft_command_complete(struct raft *raft,
+                      struct raft_command *cmd,
+                      enum raft_command_status status)
+{
+    if (!uuid_is_zero(&cmd->sid)) {
+        raft_send_execute_command_reply(raft, &cmd->sid, &cmd->eid, status);
+    }
+
+    ovs_assert(cmd->status == RAFT_CMD_INCOMPLETE);
+    ovs_assert(cmd->n_refs > 0);
+    hmap_remove(&raft->commands, &cmd->hmap_node);
+    cmd->status = status;
+    raft_command_unref(cmd);
+}
+
+static void
+raft_complete_all_commands(struct raft *raft, enum raft_command_status status)
+{
+    struct raft_command *cmd, *next;
+    HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) {
+        raft_command_complete(raft, cmd, status);
+    }
+}
+
+static struct raft_command *
+raft_find_command_by_index(struct raft *raft, uint64_t index)
+{
+    struct raft_command *cmd;
+
+    HMAP_FOR_EACH_IN_BUCKET (cmd, hmap_node, index, &raft->commands) {
+        if (cmd->index == index) {
+            return cmd;
+        }
+    }
+    return NULL;
+}
+
+static struct raft_command *
+raft_find_command_by_eid(struct raft *raft, const struct uuid *eid)
+{
+    struct raft_command *cmd;
+
+    HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) {
+        if (uuid_equals(&cmd->eid, eid)) {
+            return cmd;
+        }
+    }
+    return NULL;
+}
+
+
+/* RPC handling. */
+
+
+#define RAFT_RPC(ENUM, NAME) \
+    static void raft_handle_##NAME(struct raft *, const struct raft_##NAME *);
+RAFT_RPC_TYPES
+#undef RAFT_RPC
+
+static void
+raft_handle_hello_request(struct raft *raft OVS_UNUSED,
+                          const struct raft_hello_request *hello OVS_UNUSED)
+{
+}
+
+/* 'type' is RAFT_RPC_ADD_SERVER_REPLY or RAFT_RPC_REMOVE_SERVER_REPLY.
+ * 'target_sid' is the server being added or removed.
+ * 'requester_sid' is the server that requested the action and should receive
+ * the reply.
+ *
+ * A server can only add itself, so for adds 'target_sid' and 'requester_sid'
+ * are the same; a server can remove other servers, so for removals they may
+ * differ. */
+
+static void
+raft_send_add_server_reply__(struct raft *raft, const struct uuid *sid,
+                             const char *address,
+                             bool success, const char *comment)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+    if (!VLOG_DROP_INFO(&rl)) {
+        struct ds s = DS_EMPTY_INITIALIZER;
+        ds_put_format(&s, "adding "SID_FMT" (%s) to cluster "CID_FMT" %s",
+                      SID_ARGS(sid), address,
+                      CID_ARGS(&raft->cid),
+                      success ? "succeeded" : "failed");
+        if (comment) {
+            ds_put_format(&s, " (%s)", comment);
+        }
+        VLOG_INFO("%s", ds_cstr(&s));
+        ds_destroy(&s);
+    }
+
+    union raft_rpc rpy = {
+        .add_server_reply = {
+            .common = {
+                .type = RAFT_RPC_ADD_SERVER_REPLY,
+                .sid = *sid,
+                .comment = CONST_CAST(char *, comment),
+            },
+            .success = success,
+        }
+    };
+
+    struct sset *remote_addresses = &rpy.add_server_reply.remote_addresses;
+    sset_init(remote_addresses);
+    if (!raft->joining) {
+        struct raft_server *s;
+        HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+            if (s != raft->me) {
+                sset_add(remote_addresses, s->address);
+            }
+        }
+    }
+
+    raft_send(raft, &rpy);
+
+    sset_destroy(remote_addresses);
+}
+
+static void
+raft_send_remove_server_reply_rpc(struct raft *raft, const struct uuid *sid,
+                                  bool success, const char *comment)
+{
+    const union raft_rpc rpy = {
+        .remove_server_reply = {
+            .common = {
+                .type = RAFT_RPC_REMOVE_SERVER_REPLY,
+                .sid = *sid,
+                .comment = CONST_CAST(char *, comment),
+            },
+            .success = success,
+        }
+    };
+    raft_send(raft, &rpy);
+}
+
+static void
+raft_send_remove_server_reply__(struct raft *raft,
+                                const struct uuid *target_sid,
+                                const struct uuid *requester_sid,
+                                struct unixctl_conn *requester_conn,
+                                bool success, const char *comment)
+{
+    struct ds s = DS_EMPTY_INITIALIZER;
+    ds_put_format(&s, "request ");
+    if (!uuid_is_zero(requester_sid)) {
+        ds_put_format(&s, "by "SID_FMT, SID_ARGS(requester_sid));
+    } else {
+        ds_put_cstr(&s, "via unixctl");
+    }
+    ds_put_cstr(&s, " to remove ");
+    if (!requester_conn && uuid_equals(target_sid, requester_sid)) {
+        ds_put_cstr(&s, "itself");
+    } else {
+        ds_put_format(&s, SID_FMT, SID_ARGS(target_sid));
+    }
+    ds_put_format(&s, " from cluster "CID_FMT" %s",
+                  CID_ARGS(&raft->cid),
+                  success ? "succeeded" : "failed");
+    if (comment) {
+        ds_put_format(&s, " (%s)", comment);
+    }
+
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+    VLOG_INFO_RL(&rl, "%s", ds_cstr(&s));
+
+    /* Send the RemoveServerReply to the requester (which could be a server or
+     * a unixctl connection).  Also always send it to the removed server; this
+     * lets it know for sure that it was removed, so that it can update its
+     * log and disconnect permanently. */
+    if (!uuid_is_zero(requester_sid)) {
+        raft_send_remove_server_reply_rpc(raft, requester_sid,
+                                          success, comment);
+    }
+    if (!uuid_equals(requester_sid, target_sid)) {
+        raft_send_remove_server_reply_rpc(raft, target_sid, success, comment);
+    }
+    if (requester_conn) {
+        if (success) {
+            unixctl_command_reply(requester_conn, ds_cstr(&s));
+        } else {
+            unixctl_command_reply_error(requester_conn, ds_cstr(&s));
+        }
+    }
+
+    ds_destroy(&s);
+}
+
+static void
+raft_send_add_server_reply(struct raft *raft,
+                           const struct raft_add_server_request *rq,
+                           bool success, const char *comment)
+{
+    return raft_send_add_server_reply__(raft, &rq->common.sid, rq->address,
+                                        success, comment);
+}
+
+static void
+raft_send_remove_server_reply(struct raft *raft,
+                              const struct raft_remove_server_request *rq,
+                              bool success, const char *comment)
+{
+    return raft_send_remove_server_reply__(raft, &rq->sid, &rq->common.sid,
+                                           rq->requester_conn, success,
+                                           comment);
+}
+
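+
+/* Reverts this server to follower state, if it is not one already, and fails
+ * any in-progress add-server, remove-server, and command requests that only a
+ * leader can complete. */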
+static void
+raft_become_follower(struct raft *raft)
+{
+    raft->leader_sid = UUID_ZERO;
+    if (raft->role == RAFT_FOLLOWER) {
+        return;
+    }
+
+    raft->role = RAFT_FOLLOWER;
+    raft_reset_timer(raft);
+
+    /* Notify clients about lost leadership.
+     *
+     * We do not reverse our changes to 'raft->servers' because the new
+     * configuration is already part of the log.  Possibly the configuration
+     * log entry will not be committed, but until we know that we must use the
+     * new configuration.  Our AppendEntries processing will properly update
+     * the server configuration later, if necessary. */
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) {
+        raft_send_add_server_reply__(raft, &s->sid, s->address, false,
+                                     RAFT_SERVER_LOST_LEADERSHIP);
+    }
+    if (raft->remove_server) {
+        raft_send_remove_server_reply__(raft, &raft->remove_server->sid,
+                                        &raft->remove_server->requester_sid,
+                                        raft->remove_server->requester_conn,
+                                        false, RAFT_SERVER_LOST_LEADERSHIP);
+        raft_server_destroy(raft->remove_server);
+        raft->remove_server = NULL;
+    }
+
+    /* XXX how do we handle outstanding waiters? */
+    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
+}
+
+static void
+raft_send_append_request(struct raft *raft,
+                         struct raft_server *peer, unsigned int n,
+                         const char *comment)
+{
+    ovs_assert(raft->role == RAFT_LEADER);
+
+    const union raft_rpc rq = {
+        .append_request = {
+            .common = {
+                .type = RAFT_RPC_APPEND_REQUEST,
+                .sid = peer->sid,
+                .comment = CONST_CAST(char *, comment),
+            },
+            .term = raft->current_term,
+            .prev_log_index = peer->next_index - 1,
+            .prev_log_term = (peer->next_index - 1 >= raft->log_start
+                              ? raft->entries[peer->next_index - 1
+                                              - raft->log_start].term
+                              : raft->snap.term),
+            .leader_commit = raft->commit_index,
+            .entries = &raft->entries[peer->next_index - raft->log_start],
+            .n_entries = n,
+        },
+    };
+    raft_send(raft, &rq);
+}
+
+static void
+raft_send_heartbeats(struct raft *raft)
+{
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (s != raft->me) {
+            /* XXX should also retransmit unacknowledged append requests */
+            raft_send_append_request(raft, s, 0, "heartbeat");
+        }
+    }
+
+    /* Send anyone waiting for a command to complete a ping to let them
+     * know we're still working on it. */
+    struct raft_command *cmd;
+    HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) {
+        if (!uuid_is_zero(&cmd->sid)) {
+            raft_send_execute_command_reply(raft, &cmd->sid,
+                                            &cmd->eid,
+                                            RAFT_CMD_INCOMPLETE);
+        }
+    }
+}
+
+static void
+raft_server_init_leader(struct raft *raft, struct raft_server *s)
+{
+    s->next_index = raft->log_end;
+    s->match_index = 0;
+    s->phase = RAFT_PHASE_STABLE;
+}
+
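+
+/* Transitions this server to leader for the current term: reinitializes each
+ * server's replication state, announces itself with a round of heartbeats,
+ * and initiates a no-op commit so that entries from earlier terms can be
+ * committed (see section 6.4 item 1). */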
+static void
+raft_become_leader(struct raft *raft)
+{
+    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
+
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+    VLOG_INFO_RL(&rl, "term %"PRIu64": elected leader by %d+ of "
+                 "%"PRIuSIZE" servers", raft->current_term,
+                 raft->n_votes, hmap_count(&raft->servers));
+
+    ovs_assert(raft->role != RAFT_LEADER);
+    raft->role = RAFT_LEADER;
+    raft->leader_sid = raft->sid;
+    raft->election_timeout = LLONG_MAX;
+    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
+
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        raft_server_init_leader(raft, s);
+    }
+
+    raft_update_match_index(raft, raft->me, raft->log_end - 1);
+    raft_send_heartbeats(raft);
+
+    /* XXX Initiate a no-op commit.  Otherwise we might never find out what's
+     * in the log.  See section 6.4 item 1. */
+    raft_command_unref(raft_command_execute__(raft, NULL, NULL, NULL, NULL));
+}
+
+/* Processes term 'term' received as part of RPC 'common'.  Returns true if the
+ * caller should continue processing the RPC, false if the caller should reject
+ * it due to a stale term. */
+static bool
+raft_receive_term__(struct raft *raft, const struct raft_rpc_common *common,
+                    uint64_t term)
+{
+    /* Section 3.3 says:
+     *
+     *     Current terms are exchanged whenever servers communicate; if one
+     *     server’s current term is smaller than the other’s, then it updates
+     *     its current term to the larger value.  If a candidate or leader
+     *     discovers that its term is out of date, it immediately reverts to
+     *     follower state.  If a server receives a request with a stale term
+     *     number, it rejects the request.
+     */
+    if (term > raft->current_term) {
+        raft_set_term(raft, term, NULL);
+        raft_become_follower(raft);
+    } else if (term < raft->current_term) {
+        VLOG_INFO("rejecting term %"PRIu64" < current term %"PRIu64" received "
+                  "in %s message from server "SID_FMT,
+                  term, raft->current_term,
+                  raft_rpc_type_to_string(common->type),
+                  SID_ARGS(&common->sid));
+        return false;
+    }
+    return true;
+}
+
+static void
+raft_get_servers_from_log(struct raft *raft)
+{
+    const struct json *servers_json = raft->snap.servers;
+    for (uint64_t index = raft->log_end - 1; index >= raft->log_start;
+         index--) {
+        struct raft_entry *e = &raft->entries[index - raft->log_start];
+        if (e->servers) {
+            servers_json = e->servers;
+            break;
+        }
+    }
+
+    struct hmap servers;
+    struct ovsdb_error *error = raft_servers_from_json(servers_json, &servers);
+    ovs_assert(!error);
+    raft_set_servers(raft, &servers, VLL_INFO);
+    raft_servers_destroy(&servers);
+}
+
+/* Truncates the log, so that raft->log_end becomes 'new_end'.
+ *
+ * Doesn't write anything to disk.  (XXX need to truncate log?)
+ *
+ * Returns true if any of the removed log entries were server configuration
+ * entries, false otherwise. */
+static bool
+raft_truncate(struct raft *raft, uint64_t new_end)
+{
+    ovs_assert(new_end >= raft->log_start);
+
+    bool servers_changed = false;
+    while (raft->log_end > new_end) {
+        struct raft_entry *entry = &raft->entries[--raft->log_end
+                                                  - raft->log_start];
+        if (entry->servers) {
+            servers_changed = true;
+        }
+        raft_entry_destroy(entry);
+    }
+    return servers_changed;
+}
+
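+
+/* Returns the data for the next committed entry that has not yet been applied
+ * (or for the snapshot, if it has not been applied yet), storing its entry ID
+ * in '*eid', or NULL if there is nothing ready to apply.  Skips over, but
+ * does not return, committed entries that carry no data. */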
+static const struct json *
+raft_peek_next_entry(struct raft *raft, struct uuid *eid)
+{
+    /* Invariant: log_start - 2 <= last_applied <= commit_index < log_end. */
+    ovs_assert(raft->log_start <= raft->last_applied + 2);
+    ovs_assert(raft->last_applied <= raft->commit_index);
+    ovs_assert(raft->commit_index < raft->log_end);
+
+    if (raft->joining) {        /* XXX needed? */
+        return NULL;
+    }
+
+    if (raft->log_start == raft->last_applied + 2) {
+        *eid = raft->snap.eid;
+        return raft->snap.data;
+    }
+
+    while (raft->last_applied < raft->commit_index) {
+        const struct raft_entry *e = raft_get_entry(raft,
+                                                    raft->last_applied + 1);
+        if (e->data) {
+            *eid = e->eid;
+            return e->data;
+        }
+        raft->last_applied++;
+    }
+    return NULL;
+}
+
+static const struct json *
+raft_get_next_entry(struct raft *raft, struct uuid *eid)
+{
+    const struct json *data = raft_peek_next_entry(raft, eid);
+    if (data) {
+        raft->last_applied++;
+    }
+    return data;
+}
+
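+
+/* Advances the commit index to 'new_commit_index'.  On the leader, this also
+ * completes any client commands whose entries have just committed and re-runs
+ * reconfiguration when a committed entry changes the server set. */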
+static void
+raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
+{
+    ovs_assert(new_commit_index >= raft->commit_index);
+    if (raft->role != RAFT_LEADER) {
+        raft->commit_index = new_commit_index;
+        return;
+    }
+
+    while (raft->commit_index < new_commit_index) {
+        uint64_t index = ++raft->commit_index;
+        const struct raft_entry *e = raft_get_entry(raft, index);
+        if (e->servers) {
+            raft_run_reconfigure(raft);
+        }
+        if (e->data) {
+            struct raft_command *cmd = raft_find_command_by_index(raft, index);
+            if (cmd) {
+                raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
+            }
+        }
+    }
+}
+
+/* This doesn't use rq->entries (but it does use rq->n_entries). */
+static void
+raft_send_append_reply(struct raft *raft, const struct raft_append_request *rq,
+                       enum raft_append_result result, const char *comment)
+{
+    /* Figure 3.1: "If leaderCommit > commitIndex, set commitIndex =
+     * min(leaderCommit, index of last new entry)" */
+    if (result == RAFT_APPEND_OK && rq->leader_commit > raft->commit_index) {
+        raft_update_commit_index(
+            raft, MIN(rq->leader_commit, rq->prev_log_index + rq->n_entries));
+    }
+
+    /* Send reply. */
+    union raft_rpc reply = {
+        .append_reply = {
+            .common = {
+                .type = RAFT_RPC_APPEND_REPLY,
+                .sid = rq->common.sid,
+                .comment = CONST_CAST(char *, comment),
+            },
+            .term = raft->current_term,
+            .log_end = raft->log_end,
+            .prev_log_index = rq->prev_log_index,
+            .prev_log_term = rq->prev_log_term,
+            .n_entries = rq->n_entries,
+            .result = result,
+        }
+    };
+    raft_send(raft, &reply);
+}
+
+/* Returns NULL if entry 'prev_log_index' of 'raft''s log (or its snapshot)
+ * has term 'prev_log_term'.  Otherwise, returns a brief explanation of the
+ * mismatch. */
+static const char *
+match_index_and_term(const struct raft *raft,
+                     uint64_t prev_log_index, uint64_t prev_log_term)
+{
+    if (prev_log_index < raft->log_start - 1) {
+        return "mismatch before start of log";
+    } else if (prev_log_index == raft->log_start - 1) {
+        if (prev_log_term != raft->snap.term) {
+            return "prev_term mismatch";
+        }
+    } else if (prev_log_index < raft->log_end) {
+        if (raft->entries[prev_log_index - raft->log_start].term
+            != prev_log_term) {
+            return "term mismatch";
+        }
+    } else {
+        /* prev_log_index >= raft->log_end */
+        return "mismatch past end of log";
+    }
+    return NULL;
+}
+
+/* Carries out the log matching and append steps for AppendEntries request
+ * 'rq' and sends the reply itself: immediately if the log needs no change, is
+ * inconsistent with the request, or hits an I/O error, otherwise deferred
+ * until the newly appended entries have been written to disk. */
+static void
+raft_handle_append_entries(struct raft *raft,
+                           const struct raft_append_request *rq,
+                           uint64_t prev_log_index, uint64_t prev_log_term,
+                           const struct raft_entry *entries,
+                           unsigned int n_entries)
+{
+    /* Section 3.5: "When sending an AppendEntries RPC, the leader includes
+     * the index and term of the entry in its log that immediately precedes
+     * the new entries. If the follower does not find an entry in its log
+     * with the same index and term, then it refuses the new entries." */
+    const char *mismatch = match_index_and_term(raft, prev_log_index,
+                                                prev_log_term);
+    if (mismatch) {
+        VLOG_INFO("rejecting append_request because previous entry "
+                  "%"PRIu64",%"PRIu64" not in local log (%s)",
+                  prev_log_term, prev_log_index, mismatch);
+        raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY, mismatch);
+        return;
+    }
+
+    /* Figure 3.1: "If an existing entry conflicts with a new one (same
+     * index but different terms), delete the existing entry and all that
+     * follow it." */
+    unsigned int i;
+    bool servers_changed = false;
+    for (i = 0; ; i++) {
+        if (i >= n_entries) {
+            /* No change. */
+            if (rq->common.comment
+                && !strcmp(rq->common.comment, "heartbeat")) {
+                raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "heartbeat");
+            } else {
+                raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "no change");
+            }
+            return;
+        }
+
+        uint64_t log_index = (prev_log_index + 1) + i;
+        if (log_index >= raft->log_end) {
+            break;
+        }
+        if (raft->entries[log_index - raft->log_start].term
+            != entries[i].term) {
+            if (raft_truncate(raft, log_index)) {
+                servers_changed = true;
+            }
+            break;
+        }
+    }
+
+    /* Figure 3.1: "Append any entries not already in the log." */
+    struct ovsdb_error *error = NULL;
+    for (; i < n_entries; i++) {
+        const struct raft_entry *e = &entries[i];
+        error = raft_write_entry(raft, e->term,
+                                 json_nullable_clone(e->data), &e->eid,
+                                 json_nullable_clone(e->servers));
+        if (error) {
+            break;
+        }
+        if (e->servers) {
+            servers_changed = true;
+        }
+    }
+
+    if (servers_changed) {
+        raft_get_servers_from_log(raft);
+    }
+
+    if (error) {
+        char *s = ovsdb_error_to_string_free(error);
+        VLOG_ERR("%s", s);
+        free(s);
+        raft_send_append_reply(raft, rq, RAFT_APPEND_IO_ERROR, "I/O error");
+        return;
+    }
+
+    struct raft_waiter *w = raft_waiter_create(raft, RAFT_W_APPEND);
+    w->append.rq = xmemdup(rq, sizeof *rq);
+    w->append.rq->entries = NULL;
+    /* Reply will be sent later following waiter completion. */
+}
+
+static bool
+raft_update_leader(struct raft *raft, const struct uuid *sid)
+{
+    if (raft->role == RAFT_LEADER && !uuid_equals(sid, &raft->sid)) {
+        VLOG_ERR("this server is leader but server "SID_FMT" claims to be",
+                 SID_ARGS(sid));
+        return false;
+    } else if (!uuid_equals(sid, &raft->leader_sid)) {
+        if (!uuid_is_zero(&raft->leader_sid)) {
+            VLOG_ERR("leader for term %"PRIu64" changed "
+                     "from "SID_FMT" to "SID_FMT,
+                     raft->current_term,
+                     SID_ARGS(&raft->leader_sid),
+                     SID_ARGS(sid));
+        } else {
+            VLOG_INFO("server "SID_FMT" is leader for term %"PRIu64,
+                      SID_ARGS(sid), raft->current_term);
+        }
+        raft->leader_sid = *sid;
+    }
+    return true;
+}
+
+static void
+raft_handle_append_request__(struct raft *raft,
+                             const struct raft_append_request *rq)
+{
+    if (!raft_receive_term__(raft, &rq->common, rq->term)) {
+        /* Section 3.3: "If a server receives a request with a stale term
+         * number, it rejects the request." */
+        raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY,
+                               "stale term");
+        return;
+    }
+
+    /* We do not check whether the server that sent the request is part of the
+     * cluster.  As section 4.1 says, "A server accepts AppendEntries requests
+     * from a leader that is not part of the server’s latest configuration.
+     * Otherwise, a new server could never be added to the cluster (it would
+     * never accept any log entries preceding the configuration entry that adds
+     * the server)." */
+    if (!raft_update_leader(raft, &rq->common.sid)) {
+        raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY,
+                               "usurped leadership");
+        return;
+    }
+    raft_reset_timer(raft);
+
+    /* First check for the common case, where the AppendEntries request is
+     * entirely for indexes covered by 'log_start' ... 'log_end - 1', something
+     * like this:
+     *
+     *     rq->prev_log_index
+     *       | first_entry_index
+     *       |   |         nth_entry_index
+     *       |   |           |
+     *       v   v           v
+     *         +---+---+---+---+
+     *       T | T | T | T | T |
+     *         +---+-------+---+
+     *     +---+---+---+---+
+     *   T | T | T | T | T |
+     *     +---+---+---+---+
+     *       ^               ^
+     *       |               |
+     *   log_start        log_end
+     */
+    uint64_t first_entry_index = rq->prev_log_index + 1;
+    uint64_t nth_entry_index = rq->prev_log_index + rq->n_entries;
+    if (OVS_LIKELY(first_entry_index >= raft->log_start)) {
+        raft_handle_append_entries(raft, rq,
+                                   rq->prev_log_index, rq->prev_log_term,
+                                   rq->entries, rq->n_entries);
+        return;
+    }
+
+    /* Now a series of checks for odd cases, where the AppendEntries request
+     * extends earlier than the beginning of our log, into the log entries
+     * discarded by the most recent snapshot. */
+
+    /*
+     * Handle the case where the indexes covered by rq->entries[] are entirely
+     * disjoint with 'log_start - 1' ... 'log_end - 1', as shown below.  So,
+     * everything in the AppendEntries request must already have been
+     * committed, and we might as well return true.
+     *
+     *     rq->prev_log_index
+     *       | first_entry_index
+     *       |   |         nth_entry_index
+     *       |   |           |
+     *       v   v           v
+     *         +---+---+---+---+
+     *       T | T | T | T | T |
+     *         +---+-------+---+
+     *                             +---+---+---+---+
+     *                           T | T | T | T | T |
+     *                             +---+---+---+---+
+     *                               ^               ^
+     *                               |               |
+     *                           log_start        log_end
+     */
+    if (nth_entry_index < raft->log_start - 1) {
+        raft_send_append_reply(raft, rq, RAFT_APPEND_OK,
+                               "append before log start");
+        return;
+    }
+
+    /*
+     * Handle the case where the last entry in rq->entries[] has the same index
+     * as 'log_start - 1', so we can compare their terms:
+     *
+     *     rq->prev_log_index
+     *       | first_entry_index
+     *       |   |         nth_entry_index
+     *       |   |           |
+     *       v   v           v
+     *         +---+---+---+---+
+     *       T | T | T | T | T |
+     *         +---+-------+---+
+     *                         +---+---+---+---+
+     *                       T | T | T | T | T |
+     *                         +---+---+---+---+
+     *                           ^               ^
+     *                           |               |
+     *                       log_start        log_end
+     *
+     * There's actually a sub-case where rq->n_entries == 0, in which we
+     * compare rq->prev_term:
+     *
+     *     rq->prev_log_index
+     *       |
+     *       |
+     *       |
+     *       v
+     *       T
+     *
+     *         +---+---+---+---+
+     *       T | T | T | T | T |
+     *         +---+---+---+---+
+     *           ^               ^
+     *           |               |
+     *       log_start        log_end
+     */
+    if (nth_entry_index == raft->log_start - 1) {
+        if (rq->n_entries
+            ? raft->snap.term == rq->entries[rq->n_entries - 1].term
+            : raft->snap.term == rq->prev_log_term) {
+            raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "no change");
+        } else {
+            raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY,
+                                   "term mismatch");
+        }
+        return;
+    }
+
+    /*
+     * We now know that the data in rq->entries[] overlaps the data in
+     * raft->entries[], as shown below, with some positive 'ofs':
+     *
+     *     rq->prev_log_index
+     *       | first_entry_index
+     *       |   |             nth_entry_index
+     *       |   |               |
+     *       v   v               v
+     *         +---+---+---+---+---+
+     *       T | T | T | T | T | T |
+     *         +---+-------+---+---+
+     *                     +---+---+---+---+
+     *                   T | T | T | T | T |
+     *                     +---+---+---+---+
+     *                       ^               ^
+     *                       |               |
+     *                   log_start        log_end
+     *
+     *           |<-- ofs -->|
+     *
+     * We transform this into the following by trimming the first 'ofs'
+     * elements off of rq->entries[], ending up with the following.  Notice how
+     * we retain the term but not the data for rq->entries[ofs - 1]:
+     *
+     *                  first_entry_index + ofs - 1
+     *                   | first_entry_index + ofs
+     *                   |   |  nth_entry_index + ofs
+     *                   |   |   |
+     *                   v   v   v
+     *                     +---+---+
+     *                   T | T | T |
+     *                     +---+---+
+     *                     +---+---+---+---+
+     *                   T | T | T | T | T |
+     *                     +---+---+---+---+
+     *                       ^               ^
+     *                       |               |
+     *                   log_start        log_end
+     */
+    uint64_t ofs = raft->log_start - first_entry_index;
+    raft_handle_append_entries(raft, rq,
+                               raft->log_start - 1, rq->entries[ofs - 1].term,
+                               &rq->entries[ofs], rq->n_entries - ofs);
+}
+
+bool
+raft_has_next_entry(const struct raft *raft_)
+{
+    struct raft *raft = CONST_CAST(struct raft *, raft_);
+    struct uuid eid;
+    return raft_peek_next_entry(raft, &eid) != NULL;
+}
+
+const struct json *
+raft_next_entry(struct raft *raft, struct uuid *eid, bool *is_snapshot)
+{
+    const struct json *data = raft_get_next_entry(raft, eid);
+    *is_snapshot = data == raft->snap.data;
+    return data;
+}
+
+uint64_t
+raft_get_applied_index(const struct raft *raft)
+{
+    return raft->last_applied;
+}
+
+uint64_t
+raft_get_commit_index(const struct raft *raft)
+{
+    return raft->commit_index;
+}
+
+static void
+raft_handle_append_request(struct raft *raft,
+                           const struct raft_append_request *rq)
+{
+    raft_handle_append_request__(raft, rq);
+}
+
+static struct raft_server *
+raft_find_peer(struct raft *raft, const struct uuid *uuid)
+{
+    struct raft_server *s = raft_find_server(raft, uuid);
+    return s != raft->me ? s : NULL;
+}
+
+static struct raft_server *
+raft_find_new_server(struct raft *raft, const struct uuid *uuid)
+{
+    return raft_find_server__(&raft->add_servers, uuid);
+}
+
+/* Figure 3.1: "If there exists an N such that N > commitIndex, a
+ * majority of matchIndex[i] >= N, and log[N].term == currentTerm, set
+ * commitIndex = N (sections 3.5 and 3.6)." */
+static void
+raft_consider_updating_commit_index(struct raft *raft)
+{
+    /* This loop cannot just bail out when it comes across a log entry that
+     * does not match the criteria.  For example, Figure 3.7(d2) shows a
+     * case where the log entry for term 2 cannot be committed directly
+     * (because it is not for the current term) but it can be committed as
+     * a side effect of committing the entry for term 4 (the current term).
+     * XXX Is there a more efficient way to do this? */
+    ovs_assert(raft->role == RAFT_LEADER);
+    for (uint64_t n = MAX(raft->commit_index + 1, raft->log_start);
+         n < raft->log_end; n++) {
+        if (raft->entries[n - raft->log_start].term == raft->current_term) {
+            size_t count = 0;
+            struct raft_server *s2;
+            HMAP_FOR_EACH (s2, hmap_node, &raft->servers) {
+                if (s2->match_index >= n) {
+                    count++;
+                }
+            }
+            if (count > hmap_count(&raft->servers) / 2) {
+                VLOG_INFO("index %"PRIu64" committed to %"PRIuSIZE" servers, "
+                          "applying", n, count);
+                raft_update_commit_index(raft, n);
+            }
+        }
+    }
+}
+
+static void
+raft_update_match_index(struct raft *raft, struct raft_server *s,
+                        uint64_t min_index)
+{
+    ovs_assert(raft->role == RAFT_LEADER);
+    if (min_index > s->match_index) {
+        s->match_index = min_index;
+        raft_consider_updating_commit_index(raft);
+    }
+}
+
+static void
+raft_send_install_snapshot_request(struct raft *raft,
+                                   const struct raft_server *s,
+                                   const char *comment)
+{
+    union raft_rpc rpc = {
+        .install_snapshot_request = {
+            .common = {
+                .type = RAFT_RPC_INSTALL_SNAPSHOT_REQUEST,
+                .sid = s->sid,
+                .comment = CONST_CAST(char *, comment),
+            },
+            .term = raft->current_term,
+            .last_index = raft->log_start - 1,
+            .last_term = raft->snap.term,
+            .last_servers = raft->snap.servers,
+            .last_eid = raft->snap.eid,
+            .data = raft->snap.data,
+        }
+    };
+    raft_send(raft, &rpc);
+}
+
+static void
+raft_handle_append_reply(struct raft *raft,
+                         const struct raft_append_reply *rpy)
+{
+    if (!raft_receive_term__(raft, &rpy->common, rpy->term)) {
+        return;
+    }
+    if (raft->role != RAFT_LEADER) {
+        VLOG_INFO("rejected append_reply (not leader)");
+        return;
+    }
+
+    /* Most commonly we'd be getting an AppendEntries reply from a configured
+     * server (e.g. a peer), but we can also get them from servers in the
+     * process of being added. */
+    struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
+    if (!s) {
+        s = raft_find_new_server(raft, &rpy->common.sid);
+        if (!s) {
+            VLOG_INFO("rejected append_reply from unknown server "SID_FMT,
+                      SID_ARGS(&rpy->common.sid));
+            return;
+        }
+    }
+
+    if (rpy->result == RAFT_APPEND_OK) {
+        /* Figure 3.1: "If successful, update nextIndex and matchIndex for
+         * follower (section 3.5)." */
+        uint64_t min_index = rpy->prev_log_index + rpy->n_entries + 1;
+        if (s->next_index < min_index) {
+            s->next_index = min_index;
+        }
+        raft_update_match_index(raft, s, min_index - 1);
+    } else {
+        /* Figure 3.1: "If AppendEntries fails because of log inconsistency,
+         * decrement nextIndex and retry (section 3.5)."
+         *
+         * We also implement the optimization suggested in section 4.2.1:
+         * "Various approaches can make nextIndex converge to its correct value
+         * more quickly, including those described in Chapter 3. The simplest
+         * approach to solving this particular problem of adding a new server,
+         * however, is to have followers return the length of their logs in the
+         * AppendEntries response; this allows the leader to cap the follower’s
+         * nextIndex accordingly." */
+        if (s->next_index > 0) {
+            s->next_index = MIN(s->next_index - 1, rpy->log_end);
+        } else {
+            /* XXX log */
+            VLOG_INFO("XXX");
+        }
+
+        if (rpy->result == RAFT_APPEND_IO_ERROR) {
+            /* Append failed but not because of a log inconsistency.  Because
+             * of the I/O error, there's no point in re-sending the append
+             * immediately.
+             *
+             * XXX We should fail the command if enough I/O errors occur that
+             * we can't get a majority. */
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+            VLOG_INFO_RL(&rl, SID_FMT" reported I/O error",
+                         SID_ARGS(&s->sid));
+            return;
+        }
+    }
+
+    /*
+     * Our behavior here must depend on the value of next_index relative to
+     * log_start and log_end.  There are three cases:
+     *
+     *        Case 1       |    Case 2     |      Case 3
+     *   <---------------->|<------------->|<------------------>
+     *                     |               |
+     *
+     *                     +---+---+---+---+
+     *                   T | T | T | T | T |
+     *                     +---+---+---+---+
+     *                       ^               ^
+     *                       |               |
+     *                   log_start        log_end
+     */
+    if (s->next_index < raft->log_start) {
+        /* Case 1.  The next entry the peer needs precedes our in-memory log,
+         * so it can only be supplied by the snapshot. */
+        raft_send_install_snapshot_request(raft, s, NULL);
+    } else if (s->next_index < raft->log_end) {
+        /* Case 2.  The next entry the peer needs is still in our log, so
+         * send it more entries. */
+        raft_send_append_request(raft, s, 1, NULL);
+    } else {
+        /* Case 3.  As far as we know, the peer is fully caught up with our
+         * log.  If we were catching it up so that it could join the cluster,
+         * let the reconfiguration proceed. */
+        if (s->phase == RAFT_PHASE_CATCHUP) {
+            s->phase = RAFT_PHASE_CAUGHT_UP;
+            raft_run_reconfigure(raft);
+        }
+    }
+}
+
+/* Returns true if a reply should be sent. */
+static bool
+raft_handle_vote_request__(struct raft *raft,
+                           const struct raft_vote_request *rq)
+{
+    /* Section 4.2.3 "Disruptive Servers" says:
+     *
+     *    ...if a server receives a RequestVote request within the minimum
+     *    election timeout of hearing from a current leader, it does not update
+     *    its term or grant its vote...
+     *
+     *    ...This change conflicts with the leadership transfer mechanism as
+     *    described in Chapter 3, in which a server legitimately starts an
+     *    election without waiting an election timeout.  In that case,
+     *    RequestVote messages should be processed by other servers even when
+     *    they believe a current cluster leader exists.  Those RequestVote
+     *    requests can include a special flag to indicate this behavior (“I
+     *    have permission to disrupt the leader—it told me to!”).
+     *
+     * XXX This clearly describes how the followers should act, but not the
+     * leader.  We just ignore vote requests that arrive at a current leader.
+     * Is this safe? */
+    if (!rq->leadership_transfer) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        if (raft->role == RAFT_LEADER) {
+            VLOG_WARN_RL(&rl, "ignoring vote request received as leader");
+            return false;
+        }
+
+        long long int now = time_msec();
+        if (now < raft->election_base + ELECTION_BASE_MSEC) {
+            VLOG_WARN_RL(&rl, "ignoring vote request received after only "
+                         "%lld ms (minimum election time is %d ms)",
+                         now - raft->election_base, ELECTION_BASE_MSEC);
+            return false;
+        }
+    }
+
+    if (!raft_receive_term__(raft, &rq->common, rq->term)) {
+        return true;
+    }
+
+    /* If we're waiting for our vote to be recorded persistently, don't
+     * respond. */
+    const struct raft_waiter *w;
+    LIST_FOR_EACH (w, list_node, &raft->waiters) {
+        if (w->type == RAFT_W_VOTE) {
+            return false;
+        }
+    }
+
+    /* Figure 3.1: "If votedFor is null or candidateId, and candidate's log is
+     * at least as up-to-date as receiver's log, grant vote (sections 3.4,
+     * 3.6)." */
+    if (uuid_equals(&raft->voted_for, &rq->common.sid)) {
+        /* Already voted for this candidate in this term.  Resend vote. */
+        return true;
+    } else if (!uuid_is_zero(&raft->voted_for)) {
+        /* Already voted for a different candidate in this term.  Send a reply
+         * saying what candidate we did vote for.  This isn't a necessary part
+         * of the Raft protocol but it can make debugging easier. */
+        return true;
+    }
+
+    /* Section 3.6.1: "The RequestVote RPC implements this restriction: the RPC
+     * includes information about the candidate’s log, and the voter denies its
+     * vote if its own log is more up-to-date than that of the candidate.  Raft
+     * determines which of two logs is more up-to-date by comparing the index
+     * and term of the last entries in the logs.  If the logs have last entries
+     * with different terms, then the log with the later term is more
+     * up-to-date.  If the logs end with the same term, then whichever log is
+     * longer is more up-to-date." */
+    uint64_t last_term = (raft->log_end > raft->log_start
+                          ? raft->entries[raft->log_end - 1
+                                          - raft->log_start].term
+                          : raft->snap.term);
+    if (last_term > rq->last_log_term
+        || (last_term == rq->last_log_term
+            && raft->log_end - 1 > rq->last_log_index)) {
+        /* Our log is more up-to-date than the peer's.   Withhold vote. */
+        return false;
+    }
+
+    /* Record a vote for the peer. */
+    raft->voted_for = rq->common.sid;
+    struct ovsdb_error *error = raft_write_state(raft->log,
+                                                 raft->current_term,
+                                                 &raft->voted_for);
+    if (error) {
+        /* XXX */
+    }
+
+    raft_reset_timer(raft);
+
+    raft_waiter_create(raft, RAFT_W_VOTE);
+    return false;
+}
+
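+/* Sends a VoteReply to 'dst' reporting the candidate, if any, that this
+ * server has voted for in the current term ('vote' is all-zeros if it has
+ * not voted). */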
+static void
+raft_send_vote_reply(struct raft *raft, const struct uuid *dst,
+                     const struct uuid *vote)
+{
+    union raft_rpc rpy = {
+        .vote_reply = {
+            .common = {
+                .type = RAFT_RPC_VOTE_REPLY,
+                .sid = *dst,
+            },
+            .term = raft->current_term,
+            .vote = *vote,
+        },
+    };
+    raft_send(raft, &rpy);
+}
+
+static void
+raft_handle_vote_request(struct raft *raft,
+                         const struct raft_vote_request *rq)
+{
+    if (raft_handle_vote_request__(raft, rq)) {
+        raft_send_vote_reply(raft, &rq->common.sid, &raft->voted_for);
+    }
+}
+
+static void
+raft_handle_vote_reply(struct raft *raft,
+                       const struct raft_vote_reply *rpy)
+{
+    if (!raft_receive_term__(raft, &rpy->common, rpy->term)) {
+        return;
+    }
+
+    if (raft->role != RAFT_CANDIDATE) {
+        return;
+    }
+
+    struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
+    if (s) {
+        raft_accept_vote(raft, s, &rpy->vote);
+    }
+}
+
+/* Returns true if 'raft''s log contains reconfiguration entries that have not
+ * yet been committed. */
+static bool
+raft_has_uncommitted_configuration(const struct raft *raft)
+{
+    for (uint64_t i = raft->commit_index + 1; i < raft->log_end; i++) {
+        ovs_assert(i >= raft->log_start);
+        const struct raft_entry *e = &raft->entries[i - raft->log_start];
+        if (e->servers) {
+            return true;
+        }
+    }
+    return false;
+}
+
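+/* Appends a log entry that records the current server configuration
+ * ('raft->servers'), so that the pending configuration change can be
+ * replicated and committed. */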
+static void
+raft_log_reconfiguration(struct raft *raft)
+{
+    /* Add the reconfiguration to the log. */
+    struct json *servers_json = raft_servers_to_json(&raft->servers);
+    struct raft_command *cmd = raft_command_execute__(
+        raft, NULL, servers_json, NULL, NULL);
+    json_destroy(servers_json);
+    if (cmd) {
+        /* XXX handle error */
+    }
+}
+
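+/* Makes progress on reconfiguration: finishes any configuration change whose
+ * log entry has committed, then starts at most one new change, either adding
+ * a caught-up server to the configuration or removing a server that is
+ * scheduled for removal. */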
+static void
+raft_run_reconfigure(struct raft *raft)
+{
+    ovs_assert(raft->role == RAFT_LEADER);
+
+    /* Reconfiguration only progresses when configuration changes commit. */
+    if (raft_has_uncommitted_configuration(raft)) {
+        return;
+    }
+
+    /* If we were waiting for a configuration change to commit, it's done. */
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (s->phase == RAFT_PHASE_COMMITTING) {
+            raft_send_add_server_reply__(raft, &s->sid, s->address,
+                                         true, NULL);
+            s->phase = RAFT_PHASE_STABLE;
+        }
+    }
+    if (raft->remove_server) {
+        raft_send_remove_server_reply__(raft, &raft->remove_server->sid,
+                                        &raft->remove_server->requester_sid,
+                                        raft->remove_server->requester_conn,
+                                        true, NULL);
+        raft_server_destroy(raft->remove_server);
+        raft->remove_server = NULL;
+    }
+
+    /* If a new server is caught up, add it to the configuration.  */
+    HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) {
+        if (s->phase == RAFT_PHASE_CAUGHT_UP) {
+            /* Move 's' from 'raft->add_servers' to 'raft->servers'. */
+            hmap_remove(&raft->add_servers, &s->hmap_node);
+            hmap_insert(&raft->servers, &s->hmap_node, uuid_hash(&s->sid));
+
+            /* Mark 's' as waiting for commit. */
+            s->phase = RAFT_PHASE_COMMITTING;
+
+            raft_log_reconfiguration(raft);
+
+            /* When commit completes we'll transition to RAFT_PHASE_STABLE and
+             * send a RAFT_SERVER_OK reply. */
+
+            return;
+        }
+    }
+
+    /* Remove a server, if one is scheduled for removal. */
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (s->phase == RAFT_PHASE_REMOVE) {
+            hmap_remove(&raft->servers, &s->hmap_node);
+            raft->remove_server = s;
+
+            raft_log_reconfiguration(raft);
+
+            return;
+        }
+    }
+}
+
+static void
+raft_handle_add_server_request(struct raft *raft,
+                               const struct raft_add_server_request *rq)
+{
+    /* Figure 4.1: "1. Reply NOT_LEADER if not leader (section 6.2)." */
+    if (raft->role != RAFT_LEADER) {
+        raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_NOT_LEADER);
+        return;
+    }
+
+    /* Check for an existing server. */
+    struct raft_server *s = raft_find_server(raft, &rq->common.sid);
+    if (s) {
+        /* If the server is scheduled to be removed, cancel it. */
+        if (s->phase == RAFT_PHASE_REMOVE) {
+            s->phase = RAFT_PHASE_STABLE;
+            raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_CANCELED);
+            return;
+        }
+
+        /* If the server is being added, then it's in progress. */
+        if (s->phase != RAFT_PHASE_STABLE) {
+            raft_send_add_server_reply(raft, rq,
+                                       false, RAFT_SERVER_IN_PROGRESS);
+            return;
+        }
+
+        /* Nothing to do--server is already part of the configuration. */
+        raft_send_add_server_reply(raft, rq,
+                                   true, RAFT_SERVER_ALREADY_PRESENT);
+        return;
+    }
+
+    /* Check for a server being removed. */
+    if (raft->remove_server
+        && uuid_equals(&rq->common.sid, &raft->remove_server->sid)) {
+        raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_COMMITTING);
+        return;
+    }
+
+    /* Check for a server already being added. */
+    if (raft_find_new_server(raft, &rq->common.sid)) {
+        raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_IN_PROGRESS);
+        return;
+    }
+
+    /* Add server to 'add_servers'. */
+    s = xzalloc(sizeof *s);
+    hmap_insert(&raft->add_servers, &s->hmap_node, uuid_hash(&rq->common.sid));
+    s->sid = rq->common.sid;
+    raft_server_init_leader(raft, s);
+    s->address = xstrdup(rq->address);
+    s->requester_sid = rq->common.sid;
+    s->requester_conn = NULL;
+    s->phase = RAFT_PHASE_CATCHUP;
+
+    /* Start sending the log.  If this is the first time we've tried to add
+     * this server, then this will quickly degenerate into an InstallSnapshot
+     * followed by a series of AppendEntries, but if it's a retry of an
+     * earlier AddServer request that was interrupted (e.g. by a timeout or a
+     * loss of leadership) then it will gracefully resume populating the log.
+     *
+     * See the last few paragraphs of section 4.2.1 for further insight. */
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+    VLOG_INFO_RL(&rl,
+                 "starting to add server "SID_FMT" (%s) to cluster "CID_FMT,
+                 SID_ARGS(&s->sid), rq->address,
+                 CID_ARGS(&raft->cid));
+    raft_send_append_request(raft, s, 0, "initialize new server");
+
+    /* Reply will be sent later following waiter completion. */
+}
+
+static void
+raft_handle_add_server_reply(struct raft *raft,
+                             const struct raft_add_server_reply *rpy)
+{
+    if (!raft->joining) {
+        VLOG_WARN("received add_server_reply even though we're already "
+                  "part of the cluster");
+        return;
+    }
+
+    if (rpy->success) {
+        if (raft->me) {
+            raft->joining = false;
+
+            /* Close outgoing connections not known to be to a server in the
+             * cluster.  */
+            struct raft_conn *conn, *next;
+            LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) {
+                if (conn->incoming && !raft_find_server(raft, &conn->sid)) {
+                    ovs_list_remove(&conn->list_node);
+                    jsonrpc_session_close(conn->js);
+                    free(conn);
+                }
+            }
+        } else {
+            /* XXX we're not really part of the cluster? */
+        }
+    }
+}
+
+/* This is called by raft_unixctl_kick() as well as via RPC. */
+static void
+raft_handle_remove_server_request(struct raft *raft,
+                                  const struct raft_remove_server_request *rq)
+{
+    /* Figure 4.1: "1. Reply NOT_LEADER if not leader (section 6.2)." */
+    if (raft->role != RAFT_LEADER) {
+        raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_NOT_LEADER);
+        return;
+    }
+
+    /* If the server to remove is currently waiting to be added, cancel it. */
+    struct raft_server *target = raft_find_new_server(raft, &rq->sid);
+    if (target) {
+        raft_send_add_server_reply__(raft, &target->sid, target->address,
+                                     false, RAFT_SERVER_CANCELED);
+        hmap_remove(&raft->add_servers, &target->hmap_node);
+        raft_server_destroy(target);
+        return;
+    }
+
+    /* If the server isn't configured, report that. */
+    target = raft_find_server(raft, &rq->sid);
+    if (!target) {
+        raft_send_remove_server_reply(raft, rq,
+                                      true, RAFT_SERVER_ALREADY_GONE);
+        return;
+    }
+
+    /* Check whether we're waiting for the addition of the server to commit. */
+    if (target->phase == RAFT_PHASE_COMMITTING) {
+        raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_COMMITTING);
+        return;
+    }
+
+    /* Check whether the server is already scheduled for removal. */
+    if (target->phase == RAFT_PHASE_REMOVE) {
+        raft_send_remove_server_reply(raft, rq,
+                                      false, RAFT_SERVER_IN_PROGRESS);
+        return;
+    }
+
+    /* Make sure that, if we remove this server, at least one other server
+     * will be left.  We don't count servers currently being added (in
+     * 'add_servers') since those could fail. */
+    struct raft_server *s;
+    int n = 0;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        if (s != target && s->phase != RAFT_PHASE_REMOVE) {
+            n++;
+        }
+    }
+    if (!n) {
+        raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_EMPTY);
+        return;
+    }
+
+    /* Mark the server for removal. */
+    target->phase = RAFT_PHASE_REMOVE;
+    if (rq->requester_conn) {
+        target->requester_sid = UUID_ZERO;
+        unixctl_command_reply(rq->requester_conn, "started removal");
+    } else {
+        target->requester_sid = rq->common.sid;
+        target->requester_conn = NULL;
+    }
+
+    raft_run_reconfigure(raft);
+    /* Operation in progress, reply will be sent later. */
+}
+
+static void
+raft_handle_remove_server_reply(struct raft *raft,
+                                const struct raft_remove_server_reply *rpc)
+{
+    if (rpc->success) {
+        VLOG_INFO(SID_FMT": finished leaving cluster "CID_FMT,
+                  SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
+
+        /* Write a sentinel to prevent the cluster from restarting.  The
+         * cluster should be resilient against such an occurrence in any case,
+         * but this allows for better error messages. */
+        struct json *json = json_object_create();
+        json_object_put(json, "left", json_boolean_create(true));
+        struct ovsdb_error *error = ovsdb_log_write(raft->log, json);
+        if (error) {
+            char *error_s = ovsdb_error_to_string_free(error);
+            VLOG_WARN("error writing sentinel record (%s)", error_s);
+            free(error_s);
+        }
+        json_destroy(json);
+
+        raft->leaving = false;
+        raft->left = true;
+    }
+}
+
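+/* Writes a complete replacement log to 'log': a header that embeds the
+ * snapshot ('new_snapshot' if nonnull, otherwise the existing snapshot data),
+ * then the log entries starting at 'new_log_start', and finally the current
+ * term and vote. */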
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_write_snapshot(struct raft *raft, struct ovsdb_log *log,
+                    uint64_t new_log_start, const struct json *new_snapshot)
+{
+    ovs_assert(new_log_start >= raft->log_start);
+    ovs_assert(new_log_start <= raft->log_end);
+    ovs_assert(new_log_start <= raft->last_applied + 2);
+    ovs_assert(new_log_start > raft->log_start
+               ? new_snapshot != NULL
+               : new_snapshot == NULL);
+
+    /* Compose header record. */
+    uint64_t prev_term = raft_get_term(raft, new_log_start - 1);
+    const struct uuid *prev_eid = raft_get_eid(raft, new_log_start - 1);
+    uint64_t prev_index = new_log_start - 1;
+    struct json *prev_servers = raft_servers_for_index(raft,
+                                                       new_log_start - 1);
+
+    /* Write snapshot record. */
+    struct json *header = json_object_create();
+    json_object_put_format(header, "server_id", UUID_FMT,
+                           UUID_ARGS(&raft->sid));
+    json_object_put_string(header, "name", raft->name);
+    json_object_put_string(header, "local_address", raft->local_address);
+    json_object_put(header, "prev_servers", prev_servers);
+    if (!uuid_is_zero(&raft->cid)) {
+        json_object_put_format(header, "cluster_id", UUID_FMT,
+                               UUID_ARGS(&raft->cid));
+    }
+    if (raft->snap.data || new_snapshot) {
+        json_object_put(header, "prev_term", json_integer_create(prev_term));
+        json_object_put(header, "prev_index",
+                        json_integer_create(prev_index));
+        json_object_put(header, "prev_data", json_clone(new_snapshot
+                                                        ? new_snapshot
+                                                        : raft->snap.data));
+        json_object_put_format(header, "prev_eid",
+                               UUID_FMT, UUID_ARGS(prev_eid));
+    }
+    struct ovsdb_error *error = ovsdb_log_write(log, header);
+    json_destroy(header);
+    if (error) {
+        return error;
+    }
+    ovsdb_log_mark_base(raft->log);
+
+    /* Write log records. */
+    for (uint64_t index = new_log_start; index < raft->log_end; index++) {
+        struct json *json = raft_entry_to_json_with_index(raft, index);
+        error = ovsdb_log_write(log, json);
+        json_destroy(json);
+        if (error) {
+            return error;
+        }
+    }
+
+    /* Write term and vote (if any).
+     *
+     * The term is redundant if we wrote a log record for that term above.  The
+     * vote, if any, is never redundant.
+     */
+    return raft_write_state(log, raft->current_term, &raft->voted_for);
+}
+
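+/* Replaces the on-disk log by one that begins with a snapshot covering
+ * everything before 'new_start': starts a log replacement, writes the new
+ * contents into it, and commits the replacement (aborting it on error). */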
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_save_snapshot(struct raft *raft,
+                   uint64_t new_start, const struct json *new_snapshot)
+{
+    struct ovsdb_log *new_log;
+    struct ovsdb_error *error;
+    error = ovsdb_log_replace_start(raft->log, &new_log);
+    if (error) {
+        return error;
+    }
+
+    error = raft_write_snapshot(raft, new_log, new_start, new_snapshot);
+    if (error) {
+        ovsdb_log_replace_abort(new_log);
+        return error;
+    }
+
+    return ovsdb_log_replace_commit(raft->log, new_log);
+}
+
+static void
+raft_handle_install_snapshot_request__(
+    struct raft *raft, const struct raft_install_snapshot_request *rq)
+{
+    if (!raft_receive_term__(raft, &rq->common, rq->term)) {
+        return;
+    }
+
+    raft_reset_timer(raft);
+
+    uint64_t new_log_start = rq->last_index + 1;
+    if (new_log_start < raft->log_start) {
+        /* The new snapshot covers less than our current one, why bother? */
+        return;
+    } else if (new_log_start >= raft->log_end) {
+        /* The new snapshot starts past the end of our current log, so discard
+         * all of our current log.
+         *
+         * XXX make sure that last_term is not a regression. */
+        raft->log_start = raft->log_end = new_log_start;
+    } else {
+        /* The new snapshot starts in the middle of our log, so discard the
+         * first 'new_log_start - raft->log_start' entries in the log.
+         *
+         * XXX we can validate last_term and last_servers exactly */
+        memmove(&raft->entries[0],
+                &raft->entries[new_log_start - raft->log_start],
+                (raft->log_end - new_log_start) * sizeof *raft->entries);
+        raft->log_start = new_log_start;
+    }
+    raft->commit_index = raft->log_start - 1;
+    if (raft->last_applied < raft->commit_index) {
+        raft->last_applied = raft->log_start - 2;
+    }
+
+    raft->snap.term = rq->last_term;
+    json_destroy(raft->snap.servers);
+    raft->snap.servers = json_clone(rq->last_servers);
+    raft->snap.eid = rq->last_eid;
+
+    /* install snapshot */
+    json_destroy(raft->snap.data);
+    raft->snap.data = json_clone(rq->data);
+
+    struct ovsdb_error *error = raft_save_snapshot(raft,
+                                                   raft->log_start, NULL);
+    if (error) {
+        char *error_s = ovsdb_error_to_string(error);
+        VLOG_WARN("could not save snapshot: %s", error_s);
+        free(error_s);
+
+        /* XXX handle error */
+    }
+}
+
+static void
+raft_handle_install_snapshot_request(
+    struct raft *raft, const struct raft_install_snapshot_request *rq)
+{
+    raft_handle_install_snapshot_request__(raft, rq);
+
+    union raft_rpc rpy = {
+        .install_snapshot_reply = {
+            .common = {
+                .type = RAFT_RPC_INSTALL_SNAPSHOT_REPLY,
+                .sid = rq->common.sid,
+            },
+            .term = raft->current_term,
+            .last_index = rq->last_index,
+            .last_term = rq->last_term,
+        },
+    };
+    raft_send(raft, &rpy);
+}
+
+static void
+raft_handle_install_snapshot_reply(
+    struct raft *raft, const struct raft_install_snapshot_reply *rpy)
+{
+    if (!raft_receive_term__(raft, &rpy->common, rpy->term)) {
+        return;
+    }
+
+    /* We might get an InstallSnapshot reply from a configured server (e.g. a
+     * peer) or a server in the process of being added. */
+    struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
+    if (!s) {
+        s = raft_find_new_server(raft, &rpy->common.sid);
+        if (!s) {
+            /* XXX log */
+            return;
+        }
+    }
+
+    if (rpy->last_index != raft->log_start - 1 ||
+        rpy->last_term != raft->snap.term) {
+        VLOG_INFO("cluster "CID_FMT": server "SID_FMT" installed "
+                  "out-of-date snapshot, starting over",
+                  CID_ARGS(&raft->cid), SID_ARGS(&s->sid));
+        raft_send_install_snapshot_request(raft, s,
+                                           "installed obsolete snapshot");
+        return;
+    }
+
+    VLOG_INFO("cluster "CID_FMT": installed snapshot on server "SID_FMT" "
+              "up to %"PRIu64":%"PRIu64,
+              CID_ARGS(&raft->cid), SID_ARGS(&s->sid),
+              rpy->last_term, rpy->last_index);
+    s->next_index = raft->log_end;
+    raft_send_append_request(raft, s, 0, "snapshot installed");
+}
+
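+/* Returns true if it would be worthwhile for the caller to compact its state
+ * into a new snapshot (see raft_store_snapshot()): this server is a stable
+ * cluster member, roughly 100 or more applied entries have accumulated since
+ * the last snapshot, and the underlying ovsdb_log reports that it has
+ * grown. */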
+bool
+raft_has_grown(const struct raft *raft)
+{
+    return (!raft->joining
+            && !raft->leaving
+            && !raft->left
+            && raft->last_applied - raft->log_start >= 100
+            && ovsdb_log_has_grown(raft->log));
+}
+
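+/* Replaces the current snapshot by 'new_snapshot', which must represent all
+ * of the data through 'raft->last_applied', rewrites the on-disk log
+ * accordingly, and discards the in-memory log entries that the new snapshot
+ * covers.  Fails if this server is joining, leaving, or has left the cluster,
+ * or if nothing new has been applied since the last snapshot.  Returns NULL
+ * if successful, otherwise an error. */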
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+raft_store_snapshot(struct raft *raft, const struct json *new_snapshot)
+{
+    if (raft->joining) {
+        return ovsdb_error(NULL,
+                           "cannot store a snapshot while joining cluster");
+    } else if (raft->leaving) {
+        return ovsdb_error(NULL,
+                           "cannot store a snapshot while leaving cluster");
+    } else if (raft->left) {
+        return ovsdb_error(NULL,
+                           "cannot store a snapshot after leaving cluster");
+    }
+
+    if (raft->last_applied < raft->log_start) {
+        return ovsdb_error(NULL, "not storing a duplicate snapshot");
+    }
+
+    uint64_t new_log_start = raft->last_applied + 1;
+    struct ovsdb_error *error = raft_save_snapshot(raft, new_log_start,
+                                                   new_snapshot);
+    if (error) {
+        return error;
+    }
+
+    raft->snap.term = raft_get_term(raft, new_log_start - 1);
+    json_destroy(raft->snap.data);
+    raft->snap.data = json_clone(new_snapshot);
+
+    json_destroy(raft->snap.servers);
+    raft->snap.servers = raft_servers_for_index(raft, new_log_start - 1);
+
+    memmove(&raft->entries[0], &raft->entries[new_log_start - raft->log_start],
+            (raft->log_end - new_log_start) * sizeof *raft->entries);
+    raft->log_start = new_log_start;
+    return NULL;
+}
+
+static void
+raft_handle_become_leader(struct raft *raft,
+                          const struct raft_become_leader *rq)
+{
+    if (!raft_receive_term__(raft, &rq->common, rq->term)) {
+        return;
+    }
+
+    if (raft->role == RAFT_FOLLOWER) {
+        VLOG_INFO("received leadership transfer from "SID_FMT
+                  " in term %"PRIu64,
+                  SID_ARGS(&rq->common.sid), rq->term);
+        raft_start_election(raft, true);
+    }
+}
+
+static void
+raft_send_execute_command_reply(struct raft *raft,
+                                const struct uuid *sid,
+                                const struct uuid *eid,
+                                enum raft_command_status status)
+{
+    union raft_rpc rpc = {
+        .execute_command_reply = {
+            .common = {
+                .type = RAFT_RPC_EXECUTE_COMMAND_REPLY,
+                .sid = *sid,
+            },
+            .result = *eid,
+            .status = status,
+        },
+    };
+    raft_send(raft, &rpc);
+}
+
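+/* Attempts to initiate, as leader, the command carried in 'rq' on behalf of
+ * the server that sent it.  Returns RAFT_CMD_INCOMPLETE if the command was
+ * accepted (a reply is sent later, when it completes), otherwise an immediate
+ * failure status such as RAFT_CMD_NOT_LEADER or RAFT_CMD_BAD_PREREQ. */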
+static enum raft_command_status
+raft_handle_execute_command_request__(
+    struct raft *raft, const struct raft_execute_command_request *rq)
+{
+    if (raft->role != RAFT_LEADER) {
+        return RAFT_CMD_NOT_LEADER;
+    }
+
+    const struct uuid *current_eid = raft_current_eid(raft);
+    if (!uuid_equals(&rq->prereq, current_eid)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        VLOG_INFO_RL(&rl, "current entry eid "UUID_FMT" does not match "
+                     "prerequisite "UUID_FMT" in execute_command_request",
+                     UUID_ARGS(current_eid), UUID_ARGS(&rq->prereq));
+        return RAFT_CMD_BAD_PREREQ;
+    }
+
+    struct raft_command *cmd = raft_command_initiate(raft, rq->data,
+                                                     NULL, &rq->result);
+    cmd->sid = rq->common.sid;
+
+    enum raft_command_status status = cmd->status;
+    if (status != RAFT_CMD_INCOMPLETE) {
+        raft_command_unref(cmd);
+    }
+    return status;
+}
+
+static void
+raft_handle_execute_command_request(
+    struct raft *raft, const struct raft_execute_command_request *rq)
+{
+    enum raft_command_status status
+        = raft_handle_execute_command_request__(raft, rq);
+    if (status != RAFT_CMD_INCOMPLETE) {
+        raft_send_execute_command_reply(raft, &rq->common.sid, &rq->result,
+                                        status);
+    }
+}
+
+static void
+raft_handle_execute_command_reply(
+    struct raft *raft, const struct raft_execute_command_reply *rpy)
+{
+    struct raft_command *cmd = raft_find_command_by_eid(raft, &rpy->result);
+    if (!cmd) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+        VLOG_INFO_RL(&rl,
+                     SID_FMT" received \"%s\" reply from "SID_FMT" "
+                     "for unknown command",
+                     SID_ARGS(&raft->sid),
+                     raft_command_status_to_string(rpy->status),
+                     SID_ARGS(&rpy->common.sid));
+        return;
+    }
+
+    if (rpy->status == RAFT_CMD_INCOMPLETE) {
+        cmd->timestamp = time_msec();
+    } else {
+        raft_command_complete(raft, cmd, rpy->status);
+    }
+}
+
+static void
+raft_handle_rpc(struct raft *raft, const union raft_rpc *rpc)
+{
+    switch (rpc->common.type) {
+#define RAFT_RPC(ENUM, NAME)                    \
+        case ENUM:                              \
+        raft_handle_##NAME(raft, &rpc->NAME);   \
+        break;
+    RAFT_RPC_TYPES
+#undef RAFT_RPC
+    default:
+        OVS_NOT_REACHED();
+    }
+}
+
+static bool
+raft_rpc_is_heartbeat(const union raft_rpc *rpc)
+{
+    return ((rpc->common.type == RAFT_RPC_APPEND_REQUEST
+             || rpc->common.type == RAFT_RPC_APPEND_REPLY)
+             && rpc->common.comment
+             && !strcmp(rpc->common.comment, "heartbeat"));
+}
+
+static bool
+raft_send__(struct raft *raft, const union raft_rpc *rpc,
+            struct jsonrpc_session *js)
+{
+    log_rpc(raft, rpc, "-->");
+    return !jsonrpc_session_send(
+        js, raft_rpc_to_jsonrpc(&raft->cid, &raft->sid, rpc));
+}
+
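+/* Sends 'rpc' to the server identified by 'rpc->common.sid'.  Returns true if
+ * the RPC was handed off to a connected JSON-RPC session, false if there is
+ * no usable connection (or if the destination is this server itself). */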
+static bool
+raft_send(struct raft *raft, const union raft_rpc *rpc)
+{
+    if (uuid_equals(&rpc->common.sid, &raft->sid)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+        VLOG_WARN_RL(&rl, "attempting to send RPC to self");
+        return false;
+    }
+
+    struct raft_conn *conn;
+    LIST_FOR_EACH (conn, list_node, &raft->conns) {
+        if (uuid_equals(&conn->sid, &rpc->common.sid)
+            && jsonrpc_session_is_connected(conn->js)) {
+            return raft_send__(raft, rpc, conn->js);
+        }
+    }
+
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+    VLOG_DBG_RL(&rl, SID_FMT": no connection to "SID_FMT", cannot send RPC",
+                SID_ARGS(&raft->sid),
+                SID_ARGS(&rpc->common.sid));
+    return false;
+}
+
+static struct raft *
+raft_lookup_by_name(const char *name)
+{
+    struct raft *raft;
+
+    HMAP_FOR_EACH_WITH_HASH (raft, hmap_node, hash_string(name, 0),
+                             &all_rafts) {
+        if (!strcmp(raft->name, name)) {
+            return raft;
+        }
+    }
+    return NULL;
+}
+
+static void
+raft_unixctl_cid(struct unixctl_conn *conn,
+                 int argc OVS_UNUSED, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    struct raft *raft = raft_lookup_by_name(argv[1]);
+    if (!raft) {
+        unixctl_command_reply_error(conn, "unknown cluster");
+    } else if (uuid_is_zero(&raft->cid)) {
+        unixctl_command_reply_error(conn, "cluster id not yet known");
+    } else {
+        char *uuid = xasprintf(UUID_FMT, UUID_ARGS(&raft->cid));
+        unixctl_command_reply(conn, uuid);
+        free(uuid);
+    }
+}
+
+static void
+raft_unixctl_sid(struct unixctl_conn *conn,
+                 int argc OVS_UNUSED, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    struct raft *raft = raft_lookup_by_name(argv[1]);
+    if (!raft) {
+        unixctl_command_reply_error(conn, "unknown cluster");
+    } else {
+        char *uuid = xasprintf(UUID_FMT, UUID_ARGS(&raft->sid));
+        unixctl_command_reply(conn, uuid);
+        free(uuid);
+    }
+}
+
+static void
+raft_put_sid(const char *title, const struct uuid *sid,
+             const struct raft *raft, struct ds *s)
+{
+    ds_put_format(s, "%s: ", title);
+    if (uuid_equals(sid, &raft->sid)) {
+        ds_put_cstr(s, "self");
+    } else if (uuid_is_zero(sid)) {
+        ds_put_cstr(s, "unknown");
+    } else {
+        ds_put_format(s, SID_FMT, SID_ARGS(sid));
+    }
+    ds_put_char(s, '\n');
+}
+
+static void
+raft_unixctl_status(struct unixctl_conn *conn,
+                    int argc OVS_UNUSED, const char *argv[],
+                    void *aux OVS_UNUSED)
+{
+    struct raft *raft = raft_lookup_by_name(argv[1]);
+    if (!raft) {
+        unixctl_command_reply_error(conn, "unknown cluster");
+        return;
+    }
+
+    struct ds s = DS_EMPTY_INITIALIZER;
+    ds_put_format(&s, SID_FMT"\n", SID_ARGS(&raft->sid));
+    ds_put_format(&s, "Name: %s\n", raft->name);
+    ds_put_cstr(&s, "Cluster ID: ");
+    if (!uuid_is_zero(&raft->cid)) {
+        ds_put_format(&s, UUID_FMT"\n", UUID_ARGS(&raft->cid));
+    } else {
+        ds_put_cstr(&s, "not yet known\n");
+    }
+    ds_put_format(&s, "Server ID: "SID_FMT" ("UUID_FMT")\n",
+                  SID_ARGS(&raft->sid), UUID_ARGS(&raft->sid));
+    ds_put_format(&s, "Address: %s\n", raft->local_address);
+    ds_put_format(&s, "Status: %s\n",
+                  raft->joining ? "joining cluster"
+                  : raft->leaving ? "leaving cluster"
+                  : raft->left ? "left cluster"
+                  : "cluster member");
+    if (raft->joining) {
+        ds_put_cstr(&s, "Remotes for joining:");
+        const char *address;
+        SSET_FOR_EACH (address, &raft->remote_addresses) {
+            ds_put_format(&s, " %s", address);
+        }
+        ds_put_char(&s, '\n');
+    }
+    if (raft->role == RAFT_LEADER) {
+        struct raft_server *as;
+        HMAP_FOR_EACH (as, hmap_node, &raft->add_servers) {
+            ds_put_format(&s, "Adding server "SID_FMT" %s (%s)\n",
+                          SID_ARGS(&as->sid), as->address,
+                          raft_server_phase_to_string(as->phase));
+        }
+
+        struct raft_server *rs = raft->remove_server;
+        if (rs) {
+            ds_put_format(&s, "Removing server "SID_FMT" %s (%s)\n",
+                          SID_ARGS(&rs->sid), rs->address,
+                          raft_server_phase_to_string(rs->phase));
+        }
+    }
+
+    ds_put_format(&s, "Role: %s\n",
+                  raft->role == RAFT_LEADER ? "leader"
+                  : raft->role == RAFT_CANDIDATE ? "candidate"
+                  : raft->role == RAFT_FOLLOWER ? "follower"
+                  : "<error>");
+    ds_put_format(&s, "Term: %"PRIu64"\n", raft->current_term);
+    raft_put_sid("Leader", &raft->leader_sid, raft, &s);
+    raft_put_sid("Voted for", &raft->voted_for, raft, &s);
+    ds_put_char(&s, '\n');
+
+    ds_put_format(&s, "Log: [%"PRIu64", %"PRIu64"]\n",
+                  raft->log_start, raft->log_end);
+
+    uint64_t n_uncommitted = raft->log_end - raft->commit_index - 1;
+    ds_put_format(&s, "Entries not yet committed: %"PRIu64"\n", n_uncommitted);
+
+    uint64_t n_unapplied = raft->log_end - raft->last_applied - 1;
+    ds_put_format(&s, "Entries not yet applied: %"PRIu64"\n", n_unapplied);
+
+    const struct raft_conn *c;
+    ds_put_cstr(&s, "Connections:");
+    LIST_FOR_EACH (c, list_node, &raft->conns) {
+        bool connected = jsonrpc_session_is_connected(c->js);
+        ds_put_format(&s, " %s%s"SID_FMT"%s",
+                      connected ? "" : "(",
+                      c->incoming ? "<-" : "->", SID_ARGS(&c->sid),
+                      connected ? "" : ")");
+    }
+    ds_put_char(&s, '\n');
+
+    ds_put_cstr(&s, "Servers:\n");
+    struct raft_server *server;
+    HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
+        ds_put_format(&s, "    "SID_FMT" at %s",
+                      SID_ARGS(&server->sid), server->address);
+        if (server == raft->me) {
+            ds_put_cstr(&s, " (me)");
+        }
+        if (server->phase != RAFT_PHASE_STABLE) {
+            ds_put_format(&s, " (%s)",
+                          raft_server_phase_to_string(server->phase));
+        }
+        if (raft->role == RAFT_CANDIDATE) {
+            if (!uuid_is_zero(&server->vote)) {
+                ds_put_format(&s, " (voted for "SID_FMT")",
+                              SID_ARGS(&server->vote));
+            }
+        } else if (raft->role == RAFT_LEADER) {
+            ds_put_format(&s, " next_index=%"PRIu64" match_index=%"PRIu64,
+                          server->next_index, server->match_index);
+        }
+        ds_put_char(&s, '\n');
+    }
+
+    unixctl_command_reply(conn, ds_cstr(&s));
+    ds_destroy(&s);
+}
+
+static void
+raft_unixctl_leave__(struct unixctl_conn *conn, struct raft *raft)
+{
+    if (raft_left(raft)) {
+        unixctl_command_reply(conn, NULL);
+    } else if (raft_is_leaving(raft)) {
+        unixctl_command_reply_error(conn,
+                                    "already in progress leaving cluster");
+    } else {
+        raft_leave(raft);
+        unixctl_command_reply(conn, NULL);
+    }
+}
+
+static void
+raft_unixctl_leave(struct unixctl_conn *conn, int argc, const char *argv[],
+                   void *aux OVS_UNUSED)
+{
+    bool force = argc > 2 && !strcmp(argv[1], "--force");
+    if (force) {
+        argc--;
+        argv++;
+    }
+    if (argc != 2) {
+        unixctl_command_reply_error(conn, "syntax error");
+        return;
+    }
+
+    struct raft *raft = raft_lookup_by_name(argv[1]);
+    if (!raft) {
+        unixctl_command_reply_error(conn, "unknown cluster");
+        return;
+    }
+
+    raft_unixctl_leave__(conn, raft);
+}
+
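+/* Looks up a server within 'raft' by 'id', which may be a server address or
+ * a prefix of a server ID.  Returns the best-matching server if it is unique,
+ * otherwise NULL. */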
+static struct raft_server *
+raft_lookup_server_best_match(struct raft *raft, const char *id)
+{
+    struct raft_server *best = NULL;
+    int best_score = -1;
+    int n_best = 0;
+
+    struct raft_server *s;
+    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
+        int score = (!strcmp(id, s->address)
+                     ? INT_MAX
+                     : uuid_is_partial_match(&s->sid, id));
+        if (score > best_score) {
+            best = s;
+            best_score = score;
+            n_best = 1;
+        } else if (score == best_score) {
+            n_best++;
+        }
+    }
+    return n_best == 1 ? best : NULL;
+}
+
+static void
+raft_unixctl_kick(struct unixctl_conn *conn, int argc OVS_UNUSED,
+                  const char *argv[], void *aux OVS_UNUSED)
+{
+    const char *cluster_name = argv[1];
+    const char *server_name = argv[2];
+
+    struct raft *raft = raft_lookup_by_name(cluster_name);
+    if (!raft) {
+        unixctl_command_reply_error(conn, "unknown cluster");
+        return;
+    }
+
+    struct raft_server *server = raft_lookup_server_best_match(raft,
+                                                               server_name);
+    if (!server) {
+        unixctl_command_reply_error(conn, "unknown server");
+        return;
+    }
+
+    if (server == raft->me) {
+        raft_unixctl_leave__(conn, raft);
+    } else if (raft->role == RAFT_LEADER) {
+        const struct raft_remove_server_request rq = {
+            .sid = server->sid,
+            .requester_conn = conn,
+        };
+        raft_handle_remove_server_request(raft, &rq);
+    } else {
+        const union raft_rpc rpc = {
+            .remove_server_request = {
+                .common = {
+                    .type = RAFT_RPC_REMOVE_SERVER_REQUEST,
+                    .sid = raft->leader_sid,
+                    .comment = "via unixctl"
+                },
+                .sid = server->sid,
+            }
+        };
+        if (raft_send(raft, &rpc)) {
+            unixctl_command_reply(conn, "sent removal request to leader");
+        } else {
+            unixctl_command_reply_error(conn,
+                                        "failed to send removal request");
+        }
+    }
+}
+
+static void
+raft_init(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    if (!ovsthread_once_start(&once)) {
+        return;
+    }
+    unixctl_command_register("cluster/cid", "DB", 1, 1,
+                             raft_unixctl_cid, NULL);
+    unixctl_command_register("cluster/sid", "DB", 1, 1,
+                             raft_unixctl_sid, NULL);
+    unixctl_command_register("cluster/status", "DB", 1, 1,
+                             raft_unixctl_status, NULL);
+    unixctl_command_register("cluster/leave", "[--force] DB", 1, 2,
+                             raft_unixctl_leave, NULL);
+    unixctl_command_register("cluster/kick", "DB SERVER", 2, 2,
+                             raft_unixctl_kick, NULL);
+    ovsthread_once_done(&once);
+}
diff --git a/ovsdb/raft.h b/ovsdb/raft.h
new file mode 100644
index 000000000000..3dd0911b1dbd
--- /dev/null
+++ b/ovsdb/raft.h
@@ -0,0 +1,142 @@ 
+/*
+ * Copyright (c) 2014, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RAFT_H
+#define RAFT_H 1
+
+#include <stddef.h>
+
+/* Implementation of the Raft consensus algorithm.
+ *
+ *
+ * References
+ * ==========
+ *
+ * Based on Diego Ongaro's Ph.D. thesis, "Consensus: Bridging Theory and
+ * Practice", available at https://ramcloud.stanford.edu/~ongaro/thesis.pdf.
+ * References to sections, pages, and figures are from this thesis.  Quotations
+ * in comments also come from this work, in accordance with its license notice,
+ * reproduced below:
+ *
+ *     Copyright 2014 by Diego Andres Ongaro. All Rights Reserved.
+ *
+ *     This work is licensed under a Creative Commons Attribution-3.0 United
+ *     States License.  http://creativecommons.org/licenses/by/3.0/us/
+ *
+ *
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "compiler.h"
+#include "uuid.h"
+
+struct json;
+struct ovsdb_log;
+struct raft;
+struct sset;
+
+#define RAFT_MAGIC "CLUSTER"
+
+/* Setting up a new cluster. */
+struct ovsdb_error *raft_create_cluster(const char *file_name,
+                                        const char *name,
+                                        const char *local_address,
+                                        const struct json *snapshot)
+    OVS_WARN_UNUSED_RESULT;
+struct ovsdb_error *raft_join_cluster(const char *file_name, const char *name,
+                                      const char *local_address,
+                                      const struct sset *remote_addrs,
+                                      const struct uuid *cid)
+    OVS_WARN_UNUSED_RESULT;
+
+struct raft_metadata {
+    struct uuid sid;            /* Server ID. */
+    struct uuid cid;            /* Cluster ID.  All-zeros if not yet known. */
+    char *name;                 /* Schema name. */
+    char *local;                /* Local address. */
+};
+struct ovsdb_error *raft_read_metadata(const char *file_name,
+                                       struct raft_metadata *);
+void raft_metadata_destroy(struct raft_metadata *);
+
+/* Starting up or shutting down a server within a cluster. */
+struct ovsdb_error *raft_open(const char *file_name, struct raft **)
+    OVS_WARN_UNUSED_RESULT;
+struct ovsdb_error *raft_open__(struct ovsdb_log *, struct raft **)
+    OVS_WARN_UNUSED_RESULT;
+void raft_close(struct raft *);
+
+/* Information. */
+const char *raft_get_name(const struct raft *);
+const struct uuid *raft_get_cid(const struct raft *);
+const struct uuid *raft_get_sid(const struct raft *);
+bool raft_is_connected(const struct raft *);
+bool raft_is_leader(const struct raft *);
+
+/* Joining and leaving a cluster. */
+bool raft_is_joining(const struct raft *);
+
+void raft_leave(struct raft *);
+bool raft_is_leaving(const struct raft *);
+bool raft_left(const struct raft *);
+
+/* Running a server. */
+void raft_run(struct raft *);
+void raft_wait(struct raft *);
+
+/* Reading snapshots and log entries. */
+const struct json *raft_next_entry(struct raft *, struct uuid *eid,
+                                   bool *is_snapshot);
+bool raft_has_next_entry(const struct raft *);
+
+uint64_t raft_get_applied_index(const struct raft *);
+uint64_t raft_get_commit_index(const struct raft *);
+
+/* Writing log entries (executing commands). */
+enum raft_command_status {
+    RAFT_CMD_INCOMPLETE,        /* In progress, please wait. */
+    RAFT_CMD_SUCCESS,           /* Committed. */
+    RAFT_CMD_NOT_LEADER,        /* Failed because we are not the leader. */
+    RAFT_CMD_BAD_PREREQ,        /* Failed because prerequisite check failed. */
+    RAFT_CMD_LOST_LEADERSHIP,   /* Leadership lost after command initiation. */
+    RAFT_CMD_SHUTDOWN,          /* Raft server shut down. */
+    RAFT_CMD_IO_ERROR,          /* I/O error. */
+    RAFT_CMD_TIMEOUT,           /* Request to remote leader timed out. */
+};
+const char *raft_command_status_to_string(enum raft_command_status);
+bool raft_command_status_from_string(const char *, enum raft_command_status *);
+
+struct raft_command *raft_command_execute(struct raft *,
+                                          const struct json *data,
+                                          const struct uuid *prereq,
+                                          struct uuid *result)
+    OVS_WARN_UNUSED_RESULT;
+enum raft_command_status raft_command_get_status(const struct raft_command *);
+void raft_command_unref(struct raft_command *);
+void raft_command_wait(const struct raft_command *);
+
+/* Replacing the local log by a snapshot. */
+bool raft_has_grown(const struct raft *);
+struct ovsdb_error *raft_store_snapshot(struct raft *,
+                                        const struct json *new_snapshot)
+    OVS_WARN_UNUSED_RESULT;
+
+/* Cluster management. */
+void raft_take_leadership(struct raft *);
+void raft_transfer_leadership(struct raft *);
+
+#endif /* ovsdb/raft.h */
diff --git a/ovsdb/replication.c b/ovsdb/replication.c
index 47b0af19bbf6..d5c683bc8517 100644
--- a/ovsdb/replication.c
+++ b/ovsdb/replication.c
@@ -1,5 +1,5 @@ 
 /*
- * (c) Copyright 2016 Hewlett Packard Enterprise Development LP
+ * (c) Copyright 2016, 2017 Hewlett Packard Enterprise Development LP
  * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -536,7 +536,7 @@  reset_database(struct ovsdb *db)
         }
     }
 
-    return ovsdb_txn_commit(txn, false);
+    return ovsdb_txn_propose_commit_block(txn, false);
 }
 
 /* Create a monitor request for 'db'. The monitor request will include
@@ -615,7 +615,7 @@  process_notification(struct json *table_updates, struct ovsdb *db)
             return error;
         } else {
             /* Commit transaction. */
-            error = ovsdb_txn_commit(txn, false);
+            error = ovsdb_txn_propose_commit_block(txn, false);
         }
     }
 
diff --git a/ovsdb/row.c b/ovsdb/row.c
index 9c312947e539..755ab91a8c1c 100644
--- a/ovsdb/row.c
+++ b/ovsdb/row.c
@@ -44,6 +44,9 @@  allocate_row(const struct ovsdb_table *table)
     return row;
 }
 
+/* Creates and returns a new row suitable for insertion into 'table'.  Does not
+ * actually insert the row into 'table' (use ovsdb_txn_row_insert()).  The
+ * caller must assign a UUID to the row. */
 struct ovsdb_row *
 ovsdb_row_create(const struct ovsdb_table *table)
 {
diff --git a/ovsdb/server.c b/ovsdb/server.c
index 2a775230da6a..e1a497d78897 100644
--- a/ovsdb/server.c
+++ b/ovsdb/server.c
@@ -131,20 +131,14 @@  ovsdb_server_init(struct ovsdb_server *server)
 bool
 ovsdb_server_add_db(struct ovsdb_server *server, struct ovsdb *db)
 {
-    return shash_add_once(&server->dbs, db->schema->name, db);
+    return shash_add_once(&server->dbs, db->name, db);
 }
 
-/* Removes 'db' from the set of databases served out by 'server'.  Returns
- * true if successful, false if there is no db associated with
- * db->schema->name. */
-bool
+/* Removes 'db' from the set of databases served out by 'server'. */
+void
 ovsdb_server_remove_db(struct ovsdb_server *server, struct ovsdb *db)
 {
-    void *data = shash_find_and_delete(&server->dbs, db->schema->name);
-    if (data) {
-        return true;
-    }
-    return false;
+    shash_find_and_delete_assert(&server->dbs, db->name);
 }
 
 /* Destroys 'server'. */
diff --git a/ovsdb/server.h b/ovsdb/server.h
index 21bf1adde7af..6d997e608e66 100644
--- a/ovsdb/server.h
+++ b/ovsdb/server.h
@@ -86,7 +86,7 @@  struct ovsdb_server {
 
 void ovsdb_server_init(struct ovsdb_server *);
 bool ovsdb_server_add_db(struct ovsdb_server *, struct ovsdb *);
-bool ovsdb_server_remove_db(struct ovsdb_server *, struct ovsdb *);
+void ovsdb_server_remove_db(struct ovsdb_server *, struct ovsdb *);
 void ovsdb_server_destroy(struct ovsdb_server *);
 
 struct ovsdb_lock_waiter *ovsdb_server_lock(struct ovsdb_server *,
diff --git a/ovsdb/storage.c b/ovsdb/storage.c
new file mode 100644
index 000000000000..65e2d3908db4
--- /dev/null
+++ b/ovsdb/storage.c
@@ -0,0 +1,528 @@ 
+/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "storage.h"
+#include <string.h>
+#include "log.h"
+#include "ovsdb-error.h"
+#include "openvswitch/json.h"
+#include "openvswitch/vlog.h"
+#include "poll-loop.h"
+#include "ovsdb.h"
+#include "raft.h"
+#include "random.h"
+#include "timeval.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(storage);
+
+struct ovsdb_storage {
+    /* There are three kinds of storage:
+     *
+     *    - Standalone, backed by a disk file.  'log' is nonnull, 'raft' is
+     *      null.
+     *
+     *    - Clustered, backed by a Raft cluster.  'log' is null, 'raft' is
+     *      nonnull.
+     *
+     *    - Memory only, unbacked.  'log' and 'raft' are null. */
+    struct ovsdb_log *log;
+    struct raft *raft;
+
+    /* All kinds of storage. */
+    struct ovsdb_error *error;  /* If nonnull, a permanent error. */
+    long long next_snapshot;    /* Time at which to take next snapshot. */
+
+    /* Standalone only. */
+    unsigned int n_read;
+    unsigned int n_written;
+};
+
+static long long int next_snapshot_time(bool quick);
+
+/* Opens 'filename' for use as storage.  If 'rw', opens it for read/write
+ * access, otherwise read-only.  If successful, stores the new storage in
+ * '*storagep' and returns NULL; on failure, stores NULL in '*storagep' and
+ * returns the error.
+ *
+ * The returned storage might be clustered or standalone, depending on what
+ * 'filename' contains. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_open(const char *filename, bool rw,
+                   struct ovsdb_storage **storagep)
+{
+    struct ovsdb_log *log;
+    struct ovsdb_error *error;
+    error = ovsdb_log_open(filename, OVSDB_MAGIC"|"RAFT_MAGIC,
+                           rw ? OVSDB_LOG_READ_WRITE : OVSDB_LOG_READ_ONLY,
+                           -1, &log);
+    if (error) {
+        return error;
+    }
+
+    struct raft *raft = NULL;
+    if (!strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC)) {
+        error = raft_open__(log, &raft);
+        log = NULL;
+        if (error) {
+            return error;
+        }
+    }
+
+    struct ovsdb_storage *storage = xzalloc(sizeof *storage);
+    storage->log = log;
+    storage->raft = raft;
+    storage->next_snapshot = next_snapshot_time(false);
+    *storagep = storage;
+    return NULL;
+}
+
+/* Creates and returns new storage without any backing.  Nothing will be read
+ * from the storage, and writes are discarded. */
+struct ovsdb_storage *
+ovsdb_storage_create_unbacked(void)
+{
+    struct ovsdb_storage *storage = xzalloc(sizeof *storage);
+    storage->next_snapshot = LLONG_MAX;
+    return storage;
+}
+
+void
+ovsdb_storage_close(struct ovsdb_storage *storage)
+{
+    if (storage) {
+        ovsdb_log_close(storage->log);
+        raft_close(storage->raft);
+        ovsdb_error_destroy(storage->error);
+        free(storage);
+    }
+}
+
+const char *
+ovsdb_storage_get_model(const struct ovsdb_storage *storage)
+{
+    return storage->raft ? "clustered" : "standalone";
+}
+
+bool
+ovsdb_storage_is_clustered(const struct ovsdb_storage *storage)
+{
+    return storage->raft != NULL;
+}
+
+bool
+ovsdb_storage_is_connected(const struct ovsdb_storage *storage)
+{
+    return !storage->raft || raft_is_connected(storage->raft);
+}
+
+bool
+ovsdb_storage_is_dead(const struct ovsdb_storage *storage)
+{
+    return storage->raft && raft_left(storage->raft);
+}
+
+bool
+ovsdb_storage_is_leader(const struct ovsdb_storage *storage)
+{
+    return !storage->raft || raft_is_leader(storage->raft);
+}
+
+const struct uuid *
+ovsdb_storage_get_cid(const struct ovsdb_storage *storage)
+{
+    return storage->raft ? raft_get_cid(storage->raft) : NULL;
+}
+
+const struct uuid *
+ovsdb_storage_get_sid(const struct ovsdb_storage *storage)
+{
+    return storage->raft ? raft_get_sid(storage->raft) : NULL;
+}
+
+uint64_t
+ovsdb_storage_get_commit_index(const struct ovsdb_storage *storage)
+{
+    return storage->raft ? raft_get_commit_index(storage->raft) : 0;
+}
+
+uint64_t
+ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage)
+{
+    return storage->raft ? raft_get_applied_index(storage->raft) : 0;
+}
+
+void
+ovsdb_storage_run(struct ovsdb_storage *storage)
+{
+    if (storage->raft) {
+        raft_run(storage->raft);
+    }
+}
+
+void
+ovsdb_storage_wait(struct ovsdb_storage *storage)
+{
+    if (storage->raft) {
+        raft_wait(storage->raft);
+    }
+}
+
+/* Returns 'storage''s embedded name, if it has one, otherwise null.
+ *
+ * Only clustered storage has a built-in name.  */
+const char *
+ovsdb_storage_get_name(const struct ovsdb_storage *storage)
+{
+    return storage->raft ? raft_get_name(storage->raft) : NULL;
+}
+
+/* Attempts to read a log record from 'storage'.
+ *
+ * If successful, returns NULL.  If the record contains a schema, stores it in
+ * '*schemap', otherwise NULL; if it contains a transaction, stores it in
+ * '*txnp', otherwise NULL; and, for clustered storage, stores the record's
+ * transaction ID in '*txnid' if 'txnid' is nonnull.  The caller owns
+ * '*schemap' and '*txnp' and must eventually free them (with
+ * ovsdb_schema_destroy() and json_destroy(), respectively).
+ *
+ * If a read error occurs, returns the error and stores NULL in '*schemap' and
+ * '*txnp'.
+ *
+ * If the read reaches end of file, returns NULL and stores NULL in '*schemap'
+ * and '*txnp'. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_read(struct ovsdb_storage *storage,
+                   struct ovsdb_schema **schemap,
+                   struct json **txnp,
+                   struct uuid *txnid)
+{
+    *schemap = NULL;
+    *txnp = NULL;
+    if (txnid) {
+        *txnid = UUID_ZERO;
+    }
+
+    struct json *json;
+    struct json *schema_json = NULL;
+    struct json *txn_json = NULL;
+    if (storage->raft) {
+        bool is_snapshot;
+        json = json_nullable_clone(
+            raft_next_entry(storage->raft, txnid, &is_snapshot));
+        if (!json) {
+            return NULL;
+        } else if (json->type != JSON_ARRAY || json->u.array.n != 2) {
+            json_destroy(json);
+            return ovsdb_error(NULL, "invalid commit format");
+        }
+
+        struct json **e = json->u.array.elems;
+        schema_json = e[0]->type != JSON_NULL ? e[0] : NULL;
+        txn_json = e[1]->type != JSON_NULL ? e[1] : NULL;
+    } else if (storage->log) {
+        struct ovsdb_error *error = ovsdb_log_read(storage->log, &json);
+        if (error || !json) {
+            return error;
+        }
+
+        unsigned int n = storage->n_read++;
+        struct json **jsonp = !n ? &schema_json : &txn_json;
+        *jsonp = json;
+        if (n == 1) {
+            ovsdb_log_mark_base(storage->log);
+        }
+    } else {
+        /* Unbacked.  Nothing to do. */
+        return NULL;
+    }
+
+    /* If we got this far then we must have at least a schema or a
+     * transaction. */
+    ovs_assert(schema_json || txn_json);
+
+    if (schema_json) {
+        struct ovsdb_schema *schema;
+        struct ovsdb_error *error = ovsdb_schema_from_json(schema_json,
+                                                           &schema);
+        if (error) {
+            json_destroy(json);
+            return error;
+        }
+
+        const char *storage_name = ovsdb_storage_get_name(storage);
+        const char *schema_name = schema->name;
+        if (storage_name && strcmp(storage_name, schema_name)) {
+            error = ovsdb_error(NULL, "name %s in header does not match "
+                                "name %s in schema",
+                                storage_name, schema_name);
+            json_destroy(json);
+            ovsdb_schema_destroy(schema);
+            return error;
+        }
+
+        *schemap = schema;
+    }
+
+    if (txn_json) {
+        *txnp = json_clone(txn_json);
+    }
+
+    json_destroy(json);
+    return NULL;
+}
+
+bool
+ovsdb_storage_read_wait(struct ovsdb_storage *storage)
+{
+    if (storage->raft) {
+        return raft_has_next_entry(storage->raft);
+    } else {
+        /* XXX */
+        return false;
+    }
+}
+
+void
+ovsdb_storage_unread(struct ovsdb_storage *storage)
+{
+    if (storage->error) {
+        return;
+    }
+
+    if (storage->raft) {
+        if (!storage->error) {
+            storage->error = ovsdb_error(NULL, "inconsistent data");
+        }
+    } else if (storage->log) {
+        ovsdb_log_unread(storage->log);
+    }
+}
+
+struct ovsdb_write {
+    struct ovsdb_error *error;
+    struct raft_command *command;
+};
+
+/* Not suitable for writing transactions that change the schema. */
+struct ovsdb_write * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_write(struct ovsdb_storage *storage, const struct json *data,
+                    const struct uuid *prereq, struct uuid *resultp,
+                    bool durable)
+{
+    struct ovsdb_write *w = xzalloc(sizeof *w);
+    struct uuid result = UUID_ZERO;
+    if (storage->error) {
+        w->error = ovsdb_error_clone(storage->error);
+    } else if (storage->raft) {
+        struct json *txn_json = json_array_create_2(json_null_create(),
+                                                    json_clone(data));
+        w->command = raft_command_execute(storage->raft, txn_json,
+                                          prereq, &result);
+        json_destroy(txn_json);
+    } else if (storage->log) {
+        w->error = ovsdb_log_write(storage->log, data);
+        if (!w->error) {
+            storage->n_written++;
+            if (durable) {
+                w->error = ovsdb_log_commit(storage->log);
+            }
+        }
+    } else {
+        /* When 'error' and 'command' are both null, it indicates that the
+         * command is complete.  This is fine since this unbacked storage drops
+         * writes. */
+    }
+    if (resultp) {
+        *resultp = result;
+    }
+    return w;
+}
+
+/* Not suitable for writing transactions that change the schema. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_write_block(struct ovsdb_storage *storage,
+                          const struct json *data, const struct uuid *prereq,
+                          struct uuid *resultp, bool durable)
+{
+    struct ovsdb_write *w = ovsdb_storage_write(storage, data,
+                                                prereq, resultp, durable);
+    while (!ovsdb_write_is_complete(w)) {
+        if (storage->raft) {
+            raft_run(storage->raft);
+        }
+
+        ovsdb_write_wait(w);
+        if (storage->raft) {
+            raft_wait(storage->raft);
+        }
+        poll_block();
+    }
+
+    struct ovsdb_error *error = ovsdb_error_clone(ovsdb_write_get_error(w));
+    ovsdb_write_destroy(w);
+    return error;
+}
+
+bool
+ovsdb_write_is_complete(const struct ovsdb_write *w)
+{
+    return (w->error
+            || !w->command
+            || raft_command_get_status(w->command) != RAFT_CMD_INCOMPLETE);
+}
+
+const struct ovsdb_error *
+ovsdb_write_get_error(const struct ovsdb_write *w_)
+{
+    struct ovsdb_write *w = CONST_CAST(struct ovsdb_write *, w_);
+    if (w->command) {
+        enum raft_command_status status = raft_command_get_status(w->command);
+        ovs_assert(status != RAFT_CMD_INCOMPLETE);
+        if (status != RAFT_CMD_SUCCESS) {
+            w->error = ovsdb_error("cluster error", "%s",
+                                   raft_command_status_to_string(status));
+        }
+        raft_command_unref(w->command);
+        w->command = NULL;
+    }
+
+    return w->error;
+}
+
+void
+ovsdb_write_wait(const struct ovsdb_write *w)
+{
+    if (ovsdb_write_is_complete(w)) {
+        poll_immediate_wake();
+    }
+}
+
+void
+ovsdb_write_destroy(struct ovsdb_write *w)
+{
+    if (w) {
+        raft_command_unref(w->command);
+        ovsdb_error_destroy(w->error);
+        free(w);
+    }
+}
+
+static long long int
+next_snapshot_time(bool quick)
+{
+    unsigned int base = 10 * 60 * 1000;  /* 10 minutes */
+    unsigned int range = 10 * 60 * 1000; /* 10 minutes */
+    if (quick) {
+        base /= 10;
+        range /= 10;
+    }
+
+    return time_msec() + base + random_range(range);
+}
+
+bool
+ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
+{
+    if (time_msec() < storage->next_snapshot) {
+        return false;
+    }
+
+    if (storage->raft) {
+        return raft_has_grown(storage->raft);
+    } else if (storage->log) {
+        return (storage->n_read + storage->n_written >= 100
+                && ovsdb_log_has_grown(storage->log));
+    }
+
+    return false;
+}
+
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_store_snapshot__(struct ovsdb_storage *storage,
+                               const struct json *schema,
+                               const struct json *data)
+{
+    if (storage->raft) {
+        struct json *entries = json_array_create_empty();
+        if (schema) {
+            json_array_add(entries, json_clone(schema));
+        }
+        if (data) {
+            json_array_add(entries, json_clone(data));
+        }
+        struct ovsdb_error *error = raft_store_snapshot(storage->raft,
+                                                        entries);
+        json_destroy(entries);
+        return error;
+    } else if (storage->log) {
+        struct json *entries[2];
+        size_t n = 0;
+        if (schema) {
+            entries[n++] = CONST_CAST(struct json *, schema);
+        }
+        if (data) {
+            entries[n++] = CONST_CAST(struct json *, data);
+        }
+        return ovsdb_log_replace(storage->log, entries, n);
+    } else {
+        return NULL;
+    }
+}
+
+/* 'schema' and 'data' should faithfully represent the current schema and
+ * data; otherwise, the stored snapshot will diverge from the in-memory
+ * database.  Use ovsdb_storage_write_schema_change() to change the schema. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_store_snapshot(struct ovsdb_storage *storage,
+                             const struct json *schema,
+                             const struct json *data)
+{
+    struct ovsdb_error *error = ovsdb_storage_store_snapshot__(storage,
+                                                               schema, data);
+    bool retry_quickly = error != NULL;
+    storage->next_snapshot = next_snapshot_time(retry_quickly);
+    return error;
+}
+
+struct ovsdb_write * OVS_WARN_UNUSED_RESULT
+ovsdb_storage_write_schema_change(struct ovsdb_storage *storage,
+                                  const struct json *schema,
+                                  const struct json *data,
+                                  const struct uuid *prereq,
+                                  struct uuid *resultp)
+{
+    struct ovsdb_write *w = xzalloc(sizeof *w);
+    struct uuid result = UUID_ZERO;
+    if (storage->error) {
+        w->error = ovsdb_error_clone(storage->error);
+    } else if (storage->raft) {
+        struct json *txn_json = json_array_create_2(json_clone(schema),
+                                                    json_clone(data));
+        w->command = raft_command_execute(storage->raft, txn_json,
+                                          prereq, &result);
+        json_destroy(txn_json);
+    } else if (storage->log) {
+        w->error = ovsdb_storage_store_snapshot__(storage, schema, data);
+    } else {
+        /* When 'error' and 'command' are both null, it indicates that the
+         * command is complete.  This is fine since this unbacked storage drops
+         * writes. */
+    }
+    if (resultp) {
+        *resultp = result;
+    }
+    return w;
+}
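
A minimal sketch of how a caller might replay a database using the storage API
above (it assumes a database file named "conf.db" and abbreviates error
handling; the loop body is only a placeholder):

    struct ovsdb_storage *storage;
    struct ovsdb_error *error = ovsdb_storage_open("conf.db", false, &storage);
    if (error) {
        ovs_fatal(0, "%s", ovsdb_error_to_string(error));
    }
    for (;;) {
        struct ovsdb_schema *schema;
        struct json *txn;
        struct uuid txnid;

        error = ovsdb_storage_read(storage, &schema, &txn, &txnid);
        if (error || (!schema && !txn)) {
            break;              /* Read error or end of log. */
        }
        /* Install 'schema' and/or apply 'txn' to the in-memory database. */
        if (schema) {
            ovsdb_schema_destroy(schema);
        }
        json_destroy(txn);
    }
    ovsdb_storage_close(storage);
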
diff --git a/ovsdb/storage.h b/ovsdb/storage.h
new file mode 100644
index 000000000000..23477b7e6dcf
--- /dev/null
+++ b/ovsdb/storage.h
@@ -0,0 +1,88 @@ 
+/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef OVSDB_STORAGE_H
+#define OVSDB_STORAGE_H 1
+
+#include <stdint.h>
+#include <sys/types.h>
+#include "compiler.h"
+
+struct json;
+struct ovsdb_schema;
+struct ovsdb_storage;
+struct ovsdb_completion;
+struct uuid;
+
+struct ovsdb_error *ovsdb_storage_open(const char *filename, bool rw,
+                                       struct ovsdb_storage **)
+    OVS_WARN_UNUSED_RESULT;
+struct ovsdb_storage *ovsdb_storage_create_unbacked(void);
+void ovsdb_storage_close(struct ovsdb_storage *);
+
+const char *ovsdb_storage_get_model(const struct ovsdb_storage *);
+bool ovsdb_storage_is_clustered(const struct ovsdb_storage *);
+bool ovsdb_storage_is_connected(const struct ovsdb_storage *);
+bool ovsdb_storage_is_dead(const struct ovsdb_storage *);
+bool ovsdb_storage_is_leader(const struct ovsdb_storage *);
+const struct uuid *ovsdb_storage_get_cid(const struct ovsdb_storage *);
+const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *);
+uint64_t ovsdb_storage_get_commit_index(const struct ovsdb_storage *);
+uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *);
+
+void ovsdb_storage_run(struct ovsdb_storage *);
+void ovsdb_storage_wait(struct ovsdb_storage *);
+
+const char *ovsdb_storage_get_name(const struct ovsdb_storage *);
+
+struct ovsdb_error *ovsdb_storage_read(struct ovsdb_storage *,
+                                       struct ovsdb_schema **schemap,
+                                       struct json **txnp,
+                                       struct uuid *txnid)
+    OVS_WARN_UNUSED_RESULT;
+bool ovsdb_storage_read_wait(struct ovsdb_storage *);
+
+void ovsdb_storage_unread(struct ovsdb_storage *);
+
+struct ovsdb_write *ovsdb_storage_write(struct ovsdb_storage *,
+                                        const struct json *,
+                                        const struct uuid *prereq,
+                                        struct uuid *result,
+                                        bool durable)
+    OVS_WARN_UNUSED_RESULT;
+struct ovsdb_error *ovsdb_storage_write_block(struct ovsdb_storage *,
+                                              const struct json *,
+                                              const struct uuid *prereq,
+                                              struct uuid *result,
+                                              bool durable);
+
+bool ovsdb_write_is_complete(const struct ovsdb_write *);
+const struct ovsdb_error *ovsdb_write_get_error(const struct ovsdb_write *);
+void ovsdb_write_wait(const struct ovsdb_write *);
+void ovsdb_write_destroy(struct ovsdb_write *);
+
+bool ovsdb_storage_should_snapshot(const struct ovsdb_storage *);
+struct ovsdb_error *ovsdb_storage_store_snapshot(struct ovsdb_storage *storage,
+                                                 const struct json *schema,
+                                                 const struct json *snapshot)
+    OVS_WARN_UNUSED_RESULT;
+
+struct ovsdb_write *ovsdb_storage_write_schema_change(
+    struct ovsdb_storage *,
+    const struct json *schema, const struct json *data,
+    const struct uuid *prereq, struct uuid *result)
+    OVS_WARN_UNUSED_RESULT;
+
+#endif /* ovsdb/storage.h */
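
Writes through this interface are asynchronous: ovsdb_storage_write() returns
a handle that the caller polls, much as ovsdb_storage_write_block() does
internally.  A minimal sketch, assuming the caller already holds 'storage' and
a transaction record 'txn_json':

    struct uuid txnid;
    struct ovsdb_write *w = ovsdb_storage_write(storage, txn_json,
                                                NULL, &txnid, true);
    while (!ovsdb_write_is_complete(w)) {
        ovsdb_storage_run(storage);      /* Lets Raft make progress. */
        ovsdb_write_wait(w);
        ovsdb_storage_wait(storage);
        poll_block();
    }
    struct ovsdb_error *error = ovsdb_error_clone(ovsdb_write_get_error(w));
    ovsdb_write_destroy(w);
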
diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c
index e38a174109bb..5b948e668827 100644
--- a/ovsdb/transaction.c
+++ b/ovsdb/transaction.c
@@ -25,13 +25,17 @@ 
 #include "openvswitch/hmap.h"
 #include "openvswitch/json.h"
 #include "openvswitch/list.h"
+#include "openvswitch/vlog.h"
 #include "ovsdb-error.h"
 #include "ovsdb.h"
+#include "poll-loop.h"
 #include "row.h"
+#include "storage.h"
 #include "table.h"
-#include "perf-counter.h"
 #include "uuid.h"
 
+VLOG_DEFINE_THIS_MODULE(transaction);
+
 struct ovsdb_txn {
     struct ovsdb *db;
     struct ovs_list txn_tables; /* Contains "struct ovsdb_txn_table"s. */
@@ -812,8 +816,8 @@  ovsdb_txn_is_empty(const struct ovsdb_txn *txn)
     return ovs_list_is_empty(&txn->txn_tables);
 }
 
-struct ovsdb_error * OVS_WARN_UNUSED_RESULT
-ovsdb_txn_start_commit(struct ovsdb_txn *txn)
+static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_txn_precommit(struct ovsdb_txn *txn)
 {
     struct ovsdb_error *error;
 
@@ -823,7 +827,7 @@  ovsdb_txn_start_commit(struct ovsdb_txn *txn)
     if (error) {
         return OVSDB_WRAP_BUG("can't happen", error);
     }
-    if (ovsdb_txn_is_empty(txn)) {
+    if (ovs_list_is_empty(&txn->txn_tables)) {
         return NULL;
     }
 
@@ -864,41 +868,197 @@  ovsdb_txn_start_commit(struct ovsdb_txn *txn)
         return OVSDB_WRAP_BUG("can't happen", error);
     }
 
-    return NULL;
+    return error;
 }
 
-struct ovsdb_error *
-ovsdb_txn_finish_commit(struct ovsdb_txn *txn, bool durable)
+/* Finalizes the commit of 'txn': updates the in-memory database, notifies
+ * monitors, schedules triggers to re-run, and frees 'txn'. */
+void
+ovsdb_txn_complete(struct ovsdb_txn *txn)
 {
-    /* Send the commit to each replica. */
-    if (txn->db->file) {
-        struct ovsdb_error *error = ovsdb_file_commit(txn->db->file, txn,
-                                                      durable);
-        if (error) {
-            ovsdb_txn_abort(txn);
-            return error;
-        }
+    if (ovsdb_txn_is_empty(txn)) {
+        return;
     }
-    ovsdb_monitors_commit(txn->db, txn);
-
-    /* Finalize commit. */
     txn->db->run_triggers = true;
+    ovsdb_monitors_commit(txn->db, txn);
     ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs));
     ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_row_commit));
     ovsdb_txn_free(txn);
-
-    return NULL;
 }
 
+/* Applies 'txn' to the internal representation of the database.  This is for
+ * transactions that don't need to be written to storage, typically because
+ * they were read from storage.  These transactions shouldn't ordinarily fail
+ * because storage should contain only consistent transactions.  (One exception
+ * is database conversion in ovsdb_convert().) */
 struct ovsdb_error *
-ovsdb_txn_commit(struct ovsdb_txn *txn, bool durable)
+ovsdb_txn_replay_commit(struct ovsdb_txn *txn)
 {
-    struct ovsdb_error *error = ovsdb_txn_start_commit(txn);
-    if (error || ovsdb_txn_is_empty(txn)) {
+    struct ovsdb_error *error = ovsdb_txn_precommit(txn);
+    if (error) {
         ovsdb_txn_abort(txn);
-        return error;
+    } else {
+        ovsdb_txn_complete(txn);
+    }
+    return error;
+}
+
+/* If 'error' is nonnull, the transaction is complete, with the given error as
+ * the result.
+ *
+ * Otherwise, if 'write' is nonnull, then the transaction is waiting for
+ * 'write' to complete.
+ *
+ * Otherwise, if 'commit_index' is nonzero, then the transaction is waiting for
+ * 'commit_index' to be applied to the storage.
+ *
+ * Otherwise, the transaction is complete and successful. */
+struct ovsdb_txn_progress {
+    struct ovsdb_error *error;
+    struct ovsdb_write *write;
+    uint64_t commit_index;
+
+    struct ovsdb_storage *storage;
+};
+
+struct ovsdb_txn_progress *
+ovsdb_txn_propose_schema_change(struct ovsdb *db,
+                                const struct json *schema,
+                                const struct json *data)
+{
+    struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress);
+    progress->storage = db->storage;
+
+    struct uuid next;
+    struct ovsdb_write *write = ovsdb_storage_write_schema_change(
+        db->storage, schema, data, &db->prereq, &next);
+    //txn->db->prereq = next;     /* XXX */
+    if (!ovsdb_write_is_complete(write)) {
+        progress->write = write;
+    } else {
+        progress->error = ovsdb_error_clone(ovsdb_write_get_error(write));
+        ovsdb_write_destroy(write);
+    }
+    return progress;
+}
+
+struct ovsdb_txn_progress *
+ovsdb_txn_propose_commit(struct ovsdb_txn *txn, bool durable)
+{
+    struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress);
+    progress->storage = txn->db->storage;
+    progress->error = ovsdb_txn_precommit(txn);
+    if (progress->error) {
+        return progress;
+    }
+
+    /* Turn the commit into the format used for the storage logs. */
+    struct json *txn_json = ovsdb_file_txn_to_json(txn);
+    if (!txn_json) {
+        /* Nothing to do, so success. */
+        return progress;
+    }
+    txn_json = ovsdb_file_txn_annotate(txn_json, ovsdb_txn_get_comment(txn));
+
+    struct uuid next;
+    struct ovsdb_write *write = ovsdb_storage_write(
+        txn->db->storage, txn_json, &txn->db->prereq, &next, durable);
+    json_destroy(txn_json);
+    //txn->db->prereq = next;     /* XXX */
+    if (!ovsdb_write_is_complete(write)) {
+        progress->write = write;
+    } else {
+        progress->error = ovsdb_error_clone(ovsdb_write_get_error(write));
+        ovsdb_write_destroy(write);
+    }
+    return progress;
+}
+
+/* Proposes 'txn' for commitment and then waits for the commit to succeed or
+ * fail.  Returns null if successful, otherwise the error.
+ *
+ * In addition, this function completes or aborts the transaction if the
+ * transaction succeeded or failed, respectively. */
+struct ovsdb_error * OVS_WARN_UNUSED_RESULT
+ovsdb_txn_propose_commit_block(struct ovsdb_txn *txn, bool durable)
+{
+    struct ovsdb_txn_progress *p = ovsdb_txn_propose_commit(txn, durable);
+    for (;;) {
+        ovsdb_storage_run(p->storage);
+        if (ovsdb_txn_progress_is_complete(p)) {
+            struct ovsdb_error *error
+                = ovsdb_error_clone(ovsdb_txn_progress_get_error(p));
+            ovsdb_txn_progress_destroy(p);
+
+            if (error) {
+                ovsdb_txn_abort(txn);
+            } else {
+                ovsdb_txn_complete(txn);
+            }
+
+            return error;
+        }
+        ovsdb_storage_wait(p->storage);
+        poll_block();
+    }
+}
+
+static void
+ovsdb_txn_progress_run(struct ovsdb_txn_progress *p)
+{
+    if (p->error) {
+        return;
+    }
+
+    if (p->write) {
+        if (!ovsdb_write_is_complete(p->write)) {
+            return;
+        }
+        p->error = ovsdb_error_clone(ovsdb_write_get_error(p->write));
+        ovsdb_write_destroy(p->write);
+        p->write = NULL;
+
+        if (p->error) {
+            return;
+        }
+
+        p->commit_index = ovsdb_storage_get_commit_index(p->storage);
+    }
+
+    if (p->commit_index) {
+        if (ovsdb_storage_get_applied_index(p->storage) >= p->commit_index) {
+            p->commit_index = 0;
+        }
+    }
+}
+
+static bool
+ovsdb_txn_progress_is_complete__(const struct ovsdb_txn_progress *p)
+{
+    return p->error || (!p->write && !p->commit_index);
+}
+
+bool
+ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *p)
+{
+    ovsdb_txn_progress_run(CONST_CAST(struct ovsdb_txn_progress *, p));
+    return ovsdb_txn_progress_is_complete__(p);
+}
+
+const struct ovsdb_error *
+ovsdb_txn_progress_get_error(const struct ovsdb_txn_progress *p)
+{
+    ovs_assert(ovsdb_txn_progress_is_complete__(p));
+    return p->error;
+}
+
+void
+ovsdb_txn_progress_destroy(struct ovsdb_txn_progress *p)
+{
+    if (p) {
+        ovsdb_error_destroy(p->error);
+        ovsdb_write_destroy(p->write);
+        free(p);
     }
-    return ovsdb_txn_finish_commit(txn, durable);
 }
 
 void
diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h
index f9b886411bf4..32384fcd3502 100644
--- a/ovsdb/transaction.h
+++ b/ovsdb/transaction.h
@@ -1,4 +1,4 @@ 
-/* Copyright (c) 2009, 2010 Nicira, Inc.
+/* Copyright (c) 2009, 2010, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,12 +27,23 @@  struct uuid;
 struct ovsdb_txn *ovsdb_txn_create(struct ovsdb *);
 void ovsdb_txn_abort(struct ovsdb_txn *);
 
-struct ovsdb_error *ovsdb_txn_start_commit(struct ovsdb_txn *)
+struct ovsdb_error *ovsdb_txn_replay_commit(struct ovsdb_txn *)
     OVS_WARN_UNUSED_RESULT;
-struct ovsdb_error *ovsdb_txn_finish_commit(struct ovsdb_txn *, bool durable)
+struct ovsdb_txn_progress *ovsdb_txn_propose_commit(struct ovsdb_txn *,
+                                                    bool durable)
     OVS_WARN_UNUSED_RESULT;
-struct ovsdb_error *ovsdb_txn_commit(struct ovsdb_txn *, bool durable)
+struct ovsdb_error *ovsdb_txn_propose_commit_block(struct ovsdb_txn *,
+                                                   bool durable)
     OVS_WARN_UNUSED_RESULT;
+void ovsdb_txn_complete(struct ovsdb_txn *);
+
+struct ovsdb_txn_progress *ovsdb_txn_propose_schema_change(
+    struct ovsdb *, const struct json *schema, const struct json *data);
+
+bool ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *);
+const struct ovsdb_error *ovsdb_txn_progress_get_error(
+    const struct ovsdb_txn_progress *);
+void ovsdb_txn_progress_destroy(struct ovsdb_txn_progress *);
 
 struct ovsdb_row *ovsdb_txn_row_modify(struct ovsdb_txn *,
                                        const struct ovsdb_row *);
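
The commit path is now split into propose, poll, and complete steps.  A
minimal sketch of the non-blocking flow, mirroring
ovsdb_txn_propose_commit_block() above ('txn' and 'durable' are assumed to
come from the caller):

    struct ovsdb_txn_progress *p = ovsdb_txn_propose_commit(txn, durable);

    /* ...later, from the main loop, once the proposal has finished... */
    if (ovsdb_txn_progress_is_complete(p)) {
        struct ovsdb_error *error
            = ovsdb_error_clone(ovsdb_txn_progress_get_error(p));
        ovsdb_txn_progress_destroy(p);
        if (error) {
            ovsdb_txn_abort(txn);
        } else {
            ovsdb_txn_complete(txn);
        }
    }
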
diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c
index 10b155f8bcb4..eed94e635d49 100644
--- a/ovsdb/trigger.c
+++ b/ovsdb/trigger.c
@@ -20,16 +20,19 @@ 
 #include <limits.h>
 
 #include "file.h"
-#include "log.h"
 #include "openvswitch/json.h"
 #include "jsonrpc.h"
 #include "ovsdb.h"
 #include "ovsdb-error.h"
 #include "poll-loop.h"
 #include "server.h"
+#include "transaction.h"
+#include "openvswitch/vlog.h"
 
+VLOG_DEFINE_THIS_MODULE(trigger);
 
 static bool ovsdb_trigger_try(struct ovsdb_trigger *, long long int now);
+static void ovsdb_trigger_complete(struct ovsdb_trigger *);
 static void trigger_error(struct ovsdb_trigger *, struct ovsdb_error *);
 static void trigger_success(struct ovsdb_trigger *, struct json *result);
 
@@ -46,6 +49,7 @@  ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db,
     ovs_list_push_back(&trigger->db->triggers, &trigger->node);
     trigger->request = request;
     trigger->reply = NULL;
+    trigger->progress = NULL;
     trigger->created = now;
     trigger->timeout_msec = LLONG_MAX;
     trigger->read_only = read_only;
@@ -57,6 +61,7 @@  ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db,
 void
 ovsdb_trigger_destroy(struct ovsdb_trigger *trigger)
 {
+    ovsdb_txn_progress_destroy(trigger->progress);
     ovs_list_remove(&trigger->node);
     jsonrpc_msg_destroy(trigger->request);
     jsonrpc_msg_destroy(trigger->reply);
@@ -67,7 +72,7 @@  ovsdb_trigger_destroy(struct ovsdb_trigger *trigger)
 bool
 ovsdb_trigger_is_complete(const struct ovsdb_trigger *trigger)
 {
-    return trigger->reply != NULL;
+    return trigger->reply && !trigger->progress;
 }
 
 struct jsonrpc_msg *
@@ -81,18 +86,28 @@  ovsdb_trigger_steal_reply(struct ovsdb_trigger *trigger)
 void
 ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger)
 {
-    if (!strcmp(trigger->request->method, "transact")) {
-        trigger_error(trigger, ovsdb_error("canceled", NULL));
-    } else if (!strcmp(trigger->request->method, "convert_db")) {
-        /* We don't cancel convert_db requests when a database is being
-         * replaced for two reasons.  First, we expect the administrator to do
-         * some kind of sensible synchronization on conversion requests, that
-         * is, it only really makes sense for the admin to do a single
-         * conversion at a time at a scheduled point.  Second, if we did then
-         * every convert_db request would end up getting canceled since
-         * convert_db itself causes the database to be replaced. */
-    } else {
-        OVS_NOT_REACHED();
+    if (!ovsdb_trigger_is_complete(trigger)) {
+        if (!strcmp(trigger->request->method, "transact")) {
+            if (trigger->progress) {
+                /* XXX The transaction still might complete asynchronously. */
+                ovsdb_txn_progress_destroy(trigger->progress);
+                trigger->progress = NULL;
+
+                jsonrpc_msg_destroy(trigger->reply);
+                trigger->reply = NULL;
+            }
+            trigger_error(trigger, ovsdb_error("canceled", NULL));
+        } else if (!strcmp(trigger->request->method, "convert_db")) {
+            /* We don't cancel convert_db requests when a database is being
+             * replaced for two reasons.  First, we expect the administrator to
+             * do some kind of sensible synchronization on conversion requests,
+             * that is, it only really makes sense for the admin to do a single
+             * conversion at a time at a scheduled point.  Second, if we did
+             * then every convert_db request would end up getting canceled
+             * since convert_db itself causes the database to be replaced. */
+        } else {
+            OVS_NOT_REACHED();
+        }
     }
 }
 
@@ -107,7 +122,9 @@  ovsdb_trigger_run(struct ovsdb *db, long long int now)
     bool disconnect_all = false;
 
     LIST_FOR_EACH_SAFE (t, next, node, &db->triggers) {
-        if (run_triggers || now - t->created >= t->timeout_msec) {
+        if (run_triggers
+            || now - t->created >= t->timeout_msec
+            || t->progress) {
             if (ovsdb_trigger_try(t, now)) {
                 disconnect_all = true;
             }
@@ -146,56 +163,155 @@  ovsdb_trigger_wait(struct ovsdb *db, long long int now)
 static bool
 ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now)
 {
-    if (!strcmp(t->request->method, "transact")) {
-        struct json *result = ovsdb_execute(t->db, t->session,
-                                            t->request->params, t->read_only,
-                                            t->role, t->id, now - t->created,
-                                            &t->timeout_msec);
-        if (result) {
-            trigger_success(t, result);
+    /* Handle "initialized" state. */
+    if (!t->reply) {
+        ovs_assert(!t->progress);
+
+        struct ovsdb_txn *txn = NULL;
+        struct ovsdb *newdb = NULL;
+        if (!strcmp(t->request->method, "transact")) {
+            bool durable;
+
+            struct json *result;
+            txn = ovsdb_execute_compose(
+                t->db, t->session, t->request->params, t->read_only,
+                t->role, t->id, now - t->created, &t->timeout_msec,
+                &durable, &result);
+            if (!txn) {
+                if (result) {
+                    /* Complete.  There was an error but we still represent it
+                     * in JSON-RPC as a successful result. */
+                    trigger_success(t, result);
+                } else {
+                    /* Unsatisfied "wait" condition.  Take no action now, retry
+                     * later. */
+                }
+                return false;
+            }
+
+            /* Transition to "committing" state. */
+            t->reply = jsonrpc_create_reply(result, t->request->id);
+            t->progress = ovsdb_txn_propose_commit(txn, durable);
+        } else if (!strcmp(t->request->method, "convert_db")) {
+            /* Validate parameters. */
+            const struct json *params = t->request->params;
+            if (params->type != JSON_ARRAY || params->u.array.n != 2) {
+                trigger_error(t, ovsdb_syntax_error(params, NULL,
+                                                    "array expected"));
+                return false;
+            }
+
+            /* Parse new schema and make a converted copy. */
+            const struct json *new_schema_json = params->u.array.elems[1];
+            struct ovsdb_schema *new_schema;
+            struct ovsdb_error *error
+                = ovsdb_schema_from_json(new_schema_json, &new_schema);
+            if (!error && strcmp(new_schema->name, t->db->schema->name)) {
+                error = ovsdb_error("invalid parameters",
+                                    "new schema name (%s) does not match "
+                                    "database name (%s)",
+                                    new_schema->name, t->db->schema->name);
+            }
+            if (!error) {
+                error = ovsdb_convert(t->db, new_schema, &newdb);
+            }
+            if (error) {
+                ovsdb_schema_destroy(new_schema);
+                trigger_error(t, error);
+                return false;
+            }
+
+            /* Make the new copy into a transaction log record. */
+            struct json *txn_json = ovsdb_to_txn_json(
+                newdb, "converted by ovsdb-server");
+
+            /* Propose the change. */
+            t->progress = ovsdb_txn_propose_schema_change(
+                t->db, new_schema_json, txn_json);
+            json_destroy(txn_json);
+            t->reply = jsonrpc_create_reply(json_object_create(),
+                                            t->request->id);
+        } else {
+            OVS_NOT_REACHED();
         }
-        return false;
-    } else if (!strcmp(t->request->method, "convert_db")) {
-        /* Validate parameters. */
-        const struct json *params = t->request->params;
-        if (params->type != JSON_ARRAY || params->u.array.n != 2) {
-            trigger_error(t, ovsdb_syntax_error(params, NULL,
-                                                "array expected"));
+
+        /* If the transaction committed synchronously, complete it and
+         * transition to "complete".  This is more than an optimization because
+         * the file-based storage isn't implemented to read back the
+         * transactions that we write (which is an ugly broken abstraction but
+         * it's what we have). */
+        if (ovsdb_txn_progress_is_complete(t->progress)
+            && !ovsdb_txn_progress_get_error(t->progress)) {
+            if (txn) {
+                ovsdb_txn_complete(txn);
+            }
+            ovsdb_txn_progress_destroy(t->progress);
+            t->progress = NULL;
+            ovsdb_trigger_complete(t);
+            if (newdb) {
+                ovsdb_replace(t->db, newdb);
+                return true;
+            }
             return false;
         }
+        ovsdb_destroy(newdb);
 
-        /* Parse new schema and make a converted copy. */
-        const struct json *new_schema_json = params->u.array.elems[1];
-        struct ovsdb_schema *new_schema;
-        struct ovsdb_error *error = ovsdb_schema_from_json(new_schema_json,
-                                                           &new_schema);
-        if (!error && strcmp(new_schema->name, t->db->schema->name)) {
-            error = ovsdb_error(
-                "invalid parameters",
-                "new schema name (%s) does not match database name (%s)",
-                new_schema->name, t->db->schema->name);
+        /* Fall through to the general handling for the "committing" state.  We
+         * abort the transaction--if and when it eventually commits, we'll read
+         * it back from storage and replay it locally. */
+        if (txn) {
+            ovsdb_txn_abort(txn);
         }
-        if (!error) {
-            error = ovsdb_file_convert(t->db->file, new_schema);
+    }
+
+    /* Handle "committing" state. */
+    if (t->progress) {
+        if (!ovsdb_txn_progress_is_complete(t->progress)) {
+            return false;
         }
-        ovsdb_schema_destroy(new_schema);
+
+        /* Transition to "complete". */
+        struct ovsdb_error *error
+            = ovsdb_error_clone(ovsdb_txn_progress_get_error(t->progress));
+        ovsdb_txn_progress_destroy(t->progress);
+        t->progress = NULL;
+
         if (error) {
-            trigger_error(t, error);
-            return false;
+            if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) {
+                /* Temporary error.  Transition back to "initialized" state to
+                 * try again. */
+                jsonrpc_msg_destroy(t->reply);
+                t->reply = NULL;
+                t->db->run_triggers = true; /* XXX? */
+                ovsdb_error_destroy(error);
+            } else {
+                /* Permanent error.  Transition to "completed" state to report
+                 * it. */
+                if (!strcmp(t->request->method, "transact")) {
+                    json_array_add(t->reply->result,
+                                   ovsdb_error_to_json_free(error));
+                    ovsdb_trigger_complete(t);
+                } else if (!strcmp(t->request->method, "convert_db")) {
+                    jsonrpc_msg_destroy(t->reply);
+                    t->reply = NULL;
+                    trigger_error(t, error);
+                }
+            }
+        } else {
+            /* Success. */
+            ovsdb_trigger_complete(t);
         }
 
-        trigger_success(t, json_object_create());
-        return true;
-    } else {
-        OVS_NOT_REACHED();
+        return false;
     }
+
+    OVS_NOT_REACHED();
 }
 
 static void
-ovsdb_trigger_complete(struct ovsdb_trigger *t, struct jsonrpc_msg *reply)
+ovsdb_trigger_complete(struct ovsdb_trigger *t)
 {
-    ovs_assert(reply && !t->reply);
-    t->reply = reply;
+    ovs_assert(t->reply);
     ovs_list_remove(&t->node);
     ovs_list_push_back(&t->session->completions, &t->node);
 }
@@ -203,14 +319,16 @@  ovsdb_trigger_complete(struct ovsdb_trigger *t, struct jsonrpc_msg *reply)
 static void
 trigger_error(struct ovsdb_trigger *t, struct ovsdb_error *error)
 {
-    struct jsonrpc_msg *reply = jsonrpc_create_error(
+    ovs_assert(error && !t->reply);
+    t->reply = jsonrpc_create_error(
         ovsdb_error_to_json_free(error), t->request->id);
-    ovsdb_trigger_complete(t, reply);
+    ovsdb_trigger_complete(t);
 }
 
 static void
 trigger_success(struct ovsdb_trigger *t, struct json *result)
 {
-    struct jsonrpc_msg *reply = jsonrpc_create_reply(result, t->request->id);
-    ovsdb_trigger_complete(t, reply);
+    ovs_assert(result && !t->reply);
+    t->reply = jsonrpc_create_reply(result, t->request->id);
+    ovsdb_trigger_complete(t);
 }
diff --git a/ovsdb/trigger.h b/ovsdb/trigger.h
index d9df97f31222..a9fed829b866 100644
--- a/ovsdb/trigger.h
+++ b/ovsdb/trigger.h
@@ -20,13 +20,35 @@ 
 
 struct ovsdb;
 
+/* Triggers have the following states:
+ *
+ *    - Initialized (reply == NULL, progress == NULL): Executing the trigger
+ *      can keep it in the initialized state, if it has a "wait" condition that
+ *      isn't met.  Executing the trigger can also yield an error, in which
+ *      case it transitions to "complete".  Otherwise, execution yields a
+ *      transaction, which the database attempts to commit.  If the transaction
+ *      completes immediately and synchronously, then the trigger transitions
+ *      to the "complete" state.  If the transaction requires some time to
+ *      complete, it transitions to the "committing" state.
+ *
+ *    - Committing (reply != NULL, progress != NULL): The transaction is
+ *      committing.  If it succeeds, or if it fails permanently, then the
+ *      trigger transitions to "complete".  If it fails temporarily
+ *      (e.g. because someone else committed to cluster-based storage before we
+ *      did), then we transition back to "initialized" to try again.
+ *
+ *    - Complete (reply != NULL, progress == NULL): The transaction is done
+ *      and either succeeded or failed.
+ */
 struct ovsdb_trigger {
+    /* In "initialized" or "committing" state, in db->triggers.
+     * In "complete", in session->completions. */
+    struct ovs_list node;
     struct ovsdb_session *session; /* Session that owns this trigger. */
     struct ovsdb *db;           /* Database on which trigger acts. */
-    struct ovs_list node;       /* !result: in db->triggers;
-                                 * result: in session->completions. */
     struct jsonrpc_msg *request; /* Database request. */
     struct jsonrpc_msg *reply;   /* Result (null if none yet). */
+    struct ovsdb_txn_progress *progress;
     long long int created;      /* Time created. */
     long long int timeout_msec; /* Max wait duration. */
     bool read_only;             /* Database is in read only mode. */
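
A hypothetical helper, not part of the patch, that spells out the state
encoding described above:

    static inline const char *
    ovsdb_trigger_state_name(const struct ovsdb_trigger *t)
    {
        return (!t->reply ? "initialized"
                : t->progress ? "committing"
                : "complete");
    }
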
diff --git a/tests/.gitignore b/tests/.gitignore
index 294e6fb6dafa..3e2ddf2e9e5d 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -34,6 +34,7 @@ 
 /test-ofpbuf
 /test-ovsdb
 /test-packets
+/test-raft
 /test-random
 /test-reconnect
 /test-rstp
diff --git a/tests/automake.mk b/tests/automake.mk
index 3cfc0e9bdccb..5782e98097f0 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -12,7 +12,12 @@  EXTRA_DIST += \
 	tests/atlocal.in \
 	$(srcdir)/package.m4 \
 	$(srcdir)/tests/testsuite \
-	$(srcdir)/tests/testsuite.patch
+	$(srcdir)/tests/testsuite.patch \
+	tests/test-raft.sh \
+	tests/test-raft2.sh \
+	tests/test-raft3.sh \
+	tests/test-raft4.sh \
+	tests/torture-raft4.sh
 
 COMMON_MACROS_AT = \
 	tests/ovsdb-macros.at \
@@ -86,6 +91,7 @@  TESTSUITE_AT = \
 	tests/ovsdb-idl.at \
 	tests/ovsdb-lock.at \
 	tests/ovsdb-rbac.at \
+	tests/ovsdb-cluster.at \
 	tests/ovs-vsctl.at \
 	tests/ovs-xapi-sync.at \
 	tests/stp.at \
@@ -294,6 +300,10 @@  nodist_tests_test_ovsdb_SOURCES = tests/idltest.c tests/idltest.h
 EXTRA_DIST += tests/uuidfilt.pl tests/ovsdb-monitor-sort.pl
 tests_test_ovsdb_LDADD = ovsdb/libovsdb.la lib/libopenvswitch.la
 
+noinst_PROGRAMS += tests/test-raft
+tests_test_raft_SOURCES = tests/test-raft.c
+tests_test_raft_LDADD = ovsdb/libovsdb.la lib/libopenvswitch.la
+
 noinst_PROGRAMS += tests/test-lib
 tests_test_lib_SOURCES = \
 	tests/test-lib.c
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
new file mode 100644
index 000000000000..84f5de1ab255
--- /dev/null
+++ b/tests/ovsdb-cluster.at
@@ -0,0 +1,78 @@ 
+AT_BANNER([OVSDB -- clustered transactions (1 server)])
+
+# OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS])
+#
+# Creates a clustered database with the given SCHEMA and a single
+# server, starts an ovsdb-server on that database, and runs each of
+# the TRANSACTIONS (which should be a quoted list of quoted strings)
+# against it with ovsdb-client one at a time.
+#
+# Checks that the overall output is OUTPUT, but UUIDs in the output
+# are replaced by markers of the form <N> where N is a number.  The
+# first unique UUID is replaced by <0>, the next by <1>, and so on.
+# If a given UUID appears more than once it is always replaced by the
+# same marker.
+#
+# TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS.
+m4_define([OVSDB_CHECK_EXECUTION],
+  [AT_SETUP([$1])
+   AT_KEYWORDS([ovsdb server positive unix cluster1 $5])
+   $2 > schema
+   AT_CHECK([ovsdb-tool create-cluster db schema unix:cluster], [0], [stdout], [ignore])
+   AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --log-file --remote=punix:socket db], [0], [ignore], [ignore])
+   sleep 2
+   m4_foreach([txn], [$3],
+     [AT_CHECK([ovsdb-client transact unix:socket 'txn'], [0], [stdout], [ignore],
+     [test ! -e pid || kill `cat pid`])
+cat stdout >> output
+])
+   AT_CHECK([${PERL} $srcdir/uuidfilt.pl output], [0], [$4], [ignore],
+            [test ! -e pid || kill `cat pid`])
+   OVSDB_SERVER_SHUTDOWN
+   AT_CLEANUP])
+
+EXECUTION_EXAMPLES
+
+AT_BANNER([OVSDB -- clustered transactions (3 servers)])
+
+# OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS])
+#
+# Creates a clustered database with the given SCHEMA across three
+# servers, starts an ovsdb-server on each of them, and runs each of
+# the TRANSACTIONS (which should be a quoted list of quoted strings)
+# against the cluster with ovsdb-client one at a time.
+#
+# Checks that the overall output is OUTPUT, but UUIDs in the output
+# are replaced by markers of the form <N> where N is a number.  The
+# first unique UUID is replaced by <0>, the next by <1>, and so on.
+# If a given UUID appears more than once it is always replaced by the
+# same marker.
+#
+# TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS.
+m4_define([OVSDB_CHECK_EXECUTION],
+  [AT_SETUP([$1])
+   AT_KEYWORDS([ovsdb server positive unix cluster cluster3 $5])
+   $2 > schema
+   schema=`ovsdb-tool schema-name schema`
+   AT_CHECK([ovsdb-tool create-cluster s1.db schema unix:s1.raft])
+   cid=`ovsdb-tool db-cid s1.db`
+   AT_CHECK([ovsdb-tool join-cluster s2.db $schema unix:s2.raft unix:s1.raft])
+   AT_CHECK([ovsdb-tool join-cluster s3.db $schema unix:s3.raft unix:s1.raft])
+
+   on_exit 'kill `cat *.pid`'
+   for i in 1 2 3; do
+       AT_CHECK([ovsdb-server -vjsonrpc -vconsole:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --remote=punix:s$i.ovsdb s$i.db])
+   done
+   for i in 1 2 3; do
+       AT_CHECK([ovsdb-client wait unix:s$i.ovsdb $schema connected])
+   done
+
+   m4_foreach([txn], [$3],
+     [AT_CHECK([ovsdb-client -vjsonrpc -vconsole:off -vvlog:off --log-file transact unix:s1.ovsdb 'txn'], [0], [stdout])
+cat stdout >> output
+])
+   AT_CHECK([${PERL} $srcdir/uuidfilt.pl output], [0], [$4], [ignore],
+            [test ! -e pid || kill `cat pid`])
+   AT_CLEANUP])
+
+EXECUTION_EXAMPLES
diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at
index a5f75febf949..6500cc87a8d3 100644
--- a/tests/ovsdb-idl.at
+++ b/tests/ovsdb-idl.at
@@ -792,7 +792,7 @@  test-ovsdb|ovsdb_idl|link1 table in idltest database lacks l2 column (database n
 # Check that ovsdb-idl sent on "monitor" request and that it didn't
 # mention that table or column, and (for paranoia) that it did mention another
 # table and column.
-AT_CHECK([grep -c '"monitor\|monitor_cond"' stderr], [0], [1
+AT_CHECK([grep -c '"monitor\|monitor_cond"' stderr], [0], [2
 ])
 AT_CHECK([grep '"monitor\|monitor_cond"' stderr | grep link2], [1])
 AT_CHECK([grep '"monitor\|monitor_cond"' stderr | grep l2], [1])
diff --git a/tests/ovsdb-log.at b/tests/ovsdb-log.at
index d295cedd639f..e3179f708a40 100644
--- a/tests/ovsdb-log.at
+++ b/tests/ovsdb-log.at
@@ -95,7 +95,7 @@  file: read: end of log
 ]], [ignore])
 AT_CHECK(
   [test-ovsdb log-io file read-only], [1], [],
-  [test-ovsdb: ovsdb error: file: bad magic (unexpected kind of file)
+  [test-ovsdb: ovsdb error: file: unexpected file format
 ])
 AT_CHECK([test -f .file.~lock~])
 AT_CLEANUP
diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at
index 280e37fd8525..e0dceed804ea 100644
--- a/tests/ovsdb-server.at
+++ b/tests/ovsdb-server.at
@@ -211,7 +211,7 @@  ovs-appctl: ovsdb-server: server returned an error
 ])
 else
   AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db2], 2, [],
-  [db2: already open
+  [ovsdb error: db2: already open
 ovs-appctl: ovsdb-server: server returned an error
 ])
 fi
@@ -285,7 +285,7 @@  AT_SKIP_IF([test "$IS_WIN32" = "yes"])
 ordinal_schema > schema
 AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore])
 on_exit 'kill `cat *.pid`'
-AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1])
+AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1])
 
 # Add the second database.
 constraint_schema > schema2
@@ -318,7 +318,7 @@  AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore])
 constraint_schema > schema2
 AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore])
 on_exit 'kill `cat *.pid`'
-AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2])
+AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2])
 
 # Remove the second database.
 AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db constraints])
@@ -462,7 +462,7 @@  AT_SKIP_IF([test "$IS_WIN32" = "yes"])
 ordinal_schema > schema
 AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore])
 on_exit 'kill `cat *.pid`'
-AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db])
+AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db])
 
 # Add a remote.
 AT_CHECK([test ! -e socket1])
@@ -493,7 +493,7 @@  AT_SKIP_IF([test "$IS_WIN32" = "yes"])
 ordinal_schema > schema
 AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore])
 on_exit 'kill `cat *.pid`'
-AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db])
+AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db])
 
 # Add a remote.
 AT_CHECK([test ! -e socket1])
@@ -899,9 +899,8 @@  AT_CHECK([ovsdb-client convert new-schema])
 
 dnl Verify that the "ordinals" monitors behaved as they should have.
 dnl Both should have exited, for different reasons.
-dnl The db-aware _Server monitor should still be running, but not the unaware
-dnl one.
-for x in unaware aware; do
+for x in aware unaware; do
+    echo $x
     OVS_WAIT_WHILE([test -e monitor-ordinals-$x.pid])
     AT_CHECK([sort -k 3 monitor-ordinals-$x.stdout | ${PERL} $srcdir/uuidfilt.pl], [0],
 [<0> initial 0 zero
diff --git a/tests/ovsdb.at b/tests/ovsdb.at
index a38abd858272..f109b79b60a1 100644
--- a/tests/ovsdb.at
+++ b/tests/ovsdb.at
@@ -150,3 +150,4 @@  m4_include([tests/ovsdb-monitor.at])
 m4_include([tests/ovsdb-idl.at])
 m4_include([tests/ovsdb-lock.at])
 m4_include([tests/ovsdb-rbac.at])
+m4_include([tests/ovsdb-cluster.at])
diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c
index 1b0ec094c5e1..ef3d4b0aef8d 100644
--- a/tests/test-ovsdb.c
+++ b/tests/test-ovsdb.c
@@ -40,6 +40,7 @@ 
 #include "ovsdb/query.h"
 #include "ovsdb/row.h"
 #include "ovsdb/server.h"
+#include "ovsdb/storage.h"
 #include "ovsdb/table.h"
 #include "ovsdb/transaction.h"
 #include "ovsdb/trigger.h"
@@ -1441,7 +1442,7 @@  do_execute__(struct ovs_cmdl_context *ctx, bool ro)
     json = parse_json(ctx->argv[1]);
     check_ovsdb_error(ovsdb_schema_from_json(json, &schema));
     json_destroy(json);
-    db = ovsdb_create(schema);
+    db = ovsdb_create(schema, ovsdb_storage_create_unbacked());
 
     for (i = 2; i < ctx->argc; i++) {
         struct json *params, *result;
@@ -1507,7 +1508,7 @@  do_trigger(struct ovs_cmdl_context *ctx)
     json = parse_json(ctx->argv[1]);
     check_ovsdb_error(ovsdb_schema_from_json(json, &schema));
     json_destroy(json);
-    db = ovsdb_create(schema);
+    db = ovsdb_create(schema, ovsdb_storage_create_unbacked());
 
     ovsdb_server_init(&server);
     ovsdb_server_add_db(&server, db);
@@ -1570,7 +1571,7 @@  static struct ovsdb_table *do_transact_table;
 static void
 do_transact_commit(struct ovs_cmdl_context *ctx OVS_UNUSED)
 {
-    ovsdb_error_destroy(ovsdb_txn_commit(do_transact_txn, false));
+    ovsdb_error_destroy(ovsdb_txn_replay_commit(do_transact_txn));
     do_transact_txn = NULL;
 }
 
@@ -1737,7 +1738,7 @@  do_transact(struct ovs_cmdl_context *ctx)
                       "       \"j\": {\"type\": \"integer\"}}}}}");
     check_ovsdb_error(ovsdb_schema_from_json(json, &schema));
     json_destroy(json);
-    do_transact_db = ovsdb_create(schema);
+    do_transact_db = ovsdb_create(schema, ovsdb_storage_create_unbacked());
     do_transact_table = ovsdb_get_table(do_transact_db, "mytable");
     ovs_assert(do_transact_table != NULL);
 
diff --git a/tests/test-raft.c b/tests/test-raft.c
new file mode 100644
index 000000000000..0945349fa7ec
--- /dev/null
+++ b/tests/test-raft.c
@@ -0,0 +1,303 @@ 
+/*
+ * Copyright (c) 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <getopt.h>
+#include "ovsdb/raft.h"
+#include <stdio.h>
+#include "command-line.h"
+#include "daemon.h"
+#include "fatal-signal.h"
+#include "openvswitch/json.h"
+#include "ovsdb-error.h"
+#include "poll-loop.h"
+#include "unixctl.h"
+#include "util.h"
+#include "uuid.h"
+#include "openvswitch/vlog.h"
+
+struct execute_command {
+    struct ovs_list list_node;
+    struct raft_command *cmd;
+    struct unixctl_conn *conn;
+};
+
+struct execute_ctx {
+    struct raft *raft;
+    struct ovs_list commands;
+};
+
+OVS_NO_RETURN static void usage(void);
+static void parse_options(int argc, char *argv[], char **unixctl_pathp);
+
+static unixctl_cb_func test_raft_exit;
+static unixctl_cb_func test_raft_execute;
+static unixctl_cb_func test_raft_leave;
+static unixctl_cb_func test_raft_take_leadership;
+static unixctl_cb_func test_raft_transfer_leadership;
+static unixctl_cb_func test_raft_store_snapshot;
+
+static void
+check_ovsdb_error(struct ovsdb_error *error)
+{
+    if (error) {
+        char *s = ovsdb_error_to_string(error);
+        ovsdb_error_destroy(error);
+        ovs_fatal(0, "%s", s);
+    }
+}
+
+int
+main(int argc, char *argv[])
+{
+    char *unixctl_pathp = NULL;
+    set_program_name(argv[0]);
+    service_start(&argc, &argv);
+    fatal_signal_init();
+    parse_options(argc, argv, &unixctl_pathp);
+
+    argc -= optind;
+    argv += optind;
+    if (argc == 0 || argc == 2) {
+        ovs_fatal(0, "either one or more than two non-option arguments "
+                  "required (use --help for help)");
+    }
+
+    daemonize_start(false);
+
+    struct raft *raft;
+    const char *file_name = argv[0];
+    check_ovsdb_error(raft_open(file_name, &raft));
+
+    struct unixctl_server *server;
+    int error = unixctl_server_create(unixctl_pathp, &server);
+    if (error) {
+        ovs_fatal(error, "failed to create unixctl server");
+    }
+
+    bool exiting = false;
+    unixctl_command_register("exit", "", 0, 0, test_raft_exit, &exiting);
+
+    struct execute_ctx ec = { raft, OVS_LIST_INITIALIZER(&ec.commands) };
+    unixctl_command_register("execute", "DATA", 1, 1, test_raft_execute, &ec);
+
+    unixctl_command_register("leave", "", 0, 0, test_raft_leave, raft);
+    unixctl_command_register("take-leadership", "", 0, 0,
+                             test_raft_take_leadership, raft);
+    unixctl_command_register("transfer-leadership", "", 0, 0,
+                             test_raft_transfer_leadership, raft);
+    unixctl_command_register("store-snapshot", "SNAPSHOT", 1, 1,
+                             test_raft_store_snapshot, raft);
+
+    daemonize_complete();
+
+    for (;;) {
+        unixctl_server_run(server);
+
+        raft_run(raft);
+        if (raft_left(raft)) {
+            break;
+        }
+        while (raft_has_next_entry(raft)) {
+            const struct json *entry;
+            struct uuid eid;
+            bool snapshot;
+
+            entry = raft_next_entry(raft, &eid, &snapshot);
+            char *entry_s = json_to_string(entry, JSSF_SORT);
+            if (snapshot) {
+                printf("new snapshot \"%s\"\n", entry_s);
+            } else {
+                printf("applying entry \"%s\"\n", entry_s);
+            }
+            free(entry_s);
+        }
+
+        if (exiting) {
+            break;
+        }
+
+        struct execute_command *c;
+        LIST_FOR_EACH (c, list_node, &ec.commands) {
+            enum raft_command_status status = raft_command_get_status(c->cmd);
+            if (status != RAFT_CMD_INCOMPLETE) {
+                unixctl_command_reply(c->conn,
+                                      raft_command_status_to_string(status));
+                raft_command_unref(c->cmd);
+                ovs_list_remove(&c->list_node);
+                free(c);
+            }
+        }
+
+        unixctl_server_wait(server);
+        raft_wait(raft);
+        LIST_FOR_EACH (c, list_node, &ec.commands) {
+            raft_command_wait(c->cmd);
+        }
+        poll_block();
+    }
+    unixctl_server_destroy(server);
+    raft_close(raft);
+
+    return 0;
+}
+
+static void
+parse_options(int argc, char *argv[], char **unixctl_pathp)
+{
+    enum {
+        OPT_CLUSTER = UCHAR_MAX + 1,
+        OPT_UNIXCTL,
+        DAEMON_OPTION_ENUMS,
+        VLOG_OPTION_ENUMS
+    };
+    static const struct option long_options[] = {
+        {"cluster", required_argument, NULL, OPT_CLUSTER},
+        {"unixctl", required_argument, NULL, OPT_UNIXCTL},
+        {"help", no_argument, NULL, 'h'},
+        DAEMON_LONG_OPTIONS,
+        VLOG_LONG_OPTIONS,
+        {NULL, 0, NULL, 0},
+    };
+    char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
+
+    for (;;) {
+        int c = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (c == -1) {
+            break;
+        }
+
+        switch (c) {
+        case OPT_UNIXCTL:
+            *unixctl_pathp = optarg;
+            break;
+
+        case 'h':
+            usage();
+
+        DAEMON_OPTION_HANDLERS
+        VLOG_OPTION_HANDLERS
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+}
+
+static void
+usage(void)
+{
+    printf("%s: Raft implementation test utility\n"
+           "usage: %s [OPTIONS] LOG\n"
+           "where LOG is the Raft log file.\n",
+           program_name, program_name);
+    daemon_usage();
+    vlog_usage();
+    printf("\nOther options:\n"
+           "  --cluster=UUID          force cluster ID\n"
+           "  --unixctl=SOCKET        override default control socket name\n"
+           "  -h, --help              display this help message\n");
+    exit(EXIT_SUCCESS);
+}
+
+static void
+test_raft_exit(struct unixctl_conn *conn,
+               int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
+               void *exiting_)
+{
+    bool *exiting = exiting_;
+    *exiting = true;
+    unixctl_command_reply(conn, NULL);
+}
+
+static void
+test_raft_execute(struct unixctl_conn *conn,
+                  int argc OVS_UNUSED, const char *argv[],
+                  void *ctx_)
+{
+    struct json *data = json_from_string(argv[1]);
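+    /* json_from_string() reports a parse error as a JSON string node. */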
+    if (data->type == JSON_STRING) {
+        unixctl_command_reply_error(conn, json_string(data));
+    } else {
+        struct execute_ctx *ctx = ctx_;
+        struct execute_command *command = xmalloc(sizeof *command);
+        ovs_list_push_back(&ctx->commands, &command->list_node);
+        command->cmd = raft_command_execute(ctx->raft, data, NULL, NULL);
+        command->conn = conn;
+    }
+    json_destroy(data);
+}
+
+static void
+test_raft_take_leadership(struct unixctl_conn *conn,
+                          int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
+                          void *raft_)
+{
+    struct raft *raft = raft_;
+    raft_take_leadership(raft);
+    unixctl_command_reply(conn, NULL);
+}
+
+static void
+test_raft_transfer_leadership(struct unixctl_conn *conn, int argc OVS_UNUSED,
+                              const char *argv[] OVS_UNUSED, void *raft_)
+{
+    struct raft *raft = raft_;
+    raft_transfer_leadership(raft);
+    unixctl_command_reply(conn, NULL);
+}
+
+static void
+test_raft_store_snapshot(struct unixctl_conn *conn,
+                         int argc OVS_UNUSED, const char *argv[],
+                         void *raft_)
+{
+    struct json *data = json_from_string(argv[1]);
+    if (data->type == JSON_STRING) {
+        unixctl_command_reply_error(conn, json_string(data));
+    } else {
+        struct raft *raft = raft_;
+        char *s = ovsdb_error_to_string_free(raft_store_snapshot(raft, data));
+        if (s) {
+            unixctl_command_reply_error(conn, s);
+            free(s);
+        } else {
+            unixctl_command_reply(conn, NULL);
+        }
+    }
+    json_destroy(data);
+}
+
+static void
+test_raft_leave(struct unixctl_conn *conn,
+                int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
+                void *raft_)
+{
+    struct raft *raft = raft_;
+    if (raft_is_joining(raft)) {
+        unixctl_command_reply_error(conn, "cannot leave while joining");
+    } else if (raft_is_leaving(raft)) {
+        unixctl_command_reply_error(conn, "already leaving");
+    } else {
+        raft_leave(raft);
+        unixctl_command_reply(conn, NULL);
+    }
+}
diff --git a/tests/test-raft.sh b/tests/test-raft.sh
new file mode 100755
index 000000000000..b2f2705416d1
--- /dev/null
+++ b/tests/test-raft.sh
@@ -0,0 +1,13 @@ 
+#! /bin/sh -ex
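+# Manual test: create a three-server cluster over unix sockets and run the
+# tests/test-raft utility on each member in its own xterm.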
+rm -f s1.db s2.db s3.db
+export OVS_RUNDIR=$PWD
+schema=../ovn/ovn-sb.ovsschema
+schema_name=`ovsdb/ovsdb-tool schema-name $schema`
+ovsdb/ovsdb-tool create-cluster s1.db $schema unix:s1.sock
+ovsdb/ovsdb-tool join-cluster s2.db $schema_name unix:s2.sock unix:s1.sock
+ovsdb/ovsdb-tool join-cluster s3.db $schema_name unix:s3.sock unix:s1.sock
+
+xterm -geometry 132x25-0+0 -T 1 -e tests/test-raft s1.db &
+xterm -geometry 132x25-0+350 -T 2 -e tests/test-raft s2.db &
+xterm -geometry 132x25-0+700 -T 3 -e tests/test-raft s3.db &
+wait
diff --git a/tests/test-raft2.sh b/tests/test-raft2.sh
new file mode 100755
index 000000000000..1b63b2395937
--- /dev/null
+++ b/tests/test-raft2.sh
@@ -0,0 +1,12 @@ 
+#! /bin/sh -ex
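+# Manual test: run a single-server cluster under valgrind, stop it when the
+# user presses <Enter>, then restart it from the same database file.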
+rm -f s1.db s2.db s3.db
+export OVS_RUNDIR=$PWD
+schema=../ovn/ovn-sb.ovsschema
+schema_name=`ovsdb/ovsdb-tool schema-name $schema`
+ovsdb/ovsdb-tool create-cluster s1.db $schema tcp:127.0.0.1:6641
+valgrind ovsdb/ovsdb-server s1.db --remote=punix:db.sock &
+read line
+kill $!
+wait
+valgrind ovsdb/ovsdb-server s1.db --remote=punix:db.sock
diff --git a/tests/test-raft3.sh b/tests/test-raft3.sh
new file mode 100755
index 000000000000..5dc938993064
--- /dev/null
+++ b/tests/test-raft3.sh
@@ -0,0 +1,14 @@ 
+#! /bin/sh -ex
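+# Manual test: bring up a three-server cluster, with each ovsdb-server
+# running under valgrind in its own xterm.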
+rm -f s1.db s2.db s3.db
+export OVS_RUNDIR=$PWD
+schema=../ovn/ovn-sb.ovsschema
+schema_name=`ovsdb/ovsdb-tool schema-name $schema`
+ovsdb/ovsdb-tool create-cluster s1.db $schema tcp:127.0.0.1:6641
+ovsdb/ovsdb-tool join-cluster s2.db $schema_name tcp:127.0.0.1:6642 tcp:127.0.0.1:6641
+ovsdb/ovsdb-tool join-cluster s3.db $schema_name tcp:127.0.0.1:6643 tcp:127.0.0.1:6642
+
+VALGRIND=valgrind
+xterm -geometry 132x25-0+0 -T 1 -e $VALGRIND ovsdb/ovsdb-server s1.db --remote=punix:db1.sock &
+xterm -geometry 132x25-0+350 -T 2 -e $VALGRIND ovsdb/ovsdb-server s2.db --remote=punix:db2.sock &
+xterm -geometry 132x25-0+700 -T 3 -e $VALGRIND ovsdb/ovsdb-server s3.db --remote=punix:db3.sock &
+wait
diff --git a/tests/test-raft4.sh b/tests/test-raft4.sh
new file mode 100755
index 000000000000..a50919ae2337
--- /dev/null
+++ b/tests/test-raft4.sh
@@ -0,0 +1,37 @@ 
+#! /bin/sh -x
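+# Manual test of cluster membership changes.  Start a three-server cluster,
+# then on each <Enter>: add a fourth server, make s2 leave the cluster,
+# kill s2, and finally restart s2.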
+rm -f s?.db
+rm -f s?.log
+rm -f s?.valgrind
+export OVS_RUNDIR=$PWD
+schema=ovn/ovn-sb.ovsschema
+schema_name=`ovsdb/ovsdb-tool schema-name $schema`
+
+ovsdb/ovsdb-tool create-cluster s1.db $schema unix:s1.raft
+ovsdb/ovsdb-tool join-cluster s2.db $schema_name unix:s2.raft unix:s1.raft
+ovsdb/ovsdb-tool join-cluster s3.db $schema_name unix:s3.raft unix:s1.raft
+ovsdb/ovsdb-tool join-cluster s4.db $schema_name unix:s4.raft unix:s1.raft
+
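+# Emit a valgrind prefix so server N's valgrind output goes to sN.valgrind.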
+wrapper () {
+    echo "valgrind --log-file=s$1.valgrind"
+}
+
+OPTIONS=-vfile
+xterm -geometry 80x25-0+0 -T 1 -e `wrapper 1` ovsdb/ovsdb-server --log-file=s1.log --pidfile=s1.pid --unixctl=s1 --remote=punix:s1.ovsdb $OPTIONS s1.db &
+xterm -geometry 80x25-0+350 -T 2 -e `wrapper 2` ovsdb/ovsdb-server --log-file=s2.log --pidfile=s2.pid --unixctl=s2 --remote=punix:s2.ovsdb $OPTIONS s2.db &
+xterm -geometry 80x25-0+700 -T 3 -e `wrapper 3` ovsdb/ovsdb-server --log-file=s3.log --pidfile=s3.pid --unixctl=s3 --remote=punix:s3.ovsdb $OPTIONS s3.db &
+
+read line
+
+xterm -geometry 80x25-490+0 -T 4 -e `wrapper 4` ovsdb/ovsdb-server --log-file=s4.log --pidfile=s4.pid --unixctl=s4 --remote=punix:s4.ovsdb $OPTIONS s4.db &
+
+read line
+
+ovs-appctl -t `pwd`/s2 cluster/leave OVN_Southbound
+
+read line
+
+kill `cat s2.pid`
+
+read line
+
+xterm -geometry 80x25-0+350 -T 2 -e ovsdb/ovsdb-server --log-file=s2.log --pidfile=s2.pid --unixctl=s2 --remote=punix:s2.ovsdb s2.db &
diff --git a/tests/torture-raft4.sh b/tests/torture-raft4.sh
new file mode 100755
index 000000000000..16d130d3ed98
--- /dev/null
+++ b/tests/torture-raft4.sh
@@ -0,0 +1,23 @@ 
+#! /bin/sh -ex
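+# Torture test.  Assumes a three-server cluster (s1, s2, s3) is already
+# running in this directory.  Ten parallel clients each add five keys to
+# SB_Global external_ids, s1 is killed partway through, and the test then
+# checks that every key reached the database.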
+
+export OVS_RUNDIR=$PWD
+export OVN_SB_DB=unix:s1.ovsdb,unix:s2.ovsdb,unix:s3.ovsdb
+PATH=$PATH:$PWD/ovn/utilities
+for i in `seq 0 9`; do
+    for j in `seq 5`; do
+	echo "$i-$j=$i-$j"
+    done
+done > expected
+rm -f ?-?.log
+for i in `seq 0 9`; do
+    (for j in `seq 5`; do
+	 ovn-sbctl --log-file=$i-$j.log -vfile add SB_Global . external_ids $i-$j=$i-$j
+     done)&
+done
+sleep 2
+kill `cat s1.pid` || true
+wait
+ovn-sbctl --bare get SB_Global . external-ids | sed 's/, /\n/g; s/[{}"]//g;' > output
+if diff -u expected output; then
+    echo "success"
+fi
diff --git a/tutorial/ovs-sandbox b/tutorial/ovs-sandbox
index 258ea9b87db6..a070394d3395 100755
--- a/tutorial/ovs-sandbox
+++ b/tutorial/ovs-sandbox
@@ -393,7 +393,7 @@  if test ! -e "$sandbox"/db.sock; then
 fi
 
 # Initialize database.
-run ovs-vsctl --no-wait -- init
+run ovs-vsctl -vjsonrpc -vovsdb_idl --no-wait -- init
 
 # Start ovs-vswitchd.
 rungdb $gdb_vswitchd $gdb_vswitchd_ex ovs-vswitchd --detach --no-chdir --pidfile -vconsole:off --log-file \