From patchwork Mon Jan 1 05:16:26 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854284 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z955y0bfSz9t84 for ; Mon, 1 Jan 2018 16:16:53 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 8AFBABC6; Mon, 1 Jan 2018 05:16:50 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 67A40BC3 for ; Mon, 1 Jan 2018 05:16:49 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 6B70114B for ; Mon, 1 Jan 2018 05:16:48 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id CE4FDC5A44; Mon, 1 Jan 2018 06:16:45 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:26 -0800 Message-Id: <20180101051640.13043-1-blp@ovn.org> X-Mailer: git-send-email 2.10.2 X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, 
RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 01/15] log: Add async commit support. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org The OVSDB log code has always had the ability to commit the log to disk and wait for the commit to finish. This patch introduces a new feature that allows the client to start a commit in the background and then to determine asynchronously that the commit has completed. This will be especially useful later for the distributed database feature. Signed-off-by: Ben Pfaff Reviewed-by: Yifeng Sun Reviewed-by: Yifeng Sun --- ovsdb/file.c | 4 +- ovsdb/log.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++-- ovsdb/log.h | 7 ++- ovsdb/ovsdb-tool.c | 2 +- tests/test-ovsdb.c | 2 +- 5 files changed, 158 insertions(+), 9 deletions(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index 90c2b9d20a9a..fdd5f8e35a44 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -661,7 +661,7 @@ ovsdb_file_compact(struct ovsdb_file *file) /* Commit the old version, so that we can be assured that we'll eventually * have either the old or the new version. 
*/ - error = ovsdb_log_commit(file->log); + error = ovsdb_log_commit_block(file->log); if (error) { goto exit; } @@ -857,7 +857,7 @@ ovsdb_file_txn_commit(struct json *json, const char *comment, } if (durable) { - error = ovsdb_log_commit(log); + error = ovsdb_log_commit_block(log); if (error) { return ovsdb_wrap_error(error, "committing transaction failed"); } diff --git a/ovsdb/log.c b/ovsdb/log.c index 0f8dafa30a8f..cc4bc2c6243e 100644 --- a/ovsdb/log.c +++ b/ovsdb/log.c @@ -24,12 +24,17 @@ #include #include +#include "lockfile.h" #include "openvswitch/dynamic-string.h" #include "openvswitch/json.h" #include "openvswitch/vlog.h" -#include "lockfile.h" -#include "ovsdb.h" +#include "ovs-atomic.h" +#include "ovs-rcu.h" +#include "ovs-thread.h" #include "ovsdb-error.h" +#include "ovsdb.h" +#include "openvswitch/poll-loop.h" +#include "seq.h" #include "sha1.h" #include "socket-util.h" #include "transaction.h" @@ -78,6 +83,7 @@ struct ovsdb_log { struct lockfile *lockfile; FILE *stream; off_t base; + struct afsync *afsync; }; /* Whether the OS supports renaming open files. @@ -95,6 +101,9 @@ static bool parse_header(char *header, const char **magicp, uint8_t sha1[SHA1_DIGEST_SIZE]); static bool is_magic_ok(const char *needle, const char *haystack); +static struct afsync *afsync_create(int fd, uint64_t initial_ticket); +static uint64_t afsync_destroy(struct afsync *); + /* Attempts to open 'name' with the specified 'open_mode'. On success, stores * the new log into '*filep' and returns NULL; otherwise returns NULL and * stores NULL into '*filep'. 
@@ -269,6 +278,7 @@ ovsdb_log_open(const char *name, const char *magic, file->prev_offset = 0; file->offset = 0; file->base = 0; + file->afsync = NULL; *filep = file; return NULL; @@ -308,6 +318,7 @@ ovsdb_log_close(struct ovsdb_log *file) { if (file) { ovsdb_error_destroy(file->error); + afsync_destroy(file->afsync); free(file->name); free(file->display_name); free(file->magic); @@ -634,8 +645,10 @@ ovsdb_log_write(struct ovsdb_log *file, const struct json *json) return NULL; } +/* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail. + * Returns NULL if successful, otherwise the error that occurred. */ struct ovsdb_error * -ovsdb_log_commit(struct ovsdb_log *file) +ovsdb_log_commit_block(struct ovsdb_log *file) { if (file->stream && fsync(fileno(file->stream))) { return ovsdb_io_error(errno, "%s: fsync failed", file->display_name); @@ -740,7 +753,7 @@ ovsdb_rename(const char *old, const char *new) struct ovsdb_error * OVS_WARN_UNUSED_RESULT ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new) { - struct ovsdb_error *error = ovsdb_log_commit(new); + struct ovsdb_error *error = ovsdb_log_commit_block(new); if (error) { ovsdb_log_replace_abort(new); return error; @@ -812,6 +825,10 @@ ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new) ovsdb_error_destroy(old->error); old->error = NULL; /* prev_offset only matters for OVSDB_LOG_READ. */ + if (old->afsync) { + uint64_t ticket = afsync_destroy(old->afsync); + old->afsync = afsync_create(fileno(old->stream), ticket + 1); + } old->offset = new->offset; /* Keep old->name. 
*/ free(old->magic); @@ -844,3 +861,130 @@ ovsdb_log_disable_renaming_open_files(void) { rename_open_files = false; } + +struct afsync { + pthread_t thread; + atomic_uint64_t cur, next; + struct seq *request, *complete; + int fd; +}; + +static void * +afsync_thread(void *afsync_) +{ + struct afsync *afsync = afsync_; + uint64_t cur = 0; + for (;;) { + ovsrcu_quiesce_start(); + + uint64_t request_seq = seq_read(afsync->request); + + uint64_t next; + atomic_read_explicit(&afsync->next, &next, memory_order_acquire); + if (next == UINT64_MAX) { + break; + } + + if (cur != next && afsync->fd != -1) { + int error = fsync(afsync->fd) ? errno : 0; + if (!error) { + cur = next; + atomic_store_explicit(&afsync->cur, cur, memory_order_release); + seq_change(afsync->complete); + } else { + VLOG_WARN("fsync failed (%s)", ovs_strerror(error)); + } + } + + seq_wait(afsync->request, request_seq); + poll_block(); + } + return NULL; +} + +static struct afsync * +afsync_create(int fd, uint64_t initial_ticket) +{ + struct afsync *afsync = xzalloc(sizeof *afsync); + atomic_init(&afsync->cur, initial_ticket); + atomic_init(&afsync->next, initial_ticket); + afsync->request = seq_create(); + afsync->complete = seq_create(); + afsync->thread = ovs_thread_create("log_fsync", afsync_thread, afsync); + afsync->fd = fd; + return afsync; +} + +static uint64_t +afsync_destroy(struct afsync *afsync) +{ + if (!afsync) { + return 0; + } + + uint64_t next; + atomic_read(&afsync->next, &next); + atomic_store(&afsync->next, UINT64_MAX); + seq_change(afsync->request); + xpthread_join(afsync->thread, NULL); + + seq_destroy(afsync->request); + seq_destroy(afsync->complete); + + free(afsync); + + return next; +} + +static struct afsync * +ovsdb_log_get_afsync(struct ovsdb_log *log) +{ + if (!log->afsync) { + log->afsync = afsync_create(log->stream ? fileno(log->stream) : -1, 0); + } + return log->afsync; +} + +/* Starts committing 'log' to disk. 
Returns a ticket that can be passed to + * ovsdb_log_commit_wait() or compared against the return value of + * ovsdb_log_commit_progress() later. */ +uint64_t +ovsdb_log_commit_start(struct ovsdb_log *log) +{ + struct afsync *afsync = ovsdb_log_get_afsync(log); + + uint64_t orig; + atomic_add_explicit(&afsync->next, 1, &orig, memory_order_acq_rel); + + seq_change(afsync->request); + + return orig + 1; +} + +/* Returns a ticket value that represents the current progress of commits to + * 'log'. Suppose that some call to ovsdb_log_commit_start() returns X and any + * call to ovsdb_log_commit_progress() returns Y, for the same 'log'. Then commit + * X is complete if and only if X <= Y. */ +uint64_t +ovsdb_log_commit_progress(struct ovsdb_log *log) +{ + struct afsync *afsync = ovsdb_log_get_afsync(log); + uint64_t cur; + atomic_read_explicit(&afsync->cur, &cur, memory_order_acquire); + return cur; +} + +/* Causes poll_block() to wake up if and when ovsdb_log_commit_progress(log) + * would return at least 'goal'. */ +void +ovsdb_log_commit_wait(struct ovsdb_log *log, uint64_t goal) +{ + struct afsync *afsync = ovsdb_log_get_afsync(log); + uint64_t complete = seq_read(afsync->complete); + uint64_t cur = ovsdb_log_commit_progress(log); + if (cur < goal) { + seq_wait(afsync->complete, complete); + } else { + poll_immediate_wake(); + } +} diff --git a/ovsdb/log.h b/ovsdb/log.h index 18900fa50f44..bd0396f27ea8 100644 --- a/ovsdb/log.h +++ b/ovsdb/log.h @@ -35,6 +35,7 @@ * that compacting is advised. 
*/ +#include #include #include "compiler.h" @@ -70,7 +71,11 @@ void ovsdb_log_compose_record(const struct json *, const char *magic, struct ovsdb_error *ovsdb_log_write(struct ovsdb_log *, const struct json *) OVS_WARN_UNUSED_RESULT; -struct ovsdb_error *ovsdb_log_commit(struct ovsdb_log *) + +uint64_t ovsdb_log_commit_start(struct ovsdb_log *); +uint64_t ovsdb_log_commit_progress(struct ovsdb_log *); +void ovsdb_log_commit_wait(struct ovsdb_log *, uint64_t); +struct ovsdb_error *ovsdb_log_commit_block(struct ovsdb_log *) OVS_WARN_UNUSED_RESULT; void ovsdb_log_mark_base(struct ovsdb_log *); diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index 4343e3ce5b22..cec64152f079 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -222,7 +222,7 @@ do_create(struct ovs_cmdl_context *ctx) check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC, OVSDB_LOG_CREATE_EXCL, -1, &log)); check_ovsdb_error(ovsdb_log_write(log, json)); - check_ovsdb_error(ovsdb_log_commit(log)); + check_ovsdb_error(ovsdb_log_commit_block(log)); ovsdb_log_close(log); json_destroy(json); diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index 6b2cde863aba..c0c5a4df51af 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -380,7 +380,7 @@ do_log_io(struct ovs_cmdl_context *ctx) error = ovsdb_log_write(target, json); json_destroy(json); } else if (!strcmp(command, "commit")) { - error = ovsdb_log_commit(target); + error = ovsdb_log_commit_block(target); } else if (!strcmp(command, "replace_start")) { ovs_assert(!replacement); error = ovsdb_log_replace_start(log, &replacement); From patchwork Mon Jan 1 05:16:27 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854285 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) 
smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z956W2y3Mz9t84 for ; Mon, 1 Jan 2018 16:17:23 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id A5D3CC04; Mon, 1 Jan 2018 05:16:52 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 58D49BE6 for ; Mon, 1 Jan 2018 05:16:51 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 3F76914B for ; Mon, 1 Jan 2018 05:16:50 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 9F26FC5A51; Mon, 1 Jan 2018 06:16:47 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:27 -0800 Message-Id: <20180101051640.13043-2-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 02/15] reconnect: Add ability to do a number of retries without backoff. 
X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org This is aimed at an upcoming database clustering implementation, where it's desirable to try all of the cluster members quickly before backing off to retry them again in sequence. Signed-off-by: Ben Pfaff Acked-by: Russell Bryant Acked-by: Justin Pettit --- lib/reconnect.c | 52 ++++++++++++++++++++++++++++++--------------- lib/reconnect.h | 3 +++ python/ovs/reconnect.py | 53 ++++++++++++++++++++++++++++++---------------- tests/reconnect.at | 56 ++++++++++++++++++++++++++++++++++++++++++++++++- tests/test-reconnect.c | 8 +++++++ tests/test-reconnect.py | 5 +++++ 6 files changed, 141 insertions(+), 36 deletions(-) diff --git a/lib/reconnect.c b/lib/reconnect.c index 04cb15b7ce8c..a61234e20f90 100644 --- a/lib/reconnect.c +++ b/lib/reconnect.c @@ -62,6 +62,7 @@ struct reconnect { long long int last_connected; long long int last_disconnected; unsigned int max_tries; + unsigned int backoff_free_tries; /* These values are simply for statistics reporting, not otherwise used * directly by anything internal. */ @@ -206,6 +207,15 @@ reconnect_get_max_tries(struct reconnect *fsm) return fsm->max_tries; } +/* Sets the number of connection attempts that will be made without backoff to + * 'backoff_free_tries'. Values 0 and 1 both represent a single attempt. */ +void +reconnect_set_backoff_free_tries(struct reconnect *fsm, + unsigned int backoff_free_tries) +{ + fsm->backoff_free_tries = backoff_free_tries; +} + /* Configures the backoff parameters for 'fsm'. 'min_backoff' is the minimum * number of milliseconds, and 'max_backoff' is the maximum, between connection * attempts. 
The current backoff is also the duration that 'fsm' is willing to @@ -346,7 +356,7 @@ reconnect_disconnected(struct reconnect *fsm, long long int now, int error) VLOG(fsm->info, "%s: error listening for connections", fsm->name); } - } else { + } else if (fsm->backoff < fsm->max_backoff) { const char *type = fsm->passive ? "listen" : "connection"; if (error > 0) { VLOG_INFO("%s: %s attempt failed (%s)", @@ -359,30 +369,38 @@ reconnect_disconnected(struct reconnect *fsm, long long int now, int error) if (fsm->state & (S_ACTIVE | S_IDLE)) { fsm->last_disconnected = now; } + + if (!reconnect_may_retry(fsm)) { + reconnect_transition__(fsm, now, S_VOID); + return; + } + /* Back off. */ - if (fsm->state & (S_ACTIVE | S_IDLE) - && (fsm->last_activity - fsm->last_connected >= fsm->backoff - || fsm->passive)) { + if (fsm->backoff_free_tries > 1) { + fsm->backoff_free_tries--; + fsm->backoff = 0; + } else if (fsm->state & (S_ACTIVE | S_IDLE) + && (fsm->last_activity - fsm->last_connected >= fsm->backoff + || fsm->passive)) { fsm->backoff = fsm->passive ? 0 : fsm->min_backoff; } else { if (fsm->backoff < fsm->min_backoff) { fsm->backoff = fsm->min_backoff; - } else if (fsm->backoff >= fsm->max_backoff / 2) { - fsm->backoff = fsm->max_backoff; - } else { + } else if (fsm->backoff < fsm->max_backoff / 2) { fsm->backoff *= 2; - } - if (fsm->passive) { - VLOG(fsm->info, "%s: waiting %.3g seconds before trying to " - "listen again", fsm->name, fsm->backoff / 1000.0); + VLOG(fsm->info, "%s: waiting %.3g seconds before %s", + fsm->name, fsm->backoff / 1000.0, + fsm->passive ? "trying to listen again" : "reconnect"); } else { - VLOG(fsm->info, "%s: waiting %.3g seconds before reconnect", - fsm->name, fsm->backoff / 1000.0); + if (fsm->backoff < fsm->max_backoff) { + VLOG_INFO("%s: continuing to %s in the background but " + "suppressing further logging", fsm->name, + fsm->passive ? 
"try to listen" : "reconnect"); + } + fsm->backoff = fsm->max_backoff; } } - - reconnect_transition__(fsm, now, - reconnect_may_retry(fsm) ? S_BACKOFF : S_VOID); + reconnect_transition__(fsm, now, S_BACKOFF); } } @@ -397,7 +415,7 @@ reconnect_connecting(struct reconnect *fsm, long long int now) if (fsm->state != S_CONNECTING) { if (fsm->passive) { VLOG(fsm->info, "%s: listening...", fsm->name); - } else { + } else if (fsm->backoff < fsm->max_backoff) { VLOG(fsm->info, "%s: connecting...", fsm->name); } reconnect_transition__(fsm, now, S_CONNECTING); diff --git a/lib/reconnect.h b/lib/reconnect.h index 4446713ce873..9f2d469e2ddd 100644 --- a/lib/reconnect.h +++ b/lib/reconnect.h @@ -51,6 +51,8 @@ int reconnect_get_probe_interval(const struct reconnect *); void reconnect_set_max_tries(struct reconnect *, unsigned int max_tries); unsigned int reconnect_get_max_tries(struct reconnect *); +void reconnect_set_backoff_free_tries(struct reconnect *, + unsigned int backoff_free_tries); void reconnect_set_backoff(struct reconnect *, int min_backoff, int max_backoff); @@ -65,6 +67,7 @@ void reconnect_enable(struct reconnect *, long long int now); void reconnect_disable(struct reconnect *, long long int now); void reconnect_force_reconnect(struct reconnect *, long long int now); +void reconnect_skip_backoff(struct reconnect *); bool reconnect_is_connected(const struct reconnect *); unsigned int reconnect_get_last_connect_elapsed(const struct reconnect *, diff --git a/python/ovs/reconnect.py b/python/ovs/reconnect.py index ec52ebb7affc..34cc76987031 100644 --- a/python/ovs/reconnect.py +++ b/python/ovs/reconnect.py @@ -154,6 +154,7 @@ class Reconnect(object): self.last_connected = None self.last_disconnected = None self.max_tries = None + self.backoff_free_tries = 0 self.creation_time = now self.n_attempted_connections = 0 @@ -242,6 +243,12 @@ class Reconnect(object): self.backoff > self.max_backoff): self.backoff = self.max_backoff + def set_backoff_free_tries(self, 
backoff_free_tries): + """Sets the number of connection attempts that will be made without + backoff to 'backoff_free_tries'. Values 0 and 1 both + represent a single attempt.""" + self.backoff_free_tries = backoff_free_tries + def set_probe_interval(self, probe_interval): """Sets the "probe interval" to 'probe_interval', in milliseconds. If this is zero, it disables the connection keepalive feature. If it is @@ -337,7 +344,7 @@ class Reconnect(object): else: self.info_level("%s: error listening for connections" % self.name) - else: + elif self.backoff < self.max_backoff: if self.passive: type_ = "listen" else: @@ -352,8 +359,15 @@ class Reconnect(object): if (self.state in (Reconnect.Active, Reconnect.Idle)): self.last_disconnected = now + if not self.__may_retry(): + self._transition(now, Reconnect.Void) + return + # Back off - if (self.state in (Reconnect.Active, Reconnect.Idle) and + if self.backoff_free_tries > 1: + self.backoff_free_tries -= 1 + self.backoff = 0 + elif (self.state in (Reconnect.Active, Reconnect.Idle) and (self.last_activity - self.last_connected >= self.backoff or self.passive)): if self.passive: @@ -363,23 +377,26 @@ class Reconnect(object): else: if self.backoff < self.min_backoff: self.backoff = self.min_backoff - elif self.backoff >= self.max_backoff / 2: - self.backoff = self.max_backoff - else: + elif self.backoff < self.max_backoff / 2: self.backoff *= 2 - - if self.passive: - self.info_level("%s: waiting %.3g seconds before trying " - "to listen again" - % (self.name, self.backoff / 1000.0)) + if self.passive: + action = "trying to listen again" + else: + action = "reconnect" + self.info_level("%s: waiting %.3g seconds before %s" + % (self.name, self.backoff / 1000.0, + action)) else: - self.info_level("%s: waiting %.3g seconds before reconnect" - % (self.name, self.backoff / 1000.0)) - - if self.__may_retry(): - self._transition(now, Reconnect.Backoff) - else: - self._transition(now, Reconnect.Void) + if self.backoff < 
self.max_backoff: + if self.passive: + action = "try to listen" + else: + action = "reconnect" + self.info_level("%s: continuing to %s in the " + "background but suppressing further " + "logging" % (self.name, action)) + self.backoff = self.max_backoff + self._transition(now, Reconnect.Backoff) def connecting(self, now): """Tell this FSM that a connection or listening attempt is in progress. @@ -390,7 +407,7 @@ class Reconnect(object): if self.state != Reconnect.ConnectInProgress: if self.passive: self.info_level("%s: listening..." % self.name) - else: + elif self.backoff < self.max_backoff: self.info_level("%s: connecting..." % self.name) self._transition(now, Reconnect.ConnectInProgress) diff --git a/tests/reconnect.at b/tests/reconnect.at index c88ca785cad2..59c95d95bdd3 100644 --- a/tests/reconnect.at +++ b/tests/reconnect.at @@ -1037,6 +1037,60 @@ timeout ]) ###################################################################### +RECONNECT_CHECK([backoff-free tries work], + [set-backoff-free-tries 2 +enable + +# Connection fails quickly. +run +connect-failed ECONNREFUSED + +# No backoff. +run +timeout + +# Connection fails quickly again. +run +connect-failed ECONNREFUSED + +# Back off for 1000 ms. +run +timeout +], + [### t=1000 ### +set-backoff-free-tries 2 +enable + in BACKOFF for 0 ms (0 ms backoff) + +# Connection fails quickly. +run + should connect +connect-failed ECONNREFUSED + 0 successful connections out of 1 attempts, seqno 0 + +# No backoff. +run + should connect +timeout + advance 0 ms + +# Connection fails quickly again. +run + should connect +connect-failed ECONNREFUSED + in BACKOFF for 0 ms (1000 ms backoff) + 0 successful connections out of 2 attempts, seqno 0 + +# Back off for 1000 ms. 
+run +timeout + advance 1000 ms + +### t=2000 ### + in BACKOFF for 1000 ms (1000 ms backoff) +]) + +###################################################################### RECONNECT_CHECK([max-tries of 1 honored], [set-max-tries 1 enable @@ -1090,7 +1144,7 @@ timeout run should disconnect disconnected - in VOID for 0 ms (1000 ms backoff) + in VOID for 0 ms (0 ms backoff) 1 successful connections out of 1 attempts, seqno 2 disconnected disconnected at 11000 ms (0 ms ago) diff --git a/tests/test-reconnect.c b/tests/test-reconnect.c index 72252b8f707b..5a14e7fe58da 100644 --- a/tests/test-reconnect.c +++ b/tests/test-reconnect.c @@ -208,6 +208,12 @@ do_set_max_tries(struct ovs_cmdl_context *ctx) } static void +do_set_backoff_free_tries(struct ovs_cmdl_context *ctx) +{ + reconnect_set_backoff_free_tries(reconnect, atoi(ctx->argv[1])); +} + +static void diff_stats(const struct reconnect_stats *old, const struct reconnect_stats *new, int delta) @@ -284,6 +290,8 @@ static const struct ovs_cmdl_command all_commands[] = { { "advance", NULL, 1, 1, do_advance, OVS_RO }, { "timeout", NULL, 0, 0, do_timeout, OVS_RO }, { "set-max-tries", NULL, 1, 1, do_set_max_tries, OVS_RO }, + { "set-backoff-free-tries", NULL, 1, 1, do_set_backoff_free_tries, + OVS_RO }, { "passive", NULL, 0, 0, do_set_passive, OVS_RO }, { "listening", NULL, 0, 0, do_listening, OVS_RO }, { "listen-error", NULL, 1, 1, do_listen_error, OVS_RO }, diff --git a/tests/test-reconnect.py b/tests/test-reconnect.py index 8132fd9258ef..6cd052878eb1 100644 --- a/tests/test-reconnect.py +++ b/tests/test-reconnect.py @@ -104,6 +104,10 @@ def do_set_max_tries(arg): r.set_max_tries(int(arg)) +def do_set_backoff_free_tries(arg): + r.set_backoff_free_tries(int(arg)) + + def diff_stats(old, new, delta): if (old.state != new.state or old.state_elapsed != new.state_elapsed or @@ -173,6 +177,7 @@ def main(): "advance": do_advance, "timeout": do_timeout, "set-max-tries": do_set_max_tries, + "set-backoff-free-tries": 
do_set_backoff_free_tries, "passive": do_set_passive, "listening": do_listening, "listen-error": do_listen_error From patchwork Mon Jan 1 05:16:28 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854286 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z957707H9z9t84 for ; Mon, 1 Jan 2018 16:17:55 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 8B214C21; Mon, 1 Jan 2018 05:16:53 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 80C52BC5 for ; Mon, 1 Jan 2018 05:16:52 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id CB13E14B for ; Mon, 1 Jan 2018 05:16:51 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 7F863C5A46; Mon, 1 Jan 2018 06:16:49 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:28 -0800 Message-Id: 
<20180101051640.13043-3-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 03/15] jsonrpc: Allow jsonrpc_session to have more than one remote. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org The implementation cycles through the remotes in random order. This allows clients to perform some load balancing across alternative implementations of a service. Signed-off-by: Ben Pfaff Acked-by: Russell Bryant Acked-by: Justin Pettit --- lib/jsonrpc.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++----- lib/jsonrpc.h | 6 +++++- lib/svec.c | 18 ++++++++++++++++++ lib/svec.h | 1 + 4 files changed, 72 insertions(+), 6 deletions(-) diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c index 87ca1aa8690c..a8e5bc8434ad 100644 --- a/lib/jsonrpc.c +++ b/lib/jsonrpc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc. + * Copyright (c) 2009-2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,8 +28,10 @@ #include "openvswitch/ofpbuf.h" #include "ovs-thread.h" #include "openvswitch/poll-loop.h" +#include "random.h" #include "reconnect.h" #include "stream.h" +#include "svec.h" #include "timeval.h" #include "openvswitch/vlog.h" @@ -753,6 +755,9 @@ jsonrpc_msg_to_json(struct jsonrpc_msg *m) /* A JSON-RPC session with reconnection. 
*/ struct jsonrpc_session { + struct svec remotes; + size_t next_remote; + struct reconnect *reconnect; struct jsonrpc *rpc; struct stream *stream; @@ -762,6 +767,13 @@ struct jsonrpc_session { uint8_t dscp; }; +static void +jsonrpc_session_pick_remote(struct jsonrpc_session *s) +{ + reconnect_set_name(s->reconnect, + s->remotes.names[s->next_remote++ % s->remotes.n]); +} + /* Creates and returns a jsonrpc_session to 'name', which should be a string * acceptable to stream_open() or pstream_open(). * @@ -779,12 +791,27 @@ struct jsonrpc_session { struct jsonrpc_session * jsonrpc_session_open(const char *name, bool retry) { + const struct svec remotes = { .names = (char **) &name, .n = 1 }; + return jsonrpc_session_open_multiple(&remotes, retry); +} + +struct jsonrpc_session * +jsonrpc_session_open_multiple(const struct svec *remotes, bool retry) +{ struct jsonrpc_session *s; s = xmalloc(sizeof *s); + + /* Set 'n' remotes from 'names', shuffling them into random order. */ + ovs_assert(remotes->n > 0); + svec_clone(&s->remotes, remotes); + svec_shuffle(&s->remotes); + s->next_remote = 0; + s->reconnect = reconnect_create(time_msec()); - reconnect_set_name(s->reconnect, name); + jsonrpc_session_pick_remote(s); reconnect_enable(s->reconnect, time_msec()); + reconnect_set_backoff_free_tries(s->reconnect, remotes->n); s->rpc = NULL; s->stream = NULL; s->pstream = NULL; @@ -792,10 +819,11 @@ jsonrpc_session_open(const char *name, bool retry) s->dscp = 0; s->last_error = 0; + const char *name = reconnect_get_name(s->reconnect); if (!pstream_verify_name(name)) { reconnect_set_passive(s->reconnect, true, time_msec()); } else if (!retry) { - reconnect_set_max_tries(s->reconnect, 1); + reconnect_set_max_tries(s->reconnect, remotes->n); reconnect_set_backoff(s->reconnect, INT_MAX, INT_MAX); } @@ -817,6 +845,9 @@ jsonrpc_session_open_unreliably(struct jsonrpc *jsonrpc, uint8_t dscp) struct jsonrpc_session *s; s = xmalloc(sizeof *s); + svec_init(&s->remotes); + 
svec_add(&s->remotes, jsonrpc_get_name(jsonrpc)); + s->next_remote = 0; s->reconnect = reconnect_create(time_msec()); reconnect_set_quiet(s->reconnect, true); reconnect_set_name(s->reconnect, jsonrpc_get_name(jsonrpc)); @@ -839,6 +870,7 @@ jsonrpc_session_close(struct jsonrpc_session *s) reconnect_destroy(s->reconnect); stream_close(s->stream); pstream_close(s->pstream); + svec_destroy(&s->remotes); free(s); } } @@ -850,12 +882,15 @@ jsonrpc_session_disconnect(struct jsonrpc_session *s) jsonrpc_error(s->rpc, EOF); jsonrpc_close(s->rpc); s->rpc = NULL; - s->seqno++; } else if (s->stream) { stream_close(s->stream); s->stream = NULL; - s->seqno++; + } else { + return; } + + s->seqno++; + jsonrpc_session_pick_remote(s); } static void @@ -882,6 +917,7 @@ jsonrpc_session_connect(struct jsonrpc_session *s) if (error) { reconnect_connect_failed(s->reconnect, time_msec(), error); + jsonrpc_session_pick_remote(s); } } @@ -946,6 +982,7 @@ jsonrpc_session_run(struct jsonrpc_session *s) s->seqno++; } else if (error != EAGAIN) { reconnect_connect_failed(s->reconnect, time_msec(), error); + jsonrpc_session_pick_remote(s); stream_close(s->stream); s->stream = NULL; s->last_error = error; @@ -1016,6 +1053,12 @@ jsonrpc_session_get_id(const struct jsonrpc_session *s) } } +size_t +jsonrpc_session_get_n_remotes(const struct jsonrpc_session *s) +{ + return s->remotes.n; +} + /* Always takes ownership of 'msg', regardless of success. */ int jsonrpc_session_send(struct jsonrpc_session *s, struct jsonrpc_msg *msg) diff --git a/lib/jsonrpc.h b/lib/jsonrpc.h index 9b4fb0e51374..969a6ed38cd6 100644 --- a/lib/jsonrpc.h +++ b/lib/jsonrpc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010, 2012, 2013 Nicira, Inc. + * Copyright (c) 2009, 2010, 2012, 2013, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ struct jsonrpc_msg; struct pstream; struct reconnect_stats; struct stream; +struct svec; /* API for a JSON-RPC stream. */ @@ -99,6 +100,8 @@ struct json *jsonrpc_msg_to_json(struct jsonrpc_msg *); /* A JSON-RPC session with reconnection. */ struct jsonrpc_session *jsonrpc_session_open(const char *name, bool retry); +struct jsonrpc_session *jsonrpc_session_open_multiple(const struct svec *, + bool retry); struct jsonrpc_session *jsonrpc_session_open_unreliably(struct jsonrpc *, uint8_t); void jsonrpc_session_close(struct jsonrpc_session *); @@ -108,6 +111,7 @@ void jsonrpc_session_wait(struct jsonrpc_session *); size_t jsonrpc_session_get_backlog(const struct jsonrpc_session *); const char *jsonrpc_session_get_name(const struct jsonrpc_session *); +size_t jsonrpc_session_get_n_remotes(const struct jsonrpc_session *); int jsonrpc_session_send(struct jsonrpc_session *, struct jsonrpc_msg *); struct jsonrpc_msg *jsonrpc_session_recv(struct jsonrpc_session *); diff --git a/lib/svec.c b/lib/svec.c index 297a60ce14f9..c1b986bab108 100644 --- a/lib/svec.c +++ b/lib/svec.c @@ -20,6 +20,7 @@ #include #include #include "openvswitch/dynamic-string.h" +#include "random.h" #include "util.h" #include "openvswitch/vlog.h" @@ -174,6 +175,23 @@ svec_compact(struct svec *svec) svec->n = j; } +static void +swap_strings(char **a, char **b) +{ + char *tmp = *a; + *a = *b; + *b = tmp; +} + +void +svec_shuffle(struct svec *svec) +{ + for (size_t i = 0; i < svec->n; i++) { + size_t j = i + random_range(svec->n - i); + swap_strings(&svec->names[i], &svec->names[j]); + } +} + void svec_diff(const struct svec *a, const struct svec *b, struct svec *a_only, struct svec *both, struct svec *b_only) diff --git a/lib/svec.h b/lib/svec.h index 341e26989801..b4e1343a9069 100644 --- a/lib/svec.h +++ b/lib/svec.h @@ -46,6 +46,7 @@ void svec_sort(struct svec *); void svec_sort_unique(struct svec *); void svec_unique(struct svec *); void svec_compact(struct svec *); +void 
svec_shuffle(struct svec *); void svec_diff(const struct svec *a, const struct svec *b, struct svec *a_only, struct svec *both, struct svec *b_only); bool svec_contains(const struct svec *, const char *); From patchwork Mon Jan 1 05:16:29 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854287 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z957s4hxPz9t84 for ; Mon, 1 Jan 2018 16:18:33 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id B4CD2C7F; Mon, 1 Jan 2018 05:16:55 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id F1278C48 for ; Mon, 1 Jan 2018 05:16:53 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 3D35314D for ; Mon, 1 Jan 2018 05:16:53 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 223BBC5A53; Mon, 1 Jan 2018 06:16:50 +0100 (CET) From: Ben 
Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:29 -0800 Message-Id: <20180101051640.13043-4-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 04/15] ovsdb-server: Distinguish logs from other replicas. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Until now, ovsdb-server has internally chained a list of replicas from each database. Whenever ovsdb_txn_commit() commits a transaction, it passes the transaction to each replica. The first replica, which is always the disk file that stores the database, is special because it is the only replica that can report an error and thereby abort the transaction. This is a very special property that genuinely distinguishes this first replica from the others on the chain. This commit breaks that first replica out as a separate kind of entity that is not on the list of replicas. When later commits add support for clustering, there will only be more and more special cases for the "first replica", so it makes sense to distinguish it this way. 
Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- ovsdb/file.c | 33 ++++++--------------------------- ovsdb/file.h | 7 ++++++- ovsdb/monitor.c | 4 +--- ovsdb/ovsdb.c | 7 +++++++ ovsdb/ovsdb.h | 5 +++-- ovsdb/transaction.c | 14 +++++++------- 6 files changed, 30 insertions(+), 40 deletions(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index fdd5f8e35a44..4aafb3be8ab4 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2016 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2016, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -250,6 +250,7 @@ ovsdb_file_open__(const char *file_name, if (filep) { *filep = file; } + db->file = file; } else { ovsdb_log_close(log); } @@ -500,10 +501,7 @@ ovsdb_file_read_schema(const char *file_name, struct ovsdb_schema **schemap) return ovsdb_file_open_log(file_name, OVSDB_LOG_READ_ONLY, NULL, schemap); } -/* Replica implementation. 
*/ - struct ovsdb_file { - struct ovsdb_replica replica; struct ovsdb *db; struct ovsdb_log *log; char *file_name; @@ -512,8 +510,6 @@ struct ovsdb_file { unsigned int n_transactions; }; -static const struct ovsdb_replica_class ovsdb_file_class; - static struct ovsdb_error * ovsdb_file_create(struct ovsdb *db, struct ovsdb_log *log, const char *file_name, @@ -535,26 +531,17 @@ ovsdb_file_create(struct ovsdb *db, struct ovsdb_log *log, } file = xmalloc(sizeof *file); - ovsdb_replica_init(&file->replica, &ovsdb_file_class); file->db = db; file->log = log; file->file_name = abs_name; file->last_compact = time_msec(); file->next_compact = file->last_compact + COMPACT_MIN_MSEC; file->n_transactions = n_transactions; - ovsdb_add_replica(db, &file->replica); *filep = file; return NULL; } -static struct ovsdb_file * -ovsdb_file_cast(struct ovsdb_replica *replica) -{ - ovs_assert(replica->class == &ovsdb_file_class); - return CONTAINER_OF(replica, struct ovsdb_file, replica); -} - static bool ovsdb_file_change_cb(const struct ovsdb_row *old, const struct ovsdb_row *new, @@ -579,11 +566,10 @@ ovsdb_file_txn_annotate(struct json *json, const char *comment) return json; } -static struct ovsdb_error * -ovsdb_file_commit(struct ovsdb_replica *replica, +struct ovsdb_error * +ovsdb_file_commit(struct ovsdb_file *file, const struct ovsdb_txn *txn, bool durable) { - struct ovsdb_file *file = ovsdb_file_cast(replica); struct ovsdb_file_txn ftxn; struct ovsdb_error *error; @@ -764,20 +750,13 @@ exit: return error; } -static void -ovsdb_file_destroy(struct ovsdb_replica *replica) +void +ovsdb_file_destroy(struct ovsdb_file *file) { - struct ovsdb_file *file = ovsdb_file_cast(replica); - ovsdb_log_close(file->log); free(file->file_name); free(file); } - -static const struct ovsdb_replica_class ovsdb_file_class = { - ovsdb_file_commit, - ovsdb_file_destroy -}; static void ovsdb_file_txn_init(struct ovsdb_file_txn *ftxn) diff --git a/ovsdb/file.h b/ovsdb/file.h index 
c6ca92621044..a9ef0585b261 100644 --- a/ovsdb/file.h +++ b/ovsdb/file.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010, 2011 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ struct ovsdb; struct ovsdb_file; struct ovsdb_schema; +struct ovsdb_txn; struct ovsdb_error *ovsdb_file_open(const char *file_name, bool read_only, struct ovsdb **, struct ovsdb_file **) @@ -44,6 +45,10 @@ struct ovsdb_error *ovsdb_file_read_schema(const char *file_name, struct ovsdb_schema **) OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *ovsdb_file_commit(struct ovsdb_file *, + const struct ovsdb_txn *, bool durable); +void ovsdb_file_destroy(struct ovsdb_file *); + struct json *ovsdb_file_txn_annotate(struct json *, const char *comment); #endif /* ovsdb/file.h */ diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index c0f9c557ab67..b2ecd109ed60 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -1573,7 +1573,7 @@ ovsdb_monitor_destroy(struct ovsdb_monitor *dbmon) free(dbmon); } -static struct ovsdb_error * +static void ovsdb_monitor_commit(struct ovsdb_replica *replica, const struct ovsdb_txn *txn, bool durable OVS_UNUSED) @@ -1601,8 +1601,6 @@ ovsdb_monitor_commit(struct ovsdb_replica *replica, ovsdb_monitor_json_cache_flush(m); break; } - - return NULL; } static void diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index d8f441ad0728..213d0d4823e1 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -18,6 +18,7 @@ #include "ovsdb.h" #include "column.h" +#include "file.h" #include "openvswitch/json.h" #include "ovsdb-error.h" #include "ovsdb-parser.h" @@ -328,6 +329,7 @@ ovsdb_create(struct ovsdb_schema *schema) db = xmalloc(sizeof *db); db->schema = schema; + db->file = NULL; ovs_list_init(&db->replicas); ovs_list_init(&db->triggers); db->run_triggers = false; @@ -363,6 +365,11 @@ ovsdb_destroy(struct ovsdb *db) if (db) { 
struct shash_node *node; + /* Close the log. */ + if (db->file) { + ovsdb_file_destroy(db->file); + } + /* Remove all the replicas. */ while (!ovs_list_is_empty(&db->replicas)) { struct ovsdb_replica *r diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index 89bbfa2512fa..06cd3a72e49e 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -56,6 +56,7 @@ bool ovsdb_schema_equal(const struct ovsdb_schema *, /* Database. */ struct ovsdb { struct ovsdb_schema *schema; + struct ovsdb_file *file; /* If nonnull, log for transactions. */ struct ovs_list replicas; /* Contains "struct ovsdb_replica"s. */ struct shash tables; /* Contains "struct ovsdb_table *"s. */ @@ -87,8 +88,8 @@ struct ovsdb_replica { }; struct ovsdb_replica_class { - struct ovsdb_error *(*commit)(struct ovsdb_replica *, - const struct ovsdb_txn *, bool durable); + void (*commit)(struct ovsdb_replica *, + const struct ovsdb_txn *, bool durable); void (*destroy)(struct ovsdb_replica *); }; diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index f47d45fca397..ba17834aa633 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "bitmap.h" #include "openvswitch/dynamic-string.h" +#include "file.h" #include "hash.h" #include "openvswitch/hmap.h" #include "openvswitch/json.h" @@ -865,17 +866,16 @@ ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) } /* Send the commit to each replica. */ - LIST_FOR_EACH (replica, node, &txn->db->replicas) { - error = (replica->class->commit)(replica, txn, durable); + if (txn->db->file) { + error = ovsdb_file_commit(txn->db->file, txn, durable); if (error) { - /* We don't support two-phase commit so only the first replica is - * allowed to report an error. 
*/ - ovs_assert(&replica->node == txn->db->replicas.next); - ovsdb_txn_abort(txn); return error; } } + LIST_FOR_EACH (replica, node, &txn->db->replicas) { + replica->class->commit(replica, txn, durable); + } /* Finalize commit. */ txn->db->run_triggers = true; From patchwork Mon Jan 1 05:16:30 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854288 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z958R48Zvz9t84 for ; Mon, 1 Jan 2018 16:19:03 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 9C749C87; Mon, 1 Jan 2018 05:16:56 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 6DAA8C26 for ; Mon, 1 Jan 2018 05:16:55 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 99BE714B for ; Mon, 1 Jan 2018 05:16:54 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 
8C3DAC5A44; Mon, 1 Jan 2018 06:16:52 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:30 -0800 Message-Id: <20180101051640.13043-5-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 05/15] ovsdb: Drop distinction between monitors and replicas. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Until now, OVSDB distinguished "monitors", which are associated with OVSDB JSON-RPC client sessions and allow clients to find out about database changes, from "replicas", which are associated with databases and also find out about database changes and act on them in some way. Now that committing to disk has been broken into a separate concept, there is a one-to-one and "onto" relationship between monitors and replicas: every monitor M has a replica R and R is associated with M as well. It's easier if we just treat them as a single entity, and that's what this commit implements. 
Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- ovsdb/monitor.c | 56 +++++++++++++++++++++++++---------------------------- ovsdb/monitor.h | 8 +++++++- ovsdb/ovsdb.c | 32 ++++-------------------------- ovsdb/ovsdb.h | 21 +------------------- ovsdb/transaction.c | 6 ++---- 5 files changed, 40 insertions(+), 83 deletions(-) diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index b2ecd109ed60..3e58c3fbd274 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -41,7 +41,6 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_monitor); -static const struct ovsdb_replica_class ovsdb_jsonrpc_replica_class; static struct hmap ovsdb_monitors = HMAP_INITIALIZER(&ovsdb_monitors); /* Keep state of session's conditions */ @@ -66,7 +65,7 @@ struct ovsdb_monitor_table_condition { /* A collection of tables being monitored. */ struct ovsdb_monitor { - struct ovsdb_replica replica; + struct ovs_list list_node; /* In struct ovsdb's "monitors" list. */ struct shash tables; /* Holds "struct ovsdb_monitor_table"s. */ struct ovs_list jsonrpc_monitors; /* Contains "jsonrpc_monitor_node"s. */ struct ovsdb *db; @@ -239,13 +238,6 @@ compare_ovsdb_monitor_column(const void *a_, const void *b_) return a->column < b->column ? -1 : a->column > b->column; } -static struct ovsdb_monitor * -ovsdb_monitor_cast(struct ovsdb_replica *replica) -{ - ovs_assert(replica->class == &ovsdb_jsonrpc_replica_class); - return CONTAINER_OF(replica, struct ovsdb_monitor, replica); -} - /* Finds and returns the ovsdb_monitor_row in 'mt->changes->rows' for the * given 'uuid', or NULL if there is no such row. 
*/ static struct ovsdb_monitor_row * @@ -380,8 +372,7 @@ ovsdb_monitor_create(struct ovsdb *db, dbmon = xzalloc(sizeof *dbmon); - ovsdb_replica_init(&dbmon->replica, &ovsdb_jsonrpc_replica_class); - ovsdb_add_replica(db, &dbmon->replica); + ovs_list_push_back(&db->monitors, &dbmon->list_node); ovs_list_init(&dbmon->jsonrpc_monitors); dbmon->db = db; dbmon->n_transactions = 0; @@ -1547,7 +1538,7 @@ ovsdb_monitor_destroy(struct ovsdb_monitor *dbmon) { struct shash_node *node; - ovs_list_remove(&dbmon->replica.node); + ovs_list_remove(&dbmon->list_node); if (!hmap_node_is_null(&dbmon->hmap_node)) { hmap_remove(&ovsdb_monitors, &dbmon->hmap_node); @@ -1574,11 +1565,8 @@ ovsdb_monitor_destroy(struct ovsdb_monitor *dbmon) } static void -ovsdb_monitor_commit(struct ovsdb_replica *replica, - const struct ovsdb_txn *txn, - bool durable OVS_UNUSED) +ovsdb_monitor_commit(struct ovsdb_monitor *m, const struct ovsdb_txn *txn) { - struct ovsdb_monitor *m = ovsdb_monitor_cast(replica); struct ovsdb_monitor_aux aux; ovsdb_monitor_init_aux(&aux, m); @@ -1603,17 +1591,30 @@ ovsdb_monitor_commit(struct ovsdb_replica *replica, } } -static void -ovsdb_monitor_destroy_callback(struct ovsdb_replica *replica) +void +ovsdb_monitors_commit(struct ovsdb *db, const struct ovsdb_txn *txn) { - struct ovsdb_monitor *dbmon = ovsdb_monitor_cast(replica); - struct jsonrpc_monitor_node *jm, *next; + struct ovsdb_monitor *m; - /* Delete all front end monitors. Removing the last front - * end monitor will also destroy the corresponding 'ovsdb_monitor'. - * ovsdb monitor will also be destroied. 
*/ - LIST_FOR_EACH_SAFE(jm, next, node, &dbmon->jsonrpc_monitors) { - ovsdb_jsonrpc_monitor_destroy(jm->jsonrpc_monitor); + LIST_FOR_EACH (m, list_node, &db->monitors) { + ovsdb_monitor_commit(m, txn); + } +} + +void +ovsdb_monitors_remove(struct ovsdb *db) +{ + struct ovsdb_monitor *m, *next_m; + + LIST_FOR_EACH_SAFE (m, next_m, list_node, &db->monitors) { + struct jsonrpc_monitor_node *jm, *next_jm; + + /* Delete all front end monitors. Removing the last front + * end monitor will also destroy the corresponding 'ovsdb_monitor'. + * ovsdb monitor will also be destroied. */ + LIST_FOR_EACH_SAFE (jm, next_jm, node, &m->jsonrpc_monitors) { + ovsdb_jsonrpc_monitor_destroy(jm->jsonrpc_monitor); + } } } @@ -1629,8 +1630,3 @@ ovsdb_monitor_get_memory_usage(struct simap *usage) simap_increase(usage, "json-caches", hmap_count(&dbmon->json_cache)); } } - -static const struct ovsdb_replica_class ovsdb_jsonrpc_replica_class = { - ovsdb_monitor_commit, - ovsdb_monitor_destroy_callback, -}; diff --git a/ovsdb/monitor.h b/ovsdb/monitor.h index 21f27c62160d..99d43c45dff9 100644 --- a/ovsdb/monitor.h +++ b/ovsdb/monitor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Nicira, Inc. + * Copyright (c) 2015, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,10 +17,14 @@ #ifndef OVSDB_MONITOR_H #define OVSDB_MONITOR_H +struct ovsdb; +struct ovsdb_column; struct ovsdb_monitor; struct ovsdb_jsonrpc_monitor; struct ovsdb_monitor_session_condition; struct ovsdb_condition; +struct ovsdb_txn; +struct simap; enum ovsdb_monitor_selection { OJMS_NONE = 0, /* None for this iteration */ @@ -42,6 +46,8 @@ enum ovsdb_monitor_version { struct ovsdb_monitor *ovsdb_monitor_create(struct ovsdb *db, struct ovsdb_jsonrpc_monitor *jsonrpc_monitor); +void ovsdb_monitors_remove(struct ovsdb *); +void ovsdb_monitors_commit(struct ovsdb *, const struct ovsdb_txn *); struct ovsdb_monitor *ovsdb_monitor_add(struct ovsdb_monitor *dbmon); diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 213d0d4823e1..19755e673861 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -19,6 +19,7 @@ #include "column.h" #include "file.h" +#include "monitor.h" #include "openvswitch/json.h" #include "ovsdb-error.h" #include "ovsdb-parser.h" @@ -330,7 +331,7 @@ ovsdb_create(struct ovsdb_schema *schema) db = xmalloc(sizeof *db); db->schema = schema; db->file = NULL; - ovs_list_init(&db->replicas); + ovs_list_init(&db->monitors); ovs_list_init(&db->triggers); db->run_triggers = false; @@ -370,13 +371,8 @@ ovsdb_destroy(struct ovsdb *db) ovsdb_file_destroy(db->file); } - /* Remove all the replicas. */ - while (!ovs_list_is_empty(&db->replicas)) { - struct ovsdb_replica *r - = CONTAINER_OF(ovs_list_pop_back(&db->replicas), - struct ovsdb_replica, node); - ovsdb_remove_replica(db, r); - } + /* Remove all the monitors. */ + ovsdb_monitors_remove(db); /* Delete all the tables. This also deletes their schemas. 
*/ SHASH_FOR_EACH (node, &db->tables) { @@ -419,23 +415,3 @@ ovsdb_get_table(const struct ovsdb *db, const char *name) { return shash_find_data(&db->tables, name); } - -void -ovsdb_replica_init(struct ovsdb_replica *r, - const struct ovsdb_replica_class *class) -{ - r->class = class; -} - -void -ovsdb_add_replica(struct ovsdb *db, struct ovsdb_replica *r) -{ - ovs_list_push_back(&db->replicas, &r->node); -} - -void -ovsdb_remove_replica(struct ovsdb *db OVS_UNUSED, struct ovsdb_replica *r) -{ - ovs_list_remove(&r->node); - (r->class->destroy)(r); -} diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index 06cd3a72e49e..9d915f0f15ae 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -57,7 +57,7 @@ bool ovsdb_schema_equal(const struct ovsdb_schema *, struct ovsdb { struct ovsdb_schema *schema; struct ovsdb_file *file; /* If nonnull, log for transactions. */ - struct ovs_list replicas; /* Contains "struct ovsdb_replica"s. */ + struct ovs_list monitors; /* Contains "struct ovsdb_monitor"s. */ struct shash tables; /* Contains "struct ovsdb_table *"s. */ /* Triggers. */ @@ -79,24 +79,5 @@ struct json *ovsdb_execute(struct ovsdb *, const struct ovsdb_session *, const char *role, const char *id, long long int elapsed_msec, long long int *timeout_msec); - -/* Database replication. */ - -struct ovsdb_replica { - struct ovs_list node; /* Element in "struct ovsdb" replicas list. 
*/ - const struct ovsdb_replica_class *class; -}; - -struct ovsdb_replica_class { - void (*commit)(struct ovsdb_replica *, - const struct ovsdb_txn *, bool durable); - void (*destroy)(struct ovsdb_replica *); -}; - -void ovsdb_replica_init(struct ovsdb_replica *, - const struct ovsdb_replica_class *); - -void ovsdb_add_replica(struct ovsdb *, struct ovsdb_replica *); -void ovsdb_remove_replica(struct ovsdb *, struct ovsdb_replica *); #endif /* ovsdb/ovsdb.h */ diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index ba17834aa633..f1502ffb398c 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -21,6 +21,7 @@ #include "openvswitch/dynamic-string.h" #include "file.h" #include "hash.h" +#include "monitor.h" #include "openvswitch/hmap.h" #include "openvswitch/json.h" #include "openvswitch/list.h" @@ -808,7 +809,6 @@ update_version(struct ovsdb_txn *txn OVS_UNUSED, struct ovsdb_txn_row *txn_row) static struct ovsdb_error * ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) { - struct ovsdb_replica *replica; struct ovsdb_error *error; /* Figure out what actually changed, and abort early if the transaction @@ -873,9 +873,7 @@ ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) return error; } } - LIST_FOR_EACH (replica, node, &txn->db->replicas) { - replica->class->commit(replica, txn, durable); - } + ovsdb_monitors_commit(txn->db, txn); /* Finalize commit. 
*/ txn->db->run_triggers = true; From patchwork Mon Jan 1 05:16:31 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854289 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95931tZ9z9t84 for ; Mon, 1 Jan 2018 16:19:35 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 87A8CCA0; Mon, 1 Jan 2018 05:16:59 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id A8DFEC96 for ; Mon, 1 Jan 2018 05:16:56 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 1C2EC14B for ; Mon, 1 Jan 2018 05:16:56 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id E0B4FC5A55; Mon, 1 Jan 2018 06:16:53 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:31 -0800 Message-Id: <20180101051640.13043-6-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: 
<20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 06/15] jsonrpc-server: Separate changing read_only status from reconnecting. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org The code in jsonrpc-server conflated two different kinds of functionality. It makes sense for the client to be able to change whether a particular server is read-only. It also makes sense for the client to tell a server to reconnect. The code in jsonrpc-server only provided a single function that does both, which is weird. This commit breaks these apart. Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- ovsdb/jsonrpc-server.c | 19 ++++++++++++++----- ovsdb/jsonrpc-server.h | 7 +++++-- ovsdb/ovsdb-server.c | 15 +++------------ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 05e0d73ae0be..27586cddd8b3 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -165,7 +165,7 @@ ovsdb_jsonrpc_server_add_db(struct ovsdb_jsonrpc_server *svr, struct ovsdb *db) * If this is too big of a hammer in practice, we could be more selective, * e.g. disconnect only connections that actually tried to use a database * with 'db''s name. */ - ovsdb_jsonrpc_server_reconnect(svr, svr->read_only); + ovsdb_jsonrpc_server_reconnect(svr); return ovsdb_server_add_db(&svr->up, db); } @@ -182,7 +182,7 @@ ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *svr, * * If this is too big of a hammer in practice, we could be more selective, * e.g. 
disconnect only connections that actually reference 'db'. */ - ovsdb_jsonrpc_server_reconnect(svr, svr->read_only); + ovsdb_jsonrpc_server_reconnect(svr); return ovsdb_server_remove_db(&svr->up, db); } @@ -336,11 +336,10 @@ ovsdb_jsonrpc_server_free_remote_status( /* Forces all of the JSON-RPC sessions managed by 'svr' to disconnect and * reconnect. */ void -ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr, bool read_only) +ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr) { struct shash_node *node; - svr->read_only = read_only; SHASH_FOR_EACH (node, &svr->remotes) { struct ovsdb_jsonrpc_remote *remote = node->data; @@ -349,12 +348,22 @@ ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr, bool read_only) } bool -ovsdb_jsonrpc_server_is_read_only(struct ovsdb_jsonrpc_server *svr) +ovsdb_jsonrpc_server_is_read_only(const struct ovsdb_jsonrpc_server *svr) { return svr->read_only; } void +ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *svr, + bool read_only) +{ + if (svr->read_only != read_only) { + svr->read_only = read_only; + ovsdb_jsonrpc_server_reconnect(svr); + } +} + +void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr) { struct shash_node *node; diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index 1add3276d3b6..a3acc75f8d4f 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -64,11 +64,14 @@ bool ovsdb_jsonrpc_server_get_remote_status( void ovsdb_jsonrpc_server_free_remote_status( struct ovsdb_jsonrpc_remote_status *); -void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool read_only); +void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *); void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *); void ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *); -bool ovsdb_jsonrpc_server_is_read_only(struct ovsdb_jsonrpc_server *); + +bool ovsdb_jsonrpc_server_is_read_only(const struct ovsdb_jsonrpc_server *); +void 
ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *, + bool read_only); void ovsdb_jsonrpc_server_get_memory_usage(const struct ovsdb_jsonrpc_server *, struct simap *usage); diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 7f2d19ef568b..1efb5552da5a 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -156,7 +156,6 @@ main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, char *remotes_error, *ssl_error; struct shash_node *node; long long int status_timer = LLONG_MIN; - bool last_role = *is_backup; *exiting = false; ssl_error = NULL; @@ -182,12 +181,7 @@ main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, * the set of remotes that reconfigure_remotes() uses. */ unixctl_server_run(unixctl); - /* In ovsdb-server's role (active or backup) has changed, restart - * the ovsdb jsonrpc server. */ - if (last_role != *is_backup) { - bool read_only = last_role = *is_backup; - ovsdb_jsonrpc_server_reconnect(jsonrpc, read_only); - } + ovsdb_jsonrpc_server_set_read_only(jsonrpc, *is_backup); report_error_if_changed( reconfigure_remotes(jsonrpc, all_dbs, remotes), @@ -1125,10 +1119,9 @@ ovsdb_server_disable_monitor_cond(struct unixctl_conn *conn, void *jsonrpc_) { struct ovsdb_jsonrpc_server *jsonrpc = jsonrpc_; - bool read_only = ovsdb_jsonrpc_server_is_read_only(jsonrpc); ovsdb_jsonrpc_disable_monitor_cond(); - ovsdb_jsonrpc_server_reconnect(jsonrpc, read_only); + ovsdb_jsonrpc_server_reconnect(jsonrpc); unixctl_command_reply(conn, NULL); } @@ -1186,9 +1179,7 @@ ovsdb_server_reconnect(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *jsonrpc_) { struct ovsdb_jsonrpc_server *jsonrpc = jsonrpc_; - bool read_only = ovsdb_jsonrpc_server_is_read_only(jsonrpc); - - ovsdb_jsonrpc_server_reconnect(jsonrpc, read_only); + ovsdb_jsonrpc_server_reconnect(jsonrpc); unixctl_command_reply(conn, NULL); } From patchwork Mon Jan 1 05:16:32 2018 Content-Type: text/plain; charset="utf-8" 
MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854291 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95BG00q1z9t84 for ; Mon, 1 Jan 2018 16:20:37 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 9D802CC2; Mon, 1 Jan 2018 05:17:03 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id B82B9CC0 for ; Mon, 1 Jan 2018 05:17:02 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 8E95E18A for ; Mon, 1 Jan 2018 05:16:58 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 6EA98C5A49; Mon, 1 Jan 2018 06:16:55 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:32 -0800 Message-Id: <20180101051640.13043-7-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 
required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 07/15] ovsdb-idl: Break out database-specific stuff into new data structure. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Until now, a given ovsdb-idl instance has only monitored a single database. In an upcoming commit, it will grow to also monitor a second database that represents the state of the database server itself. Much of the work is the same for both databases, so this commit breaks the common code and data out into new data structures and functions. Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- lib/ovsdb-idl-provider.h | 2 +- lib/ovsdb-idl.c | 1094 ++++++++++++++++++++++++++-------------------- lib/ovsdb-idl.h | 2 +- 3 files changed, 620 insertions(+), 478 deletions(-) diff --git a/lib/ovsdb-idl-provider.h b/lib/ovsdb-idl-provider.h index b0ebed44f83a..0337303511f0 100644 --- a/lib/ovsdb-idl-provider.h +++ b/lib/ovsdb-idl-provider.h @@ -115,7 +115,7 @@ struct ovsdb_idl_table { * for replication. */ struct shash columns; /* Contains "const struct ovsdb_idl_column *"s. */ struct hmap rows; /* Contains "struct ovsdb_idl_row"s. */ - struct ovsdb_idl *idl; /* Containing idl. */ + struct ovsdb_idl_db *db; /* Containing db. */ unsigned int change_seqno[OVSDB_IDL_CHANGE_MAX]; struct shash indexes; /* Contains "struct ovsdb_idl_index"s */ struct ovs_list track_list; /* Tracked rows (ovsdb_idl_row.track_node).
*/ diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c index 29f893116aee..24ba5b50fddc 100644 --- a/lib/ovsdb-idl.c +++ b/lib/ovsdb-idl.c @@ -117,12 +117,55 @@ enum ovsdb_idl_state { IDL_S_NO_SCHEMA }; -struct ovsdb_idl { +struct ovsdb_idl_db { + struct ovsdb_idl *idl; + /* Data. */ const struct ovsdb_idl_class *class_; struct shash table_by_name; /* Contains "struct ovsdb_idl_table *"s.*/ struct ovsdb_idl_table *tables; /* Array of ->class_->n_tables elements. */ + struct json *monitor_id; unsigned int change_seqno; + struct ovsdb_idl_txn *txn; + struct hmap outstanding_txns; + bool verify_write_only; + struct json *schema; + + /* True if any of the tables' monitoring conditions has changed. */ + bool cond_changed; + + unsigned int cond_seqno; /* Keep track of condition clauses changes + over a single conditional monitoring session. + Reverts to zero when idl session + reconnects. */ + + /* Database locking. */ + char *lock_name; /* Name of lock we need, NULL if none. */ + bool has_lock; /* Has db server told us we have the lock? */ + bool is_lock_contended; /* Has db server told us we can't get lock? */ + struct json *lock_request_id; /* JSON-RPC ID of in-flight lock request. 
*/ +}; + +static void ovsdb_idl_db_track_clear(struct ovsdb_idl_db *); +static void ovsdb_idl_db_add_column(struct ovsdb_idl_db *, + const struct ovsdb_idl_column *); +static void ovsdb_idl_db_omit(struct ovsdb_idl_db *, + const struct ovsdb_idl_column *); +static void ovsdb_idl_db_omit_alert(struct ovsdb_idl_db *, + const struct ovsdb_idl_column *); +static unsigned int ovsdb_idl_db_set_condition( + struct ovsdb_idl_db *, const struct ovsdb_idl_table_class *, + const struct ovsdb_idl_condition *); + +static void ovsdb_idl_send_schema_request(struct ovsdb_idl *, + struct ovsdb_idl_db *); +static void ovsdb_idl_send_monitor_request(struct ovsdb_idl *, + struct ovsdb_idl_db *, + bool use_monitor_cond); + +struct ovsdb_idl { + struct ovsdb_idl_db server; + struct ovsdb_idl_db db; /* XXX rename 'data'? */ /* Session state. * @@ -134,31 +177,14 @@ struct ovsdb_idl { enum ovsdb_idl_state state; /* Current session state. */ unsigned int state_seqno; /* See above. */ struct json *request_id; /* JSON ID for request awaiting reply. */ - struct json *schema; /* Temporary copy of database schema. */ - /* Database locking. */ - char *lock_name; /* Name of lock we need, NULL if none. */ - bool has_lock; /* Has db server told us we have the lock? */ - bool is_lock_contended; /* Has db server told us we can't get lock? */ - struct json *lock_request_id; /* JSON-RPC ID of in-flight lock request. */ - - /* Transaction support. */ - struct ovsdb_idl_txn *txn; - struct hmap outstanding_txns; - bool verify_write_only; - - /* Conditional monitoring. */ - bool cond_changed; - unsigned int cond_seqno; /* Keep track of condition clauses changes - over a single conditional monitoring session. - Reverts to zero when idl session - reconnects. 
*/ + bool use_monitor_cond; }; struct ovsdb_idl_txn { struct hmap_node hmap_node; struct json *request_id; - struct ovsdb_idl *idl; + struct ovsdb_idl_db *db; struct hmap txn_rows; enum ovsdb_idl_txn_status status; char *error; @@ -184,30 +210,19 @@ struct ovsdb_idl_txn_insert { struct uuid real; /* Real UUID used by database server. */ }; -enum ovsdb_update_version { - OVSDB_UPDATE, /* RFC 7047 "update" method. */ - OVSDB_UPDATE2 /* "update2" Extension to RFC 7047. - See ovsdb-server(1) for more information. */ -}; - -/* Name arrays indexed by 'enum ovsdb_update_version'. */ -static const char *table_updates_names[] = {"table_updates", "table_updates2"}; -static const char *table_update_names[] = {"table_update", "table_update2"}; -static const char *row_update_names[] = {"row_update", "row_update2"}; - static struct vlog_rate_limit syntax_rl = VLOG_RATE_LIMIT_INIT(1, 5); static struct vlog_rate_limit semantic_rl = VLOG_RATE_LIMIT_INIT(1, 5); static struct vlog_rate_limit other_rl = VLOG_RATE_LIMIT_INIT(1, 5); static void ovsdb_idl_clear(struct ovsdb_idl *); -static void ovsdb_idl_send_schema_request(struct ovsdb_idl *); -static void ovsdb_idl_send_monitor_request(struct ovsdb_idl *); -static void ovsdb_idl_send_monitor_cond_request(struct ovsdb_idl *); -static void ovsdb_idl_parse_update(struct ovsdb_idl *, const struct json *, - enum ovsdb_update_version); -static struct ovsdb_error *ovsdb_idl_parse_update__(struct ovsdb_idl *, - const struct json *, - enum ovsdb_update_version); +static void ovsdb_idl_db_parse_monitor_reply(struct ovsdb_idl_db *, + const struct json *result, + bool is_monitor_cond); +static bool ovsdb_idl_db_parse_update_rpc(struct ovsdb_idl_db *, + const struct jsonrpc_msg *); +static void ovsdb_idl_db_parse_update(struct ovsdb_idl_db *, + const struct json *table_updates, + bool is_monitor_cond); static bool ovsdb_idl_process_update(struct ovsdb_idl_table *, const struct uuid *, const struct json *old, @@ -228,7 +243,7 @@ static struct 
ovsdb_idl_row *ovsdb_idl_row_create__( static struct ovsdb_idl_row *ovsdb_idl_row_create(struct ovsdb_idl_table *, const struct uuid *); static void ovsdb_idl_row_destroy(struct ovsdb_idl_row *); -static void ovsdb_idl_row_destroy_postprocess(struct ovsdb_idl *); +static void ovsdb_idl_row_destroy_postprocess(struct ovsdb_idl_db *); static void ovsdb_idl_destroy_all_map_op_lists(struct ovsdb_idl_row *); static void ovsdb_idl_destroy_all_set_op_lists(struct ovsdb_idl_row *); @@ -239,8 +254,8 @@ static void ovsdb_idl_row_clear_new(struct ovsdb_idl_row *); static void ovsdb_idl_row_clear_arcs(struct ovsdb_idl_row *, bool destroy_dsts); static void ovsdb_idl_txn_abort_all(struct ovsdb_idl *); -static bool ovsdb_idl_txn_process_reply(struct ovsdb_idl *, - const struct jsonrpc_msg *msg); +static bool ovsdb_idl_db_txn_process_reply(struct ovsdb_idl_db *, + const struct jsonrpc_msg *msg); static bool ovsdb_idl_txn_extract_mutations(struct ovsdb_idl_row *, struct json *); static void ovsdb_idl_txn_add_map_op(struct ovsdb_idl_row *, @@ -252,13 +267,20 @@ static void ovsdb_idl_txn_add_set_op(struct ovsdb_idl_row *, struct ovsdb_datum *, enum set_op_type); -static void ovsdb_idl_send_lock_request(struct ovsdb_idl *); -static void ovsdb_idl_send_unlock_request(struct ovsdb_idl *); -static void ovsdb_idl_parse_lock_reply(struct ovsdb_idl *, - const struct json *); -static void ovsdb_idl_parse_lock_notify(struct ovsdb_idl *, - const struct json *params, - bool new_has_lock); +static bool ovsdb_idl_db_process_lock_replies(struct ovsdb_idl_db *, + const struct jsonrpc_msg *); +static struct jsonrpc_msg *ovsdb_idl_db_compose_lock_request( + struct ovsdb_idl_db *); +static struct jsonrpc_msg *ovsdb_idl_db_compose_unlock_request( + struct ovsdb_idl_db *); +static void ovsdb_idl_db_parse_lock_reply(struct ovsdb_idl_db *, + const struct json *); +static bool ovsdb_idl_db_parse_lock_notify(struct ovsdb_idl_db *, + const struct json *params, + bool new_has_lock); +static struct 
ovsdb_idl_table * +ovsdb_idl_db_table_from_class(const struct ovsdb_idl_db *, + const struct ovsdb_idl_table_class *); static struct ovsdb_idl_table * ovsdb_idl_table_from_class(const struct ovsdb_idl *, const struct ovsdb_idl_table_class *); @@ -273,6 +295,51 @@ static void static void ovsdb_idl_add_to_indexes(const struct ovsdb_idl_row *); static void ovsdb_idl_remove_from_indexes(const struct ovsdb_idl_row *); +static void +ovsdb_idl_db_init(struct ovsdb_idl_db *db, const struct ovsdb_idl_class *class, + struct ovsdb_idl *parent, bool monitor_everything_by_default) +{ + memset(db, 0, sizeof *db); + + uint8_t default_mode = (monitor_everything_by_default + ? OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT + : 0); + + db->idl = parent; + db->class_ = class; + shash_init(&db->table_by_name); + db->tables = xmalloc(class->n_tables * sizeof *db->tables); + for (size_t i = 0; i < class->n_tables; i++) { + const struct ovsdb_idl_table_class *tc = &class->tables[i]; + struct ovsdb_idl_table *table = &db->tables[i]; + + shash_add_assert(&db->table_by_name, tc->name, table); + table->class_ = tc; + table->modes = xmalloc(tc->n_columns); + memset(table->modes, default_mode, tc->n_columns); + table->need_table = false; + shash_init(&table->columns); + shash_init(&table->indexes); + for (size_t j = 0; j < tc->n_columns; j++) { + const struct ovsdb_idl_column *column = &tc->columns[j]; + + shash_add_assert(&table->columns, column->name, column); + } + hmap_init(&table->rows); + ovs_list_init(&table->track_list); + table->change_seqno[OVSDB_IDL_CHANGE_INSERT] + = table->change_seqno[OVSDB_IDL_CHANGE_MODIFY] + = table->change_seqno[OVSDB_IDL_CHANGE_DELETE] = 0; + table->db = db; + ovsdb_idl_condition_init(&table->condition); + ovsdb_idl_condition_add_clause_true(&table->condition); + table->cond_changed = false; + } + db->monitor_id = json_array_create_2(json_string_create("monid"), + json_string_create(class->database)); + hmap_init(&db->outstanding_txns); +} + /* Creates and returns a 
connection to database 'remote', which should be in a * form acceptable to jsonrpc_session_open(). The connection will maintain an * in-memory replica of the remote database whose schema is described by @@ -296,53 +363,12 @@ ovsdb_idl_create(const char *remote, const struct ovsdb_idl_class *class, bool monitor_everything_by_default, bool retry) { struct ovsdb_idl *idl; - uint8_t default_mode; - size_t i; - - default_mode = (monitor_everything_by_default - ? OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT - : 0); idl = xzalloc(sizeof *idl); - idl->class_ = class; + ovsdb_idl_db_init(&idl->db, class, idl, monitor_everything_by_default); idl->session = jsonrpc_session_open(remote, retry); - shash_init(&idl->table_by_name); - idl->tables = xmalloc(class->n_tables * sizeof *idl->tables); - for (i = 0; i < class->n_tables; i++) { - const struct ovsdb_idl_table_class *tc = &class->tables[i]; - struct ovsdb_idl_table *table = &idl->tables[i]; - size_t j; - - shash_add_assert(&idl->table_by_name, tc->name, table); - table->class_ = tc; - table->modes = xmalloc(tc->n_columns); - memset(table->modes, default_mode, tc->n_columns); - table->need_table = false; - shash_init(&table->columns); - shash_init(&table->indexes); - for (j = 0; j < tc->n_columns; j++) { - const struct ovsdb_idl_column *column = &tc->columns[j]; - - shash_add_assert(&table->columns, column->name, column); - } - hmap_init(&table->rows); - ovs_list_init(&table->track_list); - table->change_seqno[OVSDB_IDL_CHANGE_INSERT] - = table->change_seqno[OVSDB_IDL_CHANGE_MODIFY] - = table->change_seqno[OVSDB_IDL_CHANGE_DELETE] = 0; - table->idl = idl; - ovsdb_idl_condition_init(&table->condition); - ovsdb_idl_condition_add_clause_true(&table->condition); - table->cond_changed = false; - } - - idl->cond_changed = false; - idl->cond_seqno = 0; idl->state_seqno = UINT_MAX; idl->request_id = NULL; - idl->schema = NULL; - - hmap_init(&idl->outstanding_txns); return idl; } @@ -353,51 +379,54 @@ ovsdb_idl_set_remote(struct ovsdb_idl 
*idl, const char *remote, bool retry) { if (idl) { - ovs_assert(!idl->txn); - jsonrpc_session_close(idl->session); idl->session = jsonrpc_session_open(remote, retry); + /* XXX update condition */ idl->state_seqno = UINT_MAX; } } +static void +ovsdb_idl_db_destroy(struct ovsdb_idl_db *db) +{ + ovs_assert(!db->txn); + for (size_t i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; + ovsdb_idl_condition_destroy(&table->condition); + ovsdb_idl_destroy_indexes(table); + shash_destroy(&table->columns); + hmap_destroy(&table->rows); + free(table->modes); + } + shash_destroy(&db->table_by_name); + free(db->tables); + json_destroy(db->schema); + hmap_destroy(&db->outstanding_txns); + free(db->lock_name); + json_destroy(db->lock_request_id); +} + /* Destroys 'idl' and all of the data structures that it manages. */ void ovsdb_idl_destroy(struct ovsdb_idl *idl) { if (idl) { - size_t i; - - ovs_assert(!idl->txn); ovsdb_idl_clear(idl); jsonrpc_session_close(idl->session); - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; - ovsdb_idl_condition_destroy(&table->condition); - ovsdb_idl_destroy_indexes(table); - shash_destroy(&table->columns); - hmap_destroy(&table->rows); - free(table->modes); - } - shash_destroy(&idl->table_by_name); - free(idl->tables); + ovsdb_idl_db_destroy(&idl->db); json_destroy(idl->request_id); - free(idl->lock_name); - json_destroy(idl->lock_request_id); - json_destroy(idl->schema); - hmap_destroy(&idl->outstanding_txns); free(idl); } } static void -ovsdb_idl_clear(struct ovsdb_idl *idl) +ovsdb_idl_db_clear(struct ovsdb_idl_db *db) { bool changed = false; size_t i; - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; struct ovsdb_idl_row *row, *next_row; table->cond_changed = false; @@ -427,15 +456,29 @@ ovsdb_idl_clear(struct 
ovsdb_idl *idl) } } - idl->cond_changed = false; - idl->cond_seqno = 0; - ovsdb_idl_track_clear(idl); + db->cond_changed = false; + db->cond_seqno = 0; + ovsdb_idl_db_track_clear(db); if (changed) { - idl->change_seqno++; + db->change_seqno++; } } +static void +ovsdb_idl_clear(struct ovsdb_idl *idl) +{ + ovsdb_idl_db_clear(&idl->db); +} + +static void +ovsdb_idl_send_request(struct ovsdb_idl *idl, struct jsonrpc_msg *request) +{ + json_destroy(idl->request_id); + idl->request_id = json_clone(request->id); + jsonrpc_session_send(idl->session, request); +} + /* Processes a batch of messages from the database server on 'idl'. This may * cause the IDL's contents to change. The client may check for that with * ovsdb_idl_get_seqno(). */ @@ -444,7 +487,7 @@ ovsdb_idl_run(struct ovsdb_idl *idl) { int i; - ovs_assert(!idl->txn); + ovs_assert(!idl->db.txn); ovsdb_idl_send_cond_change(idl); @@ -460,10 +503,11 @@ ovsdb_idl_run(struct ovsdb_idl *idl) idl->request_id = NULL; ovsdb_idl_txn_abort_all(idl); - ovsdb_idl_send_schema_request(idl); + ovsdb_idl_send_schema_request(idl, &idl->db); idl->state = IDL_S_SCHEMA_REQUESTED; - if (idl->lock_name) { - ovsdb_idl_send_lock_request(idl); + if (idl->db.lock_name) { + jsonrpc_session_send( + idl->session, ovsdb_idl_db_compose_lock_request(&idl->db)); } } @@ -472,14 +516,8 @@ ovsdb_idl_run(struct ovsdb_idl *idl) break; } - if (msg->type == JSONRPC_NOTIFY - && !strcmp(msg->method, "update2") - && msg->params->type == JSON_ARRAY - && msg->params->u.array.n == 2 - && msg->params->u.array.elems[0]->type == JSON_STRING) { - /* Database contents changed. */ - ovsdb_idl_parse_update(idl, msg->params->u.array.elems[1], - OVSDB_UPDATE2); + if (ovsdb_idl_db_parse_update_rpc(&idl->db, msg)) { + /* ovsdb_idl_db_parse_update_rpc() did all the processing. 
*/ } else if (msg->type == JSONRPC_REPLY && idl->request_id && json_equal(idl->request_id, msg->id)) { @@ -489,35 +527,35 @@ ovsdb_idl_run(struct ovsdb_idl *idl) switch (idl->state) { case IDL_S_SCHEMA_REQUESTED: /* Reply to our "get_schema" request. */ - idl->schema = json_clone(msg->result); - ovsdb_idl_send_monitor_cond_request(idl); + idl->db.schema = json_clone(msg->result); + ovsdb_idl_send_monitor_request(idl, &idl->db, true); idl->state = IDL_S_MONITOR_COND_REQUESTED; break; case IDL_S_MONITOR_REQUESTED: case IDL_S_MONITOR_COND_REQUESTED: /* Reply to our "monitor" or "monitor_cond" request. */ - idl->change_seqno++; - ovsdb_idl_clear(idl); if (idl->state == IDL_S_MONITOR_REQUESTED) { idl->state = IDL_S_MONITORING; - ovsdb_idl_parse_update(idl, msg->result, OVSDB_UPDATE); + ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, + false); } else { /* IDL_S_MONITOR_COND_REQUESTED. */ idl->state = IDL_S_MONITORING_COND; - ovsdb_idl_parse_update(idl, msg->result, OVSDB_UPDATE2); + ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, + true); } /* Schema is not useful after monitor request is accepted * by the server. */ - json_destroy(idl->schema); - idl->schema = NULL; + json_destroy(idl->db.schema); + idl->db.schema = NULL; break; case IDL_S_MONITORING_COND: /* Conditional monitor clauses were updated. Send out * the next condition changes, in any, immediately. */ ovsdb_idl_send_cond_change(idl); - idl->cond_seqno++; + idl->db.cond_seqno++; break; case IDL_S_MONITORING: @@ -525,27 +563,8 @@ ovsdb_idl_run(struct ovsdb_idl *idl) default: OVS_NOT_REACHED(); } - } else if (msg->type == JSONRPC_NOTIFY - && !strcmp(msg->method, "update") - && msg->params->type == JSON_ARRAY - && msg->params->u.array.n == 2 - && msg->params->u.array.elems[0]->type == JSON_STRING) { - /* Database contents changed. 
*/ - ovsdb_idl_parse_update(idl, msg->params->u.array.elems[1], - OVSDB_UPDATE); - } else if (msg->type == JSONRPC_REPLY - && idl->lock_request_id - && json_equal(idl->lock_request_id, msg->id)) { - /* Reply to our "lock" request. */ - ovsdb_idl_parse_lock_reply(idl, msg->result); - } else if (msg->type == JSONRPC_NOTIFY - && !strcmp(msg->method, "locked")) { - /* We got our lock. */ - ovsdb_idl_parse_lock_notify(idl, msg->params, true); - } else if (msg->type == JSONRPC_NOTIFY - && !strcmp(msg->method, "stolen")) { - /* Someone else stole our lock. */ - ovsdb_idl_parse_lock_notify(idl, msg->params, false); + } else if (ovsdb_idl_db_process_lock_replies(&idl->db, msg)) { + /* ovsdb_idl_db_process_lock_replies() did all the processing. */ } else if (msg->type == JSONRPC_ERROR && idl->state == IDL_S_MONITOR_COND_REQUESTED && idl->request_id @@ -555,7 +574,7 @@ ovsdb_idl_run(struct ovsdb_idl *idl) /* Fall back to using "monitor" method. */ json_destroy(idl->request_id); idl->request_id = NULL; - ovsdb_idl_send_monitor_request(idl); + ovsdb_idl_send_monitor_request(idl, &idl->db, false); idl->state = IDL_S_MONITOR_REQUESTED; } } else if (msg->type == JSONRPC_ERROR @@ -578,7 +597,7 @@ ovsdb_idl_run(struct ovsdb_idl *idl) idl->state = IDL_S_NO_SCHEMA; } else if ((msg->type == JSONRPC_ERROR || msg->type == JSONRPC_REPLY) - && ovsdb_idl_txn_process_reply(idl, msg)) { + && ovsdb_idl_db_txn_process_reply(&idl->db, msg)) { /* ovsdb_idl_txn_process_reply() did everything needful. 
*/ } else { /* This can happen if ovsdb_idl_txn_destroy() is called to destroy @@ -590,7 +609,7 @@ ovsdb_idl_run(struct ovsdb_idl *idl) } jsonrpc_msg_destroy(msg); } - ovsdb_idl_row_destroy_postprocess(idl); + ovsdb_idl_row_destroy_postprocess(&idl->db); } /* Arranges for poll_block() to wake up when ovsdb_idl_run() has something to @@ -622,7 +641,7 @@ ovsdb_idl_wait(struct ovsdb_idl *idl) unsigned int ovsdb_idl_get_seqno(const struct ovsdb_idl *idl) { - return idl->change_seqno; + return idl->db.change_seqno; } /* Returns a "sequence number" that represents the number of conditional @@ -642,7 +661,7 @@ ovsdb_idl_get_seqno(const struct ovsdb_idl *idl) unsigned int ovsdb_idl_get_condition_seqno(const struct ovsdb_idl *idl) { - return idl->cond_seqno; + return idl->db.cond_seqno; } /* Returns true if 'idl' successfully connected to the remote database and @@ -682,7 +701,7 @@ ovsdb_idl_force_reconnect(struct ovsdb_idl *idl) void ovsdb_idl_verify_write_only(struct ovsdb_idl *idl) { - idl->verify_write_only = true; + idl->db.verify_write_only = true; } /* Returns true if 'idl' is currently connected or trying to connect @@ -792,7 +811,7 @@ void ovsdb_idl_check_consistency(const struct ovsdb_idl *idl) { /* Consistency is broken while a transaction is in progress. 
*/ - if (!idl->txn) { + if (!idl->db.txn) { return; } @@ -801,8 +820,8 @@ ovsdb_idl_check_consistency(const struct ovsdb_idl *idl) struct uuid *dsts = NULL; size_t allocated_dsts = 0; - for (size_t i = 0; i < idl->class_->n_tables; i++) { - const struct ovsdb_idl_table *table = &idl->tables[i]; + for (size_t i = 0; i < idl->db.class_->n_tables; i++) { + const struct ovsdb_idl_table *table = &idl->db.tables[i]; const struct ovsdb_idl_table_class *class = table->class_; const struct ovsdb_idl_row *row; @@ -849,7 +868,7 @@ ovsdb_idl_check_consistency(const struct ovsdb_idl *idl) const struct ovsdb_idl_class * ovsdb_idl_get_class(const struct ovsdb_idl *idl) { - return idl->class_; + return idl->db.class_; } /* Given 'column' in some table in 'class', returns the table's class. */ @@ -867,44 +886,52 @@ ovsdb_idl_table_class_from_column(const struct ovsdb_idl_class *class, OVS_NOT_REACHED(); } -/* Given 'column' in some table in 'idl', returns the table. */ +/* Given 'column' in some table in 'db', returns the table. 
*/ static struct ovsdb_idl_table * -ovsdb_idl_table_from_column(struct ovsdb_idl *idl, +ovsdb_idl_table_from_column(struct ovsdb_idl_db *db, const struct ovsdb_idl_column *column) { const struct ovsdb_idl_table_class *tc = - ovsdb_idl_table_class_from_column(idl->class_, column); - return &idl->tables[tc - idl->class_->tables]; + ovsdb_idl_table_class_from_column(db->class_, column); + return &db->tables[tc - db->class_->tables]; } static unsigned char * -ovsdb_idl_get_mode(struct ovsdb_idl *idl, - const struct ovsdb_idl_column *column) +ovsdb_idl_db_get_mode(struct ovsdb_idl_db *db, + const struct ovsdb_idl_column *column) { - ovs_assert(!idl->change_seqno); + ovs_assert(!db->change_seqno); - const struct ovsdb_idl_table *table = ovsdb_idl_table_from_column(idl, + const struct ovsdb_idl_table *table = ovsdb_idl_table_from_column(db, column); return &table->modes[column - table->class_->columns]; } static void -add_ref_table(struct ovsdb_idl *idl, const struct ovsdb_base_type *base) +add_ref_table(struct ovsdb_idl_db *db, const struct ovsdb_base_type *base) { if (base->type == OVSDB_TYPE_UUID && base->u.uuid.refTableName) { struct ovsdb_idl_table *table; - table = shash_find_data(&idl->table_by_name, - base->u.uuid.refTableName); + table = shash_find_data(&db->table_by_name, base->u.uuid.refTableName); if (table) { table->need_table = true; } else { VLOG_WARN("%s IDL class missing referenced table %s", - idl->class_->database, base->u.uuid.refTableName); + db->class_->database, base->u.uuid.refTableName); } } } +static void +ovsdb_idl_db_add_column(struct ovsdb_idl_db *db, + const struct ovsdb_idl_column *column) +{ + *ovsdb_idl_db_get_mode(db, column) = OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT; + add_ref_table(db, &column->type.key); + add_ref_table(db, &column->type.value); +} + /* Turns on OVSDB_IDL_MONITOR and OVSDB_IDL_ALERT for 'column' in 'idl'. 
Also * ensures that any tables referenced by 'column' will be replicated, even if * no columns in that table are selected for replication (see @@ -918,9 +945,25 @@ void ovsdb_idl_add_column(struct ovsdb_idl *idl, const struct ovsdb_idl_column *column) { - *ovsdb_idl_get_mode(idl, column) = OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT; - add_ref_table(idl, &column->type.key); - add_ref_table(idl, &column->type.value); + ovsdb_idl_db_add_column(&idl->db, column); +} + +static void +ovsdb_idl_db_add_table(struct ovsdb_idl_db *db, + const struct ovsdb_idl_table_class *tc) +{ + size_t i; + + for (i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; + + if (table->class_ == tc) { + table->need_table = true; + return; + } + } + + OVS_NOT_REACHED(); } /* Ensures that the table with class 'tc' will be replicated on 'idl' even if @@ -940,18 +983,7 @@ void ovsdb_idl_add_table(struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *tc) { - size_t i; - - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; - - if (table->class_ == tc) { - table->need_table = true; - return; - } - } - - OVS_NOT_REACHED(); + ovsdb_idl_db_add_table(&idl->db, tc); } /* A single clause within an ovsdb_idl_condition. 
*/ @@ -1147,6 +1179,24 @@ ovsdb_idl_condition_clone(struct ovsdb_idl_condition *dst, } } +static unsigned int +ovsdb_idl_db_set_condition(struct ovsdb_idl_db *db, + const struct ovsdb_idl_table_class *tc, + const struct ovsdb_idl_condition *condition) +{ + struct ovsdb_idl_table *table = ovsdb_idl_db_table_from_class(db, tc); + unsigned int seqno = db->cond_seqno; + if (!ovsdb_idl_condition_equals(condition, &table->condition)) { + ovsdb_idl_condition_destroy(&table->condition); + ovsdb_idl_condition_clone(&table->condition, condition); + db->cond_changed = table->cond_changed = true; + poll_immediate_wake(); + return seqno + 1; + } + + return seqno; +} + /* Sets the replication condition for 'tc' in 'idl' to 'condition' and * arranges to send the new condition to the database server. * @@ -1158,17 +1208,7 @@ ovsdb_idl_set_condition(struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *tc, const struct ovsdb_idl_condition *condition) { - struct ovsdb_idl_table *table = ovsdb_idl_table_from_class(idl, tc); - unsigned int seqno = idl->cond_seqno; - if (!ovsdb_idl_condition_equals(condition, &table->condition)) { - ovsdb_idl_condition_destroy(&table->condition); - ovsdb_idl_condition_clone(&table->condition, condition); - idl->cond_changed = table->cond_changed = true; - poll_immediate_wake(); - return seqno + 1; - } - - return seqno; + return ovsdb_idl_db_set_condition(&idl->db, tc, condition); } static struct json * @@ -1205,25 +1245,16 @@ ovsdb_idl_create_cond_change_req(struct ovsdb_idl_table *table) return monitor_cond_change_request; } -static void -ovsdb_idl_send_cond_change(struct ovsdb_idl *idl) +static struct jsonrpc_msg * +ovsdb_idl_db_compose_cond_change(struct ovsdb_idl_db *db) { - int i; - struct json *params; - struct jsonrpc_msg *request; - - /* When 'idl-request_id' is not NULL, there is an outstanding - * conditional monitoring update request that we have not heard - * from the server yet. Don't generate another request in this case. 
*/ - if (!idl->cond_changed || !jsonrpc_session_is_connected(idl->session) || - idl->state != IDL_S_MONITORING_COND || idl->request_id) { - return; + if (!db->cond_changed) { + return NULL; } struct json *monitor_cond_change_requests = NULL; - - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (size_t i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; if (table->cond_changed) { struct json *req = ovsdb_idl_create_cond_change_req(table); @@ -1239,17 +1270,48 @@ ovsdb_idl_send_cond_change(struct ovsdb_idl *idl) } } - /* Send request if not empty. */ - if (monitor_cond_change_requests) { - params = json_array_create_3(json_string_create("monid"), - json_string_create("monid"), - monitor_cond_change_requests); + if (!monitor_cond_change_requests) { + return NULL; + } + + db->cond_changed = false; + struct json *params = json_array_create_3(json_clone(db->monitor_id), + json_clone(db->monitor_id), + monitor_cond_change_requests); + return jsonrpc_create_request("monitor_cond_change", params, NULL); +} + +static void +ovsdb_idl_send_cond_change(struct ovsdb_idl *idl) +{ + /* When 'idl->request_id' is not NULL, there is an outstanding + * conditional monitoring update request that we have not heard + * from the server yet. Don't generate another request in this case. + * + * XXX per-db request_id */ + if (!jsonrpc_session_is_connected(idl->session) + || idl->state != IDL_S_MONITORING_COND + || idl->request_id) { + return; + } - request = jsonrpc_create_request("monitor_cond_change", params, - &idl->request_id); - jsonrpc_session_send(idl->session, request); + struct jsonrpc_msg *msg = ovsdb_idl_db_compose_cond_change(&idl->db); + if (msg) { + idl->request_id = json_clone(msg->id); + jsonrpc_session_send(idl->session, msg); } - idl->cond_changed = false; +} + +/* Turns off OVSDB_IDL_ALERT for 'column' in 'db'.
+ * + * This function should be called between ovsdb_idl_create() and the first call + * to ovsdb_idl_run(). + */ +static void +ovsdb_idl_db_omit_alert(struct ovsdb_idl_db *db, + const struct ovsdb_idl_column *column) +{ + *ovsdb_idl_db_get_mode(db, column) &= ~OVSDB_IDL_ALERT; } /* Turns off OVSDB_IDL_ALERT for 'column' in 'idl'. @@ -1261,7 +1323,14 @@ void ovsdb_idl_omit_alert(struct ovsdb_idl *idl, const struct ovsdb_idl_column *column) { - *ovsdb_idl_get_mode(idl, column) &= ~OVSDB_IDL_ALERT; + ovsdb_idl_db_omit_alert(&idl->db, column); +} + +static void +ovsdb_idl_db_omit(struct ovsdb_idl_db *db, + const struct ovsdb_idl_column *column) +{ + *ovsdb_idl_db_get_mode(db, column) = 0; } /* Sets the mode for 'column' in 'idl' to 0. See the big comment above @@ -1273,7 +1342,7 @@ ovsdb_idl_omit_alert(struct ovsdb_idl *idl, void ovsdb_idl_omit(struct ovsdb_idl *idl, const struct ovsdb_idl_column *column) { - *ovsdb_idl_get_mode(idl, column) = 0; + ovsdb_idl_db_omit(&idl->db, column); } /* Returns the most recent IDL change sequence number that caused a @@ -1284,7 +1353,7 @@ ovsdb_idl_table_get_seqno(const struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *table_class) { struct ovsdb_idl_table *table - = ovsdb_idl_table_from_class(idl, table_class); + = ovsdb_idl_db_table_from_class(&idl->db, table_class); unsigned int max_seqno = table->change_seqno[OVSDB_IDL_CHANGE_INSERT]; if (max_seqno < table->change_seqno[OVSDB_IDL_CHANGE_MODIFY]) { @@ -1321,10 +1390,10 @@ void ovsdb_idl_track_add_column(struct ovsdb_idl *idl, const struct ovsdb_idl_column *column) { - if (!(*ovsdb_idl_get_mode(idl, column) & OVSDB_IDL_ALERT)) { + if (!(*ovsdb_idl_db_get_mode(&idl->db, column) & OVSDB_IDL_ALERT)) { ovsdb_idl_add_column(idl, column); } - *ovsdb_idl_get_mode(idl, column) |= OVSDB_IDL_TRACK; + *ovsdb_idl_db_get_mode(&idl->db, column) |= OVSDB_IDL_TRACK; } void @@ -1332,8 +1401,8 @@ ovsdb_idl_track_add_all(struct ovsdb_idl *idl) { size_t i, j; - for (i = 0; i < 
idl->class_->n_tables; i++) { - const struct ovsdb_idl_table_class *tc = &idl->class_->tables[i]; + for (i = 0; i < idl->db.class_->n_tables; i++) { + const struct ovsdb_idl_table_class *tc = &idl->db.class_->tables[i]; for (j = 0; j < tc->n_columns; j++) { const struct ovsdb_idl_column *column = &tc->columns[j]; @@ -1363,7 +1432,7 @@ ovsdb_idl_track_get_first(const struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *table_class) { struct ovsdb_idl_table *table - = ovsdb_idl_table_from_class(idl, table_class); + = ovsdb_idl_db_table_from_class(&idl->db, table_class); if (!ovs_list_is_empty(&table->track_list)) { return CONTAINER_OF(ovs_list_front(&table->track_list), struct ovsdb_idl_row, track_node); @@ -1411,13 +1480,13 @@ ovsdb_idl_track_is_updated(const struct ovsdb_idl_row *row, * functions. This is usually done at the end of the client's processing * loop when it is ready to do ovsdb_idl_run() again. */ -void -ovsdb_idl_track_clear(const struct ovsdb_idl *idl) +static void +ovsdb_idl_db_track_clear(struct ovsdb_idl_db *db) { size_t i; - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; if (!ovs_list_is_empty(&table->track_list)) { struct ovsdb_idl_row *row, *next; @@ -1438,20 +1507,27 @@ ovsdb_idl_track_clear(const struct ovsdb_idl *idl) } } +/* Flushes the tracked rows. Client calls this function after calling + * ovsdb_idl_run() and read all tracked rows with the ovsdb_idl_track_get_*() + * functions. This is usually done at the end of the client's processing + * loop when it is ready to do ovsdb_idl_run() again. 
+ */ +void +ovsdb_idl_track_clear(struct ovsdb_idl *idl) +{ + ovsdb_idl_db_track_clear(&idl->db); +} static void -ovsdb_idl_send_schema_request(struct ovsdb_idl *idl) +ovsdb_idl_send_schema_request(struct ovsdb_idl *idl, + struct ovsdb_idl_db *db) { - struct jsonrpc_msg *msg; - - json_destroy(idl->request_id); - msg = jsonrpc_create_request( - "get_schema", - json_array_create_1(json_string_create(idl->class_->database)), - &idl->request_id); - jsonrpc_session_send(idl->session, msg); + ovsdb_idl_send_request(idl, jsonrpc_create_request( + "get_schema", + json_array_create_1(json_string_create( + db->class_->database)), + NULL)); } - static void log_error(struct ovsdb_error *error) { @@ -1536,36 +1612,29 @@ parse_schema(const struct json *schema_json) } static void -ovsdb_idl_send_monitor_request__(struct ovsdb_idl *idl, - const char *method) +ovsdb_idl_send_monitor_request(struct ovsdb_idl *idl, struct ovsdb_idl_db *db, + bool use_monitor_cond) { - struct shash *schema; - struct json *monitor_requests; - struct jsonrpc_msg *msg; - size_t i; + struct shash *schema = parse_schema(db->schema); + struct json *monitor_requests = json_object_create(); - schema = parse_schema(idl->schema); - monitor_requests = json_object_create(); - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (size_t i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; const struct ovsdb_idl_table_class *tc = table->class_; - struct json *monitor_request, *columns, *where; - const struct sset *table_schema; - size_t j; - - table_schema = (schema - ? shash_find_data(schema, table->class_->name) - : NULL); + struct json *monitor_request; + const struct sset *table_schema + = schema ? shash_find_data(schema, table->class_->name) : NULL; - columns = table->need_table ? json_array_create_empty() : NULL; - for (j = 0; j < tc->n_columns; j++) { + struct json *columns + = table->need_table ? 
json_array_create_empty() : NULL; + for (size_t j = 0; j < tc->n_columns; j++) { const struct ovsdb_idl_column *column = &tc->columns[j]; if (table->modes[j] & OVSDB_IDL_MONITOR) { if (table_schema && !sset_contains(table_schema, column->name)) { VLOG_WARN("%s table in %s database lacks %s column " "(database needs upgrade?)", - table->class_->name, idl->class_->database, + table->class_->name, db->class_->database, column->name); continue; } @@ -1580,17 +1649,18 @@ ovsdb_idl_send_monitor_request__(struct ovsdb_idl *idl, if (schema && !table_schema) { VLOG_WARN("%s database lacks %s table " "(database needs upgrade?)", - idl->class_->database, table->class_->name); + db->class_->database, table->class_->name); json_destroy(columns); continue; } monitor_request = json_object_create(); json_object_put(monitor_request, "columns", columns); - if (!strcmp(method, "monitor_cond") - && !ovsdb_idl_condition_is_true(&table->condition)) { - where = ovsdb_idl_condition_to_json(&table->condition); - json_object_put(monitor_request, "where", where); + + const struct ovsdb_idl_condition *cond = &table->condition; + if (use_monitor_cond && !ovsdb_idl_condition_is_true(cond)) { + json_object_put(monitor_request, "where", + ovsdb_idl_condition_to_json(cond)); table->cond_changed = false; } json_object_put(monitor_requests, tc->name, monitor_request); @@ -1598,21 +1668,15 @@ ovsdb_idl_send_monitor_request__(struct ovsdb_idl *idl, } free_schema(schema); - json_destroy(idl->request_id); + db->cond_changed = false; - msg = jsonrpc_create_request( - method, - json_array_create_3(json_string_create(idl->class_->database), - json_string_create("monid"), monitor_requests), - &idl->request_id); - jsonrpc_session_send(idl->session, msg); - idl->cond_changed = false; -} - -static void -ovsdb_idl_send_monitor_request(struct ovsdb_idl *idl) -{ - ovsdb_idl_send_monitor_request__(idl, "monitor"); + ovsdb_idl_send_request( + idl, + jsonrpc_create_request( + use_monitor_cond ? 
"monitor_cond" : "monitor", + json_array_create_3(json_string_create(db->class_->database), + json_clone(db->monitor_id), monitor_requests), + NULL)); } static void @@ -1627,36 +1691,46 @@ log_parse_update_error(struct ovsdb_error *error) } static void -ovsdb_idl_send_monitor_cond_request(struct ovsdb_idl *idl) +ovsdb_idl_db_parse_monitor_reply(struct ovsdb_idl_db *db, + const struct json *result, + bool is_monitor_cond) { - ovsdb_idl_send_monitor_request__(idl, "monitor_cond"); + db->change_seqno++; + ovsdb_idl_db_clear(db); + ovsdb_idl_db_parse_update(db, result, is_monitor_cond); } -static void -ovsdb_idl_parse_update(struct ovsdb_idl *idl, const struct json *table_updates, - enum ovsdb_update_version version) +static bool +ovsdb_idl_db_parse_update_rpc(struct ovsdb_idl_db *db, + const struct jsonrpc_msg *msg) { - struct ovsdb_error *error = ovsdb_idl_parse_update__(idl, table_updates, - version); - if (error) { - log_parse_update_error(error); + if (msg->type == JSONRPC_NOTIFY) { + bool is_update = !strcmp(msg->method, "update"); + bool is_update2 = !strcmp(msg->method, "update2"); + if ((is_update || is_update2) + && msg->params->type == JSON_ARRAY + && msg->params->u.array.n == 2 + && json_equal(msg->params->u.array.elems[0], db->monitor_id)) { + ovsdb_idl_db_parse_update(db, msg->params->u.array.elems[1], + is_update2); + return true; + } } + return false; } static struct ovsdb_error * -ovsdb_idl_parse_update__(struct ovsdb_idl *idl, - const struct json *table_updates, - enum ovsdb_update_version version) +ovsdb_idl_db_parse_update__(struct ovsdb_idl_db *db, + const struct json *table_updates, + bool is_monitor_cond) { const struct shash_node *tables_node; - const char *table_updates_name = table_updates_names[version]; - const char *table_update_name = table_update_names[version]; - const char *row_update_name = row_update_names[version]; + const char *version_suffix = is_monitor_cond ? 
"2" : ""; if (table_updates->type != JSON_OBJECT) { return ovsdb_syntax_error(table_updates, NULL, - "<%s> is not an object", - table_updates_name); + " is not an object", + version_suffix); } SHASH_FOR_EACH (tables_node, json_object(table_updates)) { @@ -1664,75 +1738,43 @@ ovsdb_idl_parse_update__(struct ovsdb_idl *idl, const struct shash_node *table_node; struct ovsdb_idl_table *table; - table = shash_find_data(&idl->table_by_name, tables_node->name); + table = shash_find_data(&db->table_by_name, tables_node->name); if (!table) { return ovsdb_syntax_error( table_updates, NULL, - "<%s> includes unknown table \"%s\"", - table_updates_name, - tables_node->name); + " includes unknown table \"%s\"", + version_suffix, tables_node->name); } if (table_update->type != JSON_OBJECT) { return ovsdb_syntax_error(table_update, NULL, - "<%s> for table \"%s\" is " + " for table \"%s\" is " "not an object", - table_update_name, - table->class_->name); + version_suffix, table->class_->name); } SHASH_FOR_EACH (table_node, json_object(table_update)) { const struct json *row_update = table_node->data; - const struct json *old_json, *new_json; struct uuid uuid; if (!uuid_from_string(&uuid, table_node->name)) { return ovsdb_syntax_error(table_update, NULL, - "<%s> for table \"%s\" " + " for table \"%s\" " "contains bad UUID " "\"%s\" as member name", - table_update_name, + version_suffix, table->class_->name, table_node->name); } if (row_update->type != JSON_OBJECT) { return ovsdb_syntax_error(row_update, NULL, - "<%s> for table \"%s\" " - "contains <%s> for %s that " - "is not an object", - table_update_name, - table->class_->name, - row_update_name, - table_node->name); + " for table \"%s\" " + "contains for %s " + "that is not an object", + version_suffix, table->class_->name, + version_suffix, table_node->name); } - switch(version) { - case OVSDB_UPDATE: - old_json = shash_find_data(json_object(row_update), "old"); - new_json = shash_find_data(json_object(row_update), "new"); - if 
(old_json && old_json->type != JSON_OBJECT) { - return ovsdb_syntax_error(old_json, NULL, - "\"old\" is not object"); - } else if (new_json && new_json->type != JSON_OBJECT) { - return ovsdb_syntax_error(new_json, NULL, - "\"new\" is not object"); - } else if ((old_json != NULL) + (new_json != NULL) - != shash_count(json_object(row_update))) { - return ovsdb_syntax_error(row_update, NULL, - "<row-update> contains " - "unexpected member"); - } else if (!old_json && !new_json) { - return ovsdb_syntax_error(row_update, NULL, - "<row-update> missing \"old\" " - "and \"new\" members"); - } - - if (ovsdb_idl_process_update(table, &uuid, old_json, - new_json)) { - idl->change_seqno++; - } - break; - - case OVSDB_UPDATE2: { + if (is_monitor_cond) { const char *ops[] = {"modify", "insert", "delete", "initial"}; const char *operation; const struct json *row; @@ -1745,7 +1787,7 @@ ovsdb_idl_parse_update__(struct ovsdb_idl *idl, if (row) { if (ovsdb_idl_process_update2(table, &uuid, operation, row)) { - idl->change_seqno++; + db->change_seqno++; } break; } @@ -1757,11 +1799,32 @@ ovsdb_idl_parse_update__(struct ovsdb_idl *idl, "<row_update2> includes unknown " "object"); } - break; - } + } else { + const struct json *old_json, *new_json; - default: - OVS_NOT_REACHED(); + old_json = shash_find_data(json_object(row_update), "old"); + new_json = shash_find_data(json_object(row_update), "new"); + if (old_json && old_json->type != JSON_OBJECT) { + return ovsdb_syntax_error(old_json, NULL, + "\"old\" is not object"); + } else if (new_json && new_json->type != JSON_OBJECT) { + return ovsdb_syntax_error(new_json, NULL, + "\"new\" is not object"); + } else if ((old_json != NULL) + (new_json != NULL) + != shash_count(json_object(row_update))) { + return ovsdb_syntax_error(row_update, NULL, + "<row-update> contains " + "unexpected member"); + } else if (!old_json && !new_json) { + return ovsdb_syntax_error(row_update, NULL, + "<row-update> missing \"old\" " + "and \"new\" members"); + } + + if (ovsdb_idl_process_update(table, &uuid, old_json, +
new_json)) { + db->change_seqno++; + } } } } @@ -1769,6 +1832,18 @@ ovsdb_idl_parse_update__(struct ovsdb_idl *idl, return NULL; } +static void +ovsdb_idl_db_parse_update(struct ovsdb_idl_db *db, + const struct json *table_updates, + bool is_monitor_cond) +{ + struct ovsdb_error *error = ovsdb_idl_db_parse_update__(db, table_updates, + is_monitor_cond); + if (error) { + log_parse_update_error(error); + } +} + static struct ovsdb_idl_row * ovsdb_idl_get_row(struct ovsdb_idl_table *table, const struct uuid *uuid) { @@ -1955,7 +2030,7 @@ ovsdb_idl_row_change__(struct ovsdb_idl_row *row, const struct json *row_json, changed = true; row->change_seqno[change] = row->table->change_seqno[change] - = row->table->idl->change_seqno + 1; + = row->table->db->change_seqno + 1; if (table->modes[column_idx] & OVSDB_IDL_TRACK) { if (!ovs_list_is_empty(&row->track_node)) { ovs_list_remove(&row->track_node); @@ -2067,20 +2142,16 @@ ovsdb_idl_row_unparse(struct ovsdb_idl_row *row) * iterate over a subset of rows in a defined order. */ -/* Creates a new index with the provided name, attached to the given idl and - * table. Note that all indexes must be created and indexing columns added - * before the first call to ovsdb_idl_run() is made. 
- */ -struct ovsdb_idl_index * -ovsdb_idl_create_index(struct ovsdb_idl *idl, - const struct ovsdb_idl_table_class *tc, - const char *index_name) +static struct ovsdb_idl_index * +ovsdb_idl_db_create_index(struct ovsdb_idl_db *db, + const struct ovsdb_idl_table_class *tc, + const char *index_name) { struct ovsdb_idl_index *index; size_t i; - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; if (table->class_ == tc) { index = ovsdb_idl_create_index_(table, 1); @@ -2097,6 +2168,18 @@ ovsdb_idl_create_index(struct ovsdb_idl *idl, return NULL; } +/* Creates a new index with the provided name, attached to the given idl and + * table. Note that all indexes must be created and indexing columns added + * before the first call to ovsdb_idl_run() is made. + */ +struct ovsdb_idl_index * +ovsdb_idl_create_index(struct ovsdb_idl *idl, + const struct ovsdb_idl_table_class *tc, + const char *index_name) +{ + return ovsdb_idl_db_create_index(&idl->db, tc, index_name); +} + /* Generic comparator that can compare each index, using the custom * configuration (an struct ovsdb_idl_index) passed to it. * Not intended for direct usage. 
@@ -2233,7 +2316,7 @@ ovsdb_idl_index_add_column(struct ovsdb_idl_index *index, /* Check that the column or table is tracked */ if (!index->table->need_table && !((OVSDB_IDL_MONITOR | OVSDB_IDL_ALERT) & - *ovsdb_idl_get_mode(index->table->idl, column))) { + *ovsdb_idl_db_get_mode(index->table->db, column))) { VLOG_ERR("Can't add unmonitored column '%s' at index '%s' in " "table '%s'.", column->name, index->index_name, index->table->class_->name); @@ -2263,16 +2346,16 @@ ovsdb_idl_index_add_column(struct ovsdb_idl_index *index, index->n_columns++; } -bool -ovsdb_idl_initialize_cursor(struct ovsdb_idl *idl, - const struct ovsdb_idl_table_class *tc, - const char *index_name, - struct ovsdb_idl_index_cursor *cursor) +static bool +ovsdb_idl_db_initialize_cursor(struct ovsdb_idl_db *db, + const struct ovsdb_idl_table_class *tc, + const char *index_name, + struct ovsdb_idl_index_cursor *cursor) { size_t i; - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; if (table->class_ == tc) { struct shash_node *node = shash_find(&table->indexes, index_name); @@ -2295,6 +2378,15 @@ ovsdb_idl_initialize_cursor(struct ovsdb_idl *idl, return false; } +bool +ovsdb_idl_initialize_cursor(struct ovsdb_idl *idl, + const struct ovsdb_idl_table_class *tc, + const char *index_name, + struct ovsdb_idl_index_cursor *cursor) +{ + return ovsdb_idl_db_initialize_cursor(&idl->db, tc, index_name, cursor); +} + /* ovsdb_idl_index_write_ writes a datum in an ovsdb_idl_row, * and updates the corresponding field in the table record. * Not intended for direct usage. 
@@ -2587,7 +2679,7 @@ ovsdb_idl_row_destroy(struct ovsdb_idl_row *row) if (ovsdb_idl_track_is_set(row->table)) { row->change_seqno[OVSDB_IDL_CHANGE_DELETE] = row->table->change_seqno[OVSDB_IDL_CHANGE_DELETE] - = row->table->idl->change_seqno + 1; + = row->table->db->change_seqno + 1; } if (!ovs_list_is_empty(&row->track_node)) { ovs_list_remove(&row->track_node); @@ -2639,12 +2731,12 @@ ovsdb_idl_destroy_all_set_op_lists(struct ovsdb_idl_row *row) } static void -ovsdb_idl_row_destroy_postprocess(struct ovsdb_idl *idl) +ovsdb_idl_row_destroy_postprocess(struct ovsdb_idl_db *db) { size_t i; - for (i = 0; i < idl->class_->n_tables; i++) { - struct ovsdb_idl_table *table = &idl->tables[i]; + for (i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; if (!ovs_list_is_empty(&table->track_list)) { struct ovsdb_idl_row *row, *next; @@ -2748,10 +2840,18 @@ may_add_arc(const struct ovsdb_idl_row *src, const struct ovsdb_idl_row *dst) } static struct ovsdb_idl_table * +ovsdb_idl_db_table_from_class(const struct ovsdb_idl_db *db, + const struct ovsdb_idl_table_class *table_class) +{ + ptrdiff_t idx = table_class - db->class_->tables; + return idx >= 0 && idx < db->class_->n_tables ? &db->tables[idx] : NULL; +} + +static struct ovsdb_idl_table * ovsdb_idl_table_from_class(const struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *table_class) { - return &idl->tables[table_class - idl->class_->tables]; + return ovsdb_idl_db_table_from_class(&idl->db, table_class); } /* Called by ovsdb-idlc generated code. 
*/ @@ -2760,14 +2860,14 @@ ovsdb_idl_get_row_arc(struct ovsdb_idl_row *src, const struct ovsdb_idl_table_class *dst_table_class, const struct uuid *dst_uuid) { - struct ovsdb_idl *idl = src->table->idl; + struct ovsdb_idl_db *db = src->table->db; struct ovsdb_idl_table *dst_table; struct ovsdb_idl_arc *arc; struct ovsdb_idl_row *dst; - dst_table = ovsdb_idl_table_from_class(idl, dst_table_class); + dst_table = ovsdb_idl_db_table_from_class(db, dst_table_class); dst = ovsdb_idl_get_row(dst_table, dst_uuid); - if (idl->txn || is_index_row(src)) { + if (db->txn || is_index_row(src)) { /* There are two cases we should not update any arcs: * * 1. We're being called from ovsdb_idl_txn_write(). We must not update @@ -2837,8 +2937,8 @@ const struct ovsdb_idl_row * ovsdb_idl_first_row(const struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *table_class) { - struct ovsdb_idl_table *table - = ovsdb_idl_table_from_class(idl, table_class); + struct ovsdb_idl_table *table = ovsdb_idl_table_from_class(idl, + table_class); return next_real_row(table, hmap_first(&table->rows)); } @@ -2972,10 +3072,10 @@ ovsdb_idl_txn_create(struct ovsdb_idl *idl) { struct ovsdb_idl_txn *txn; - ovs_assert(!idl->txn); - idl->txn = txn = xmalloc(sizeof *txn); + ovs_assert(!idl->db.txn); + idl->db.txn = txn = xmalloc(sizeof *txn); txn->request_id = NULL; - txn->idl = idl; + txn->db = &idl->db; hmap_init(&txn->txn_rows); txn->status = TXN_UNCOMMITTED; txn->error = NULL; @@ -3067,7 +3167,7 @@ ovsdb_idl_txn_destroy(struct ovsdb_idl_txn *txn) json_destroy(txn->request_id); if (txn->status == TXN_INCOMPLETE) { - hmap_remove(&txn->idl->outstanding_txns, &txn->hmap_node); + hmap_remove(&txn->db->outstanding_txns, &txn->hmap_node); } ovsdb_idl_txn_abort(txn); ds_destroy(&txn->comment); @@ -3163,7 +3263,7 @@ ovsdb_idl_txn_disassemble(struct ovsdb_idl_txn *txn) * ovsdb_idl_column's 'parse' function, which will call * ovsdb_idl_get_row_arc(), which will seen that the IDL is in a * transaction and fail to 
update the graph. */ - txn->idl->txn = NULL; + txn->db->txn = NULL; HMAP_FOR_EACH_SAFE (row, next, txn_node, &txn->txn_rows) { ovsdb_idl_destroy_all_map_op_lists(row); @@ -3449,25 +3549,25 @@ ovsdb_idl_txn_commit(struct ovsdb_idl_txn *txn) struct json *operations; bool any_updates; - if (txn != txn->idl->txn) { + if (txn != txn->db->txn) { goto coverage_out; } /* If we need a lock but don't have it, give up quickly. */ - if (txn->idl->lock_name && !ovsdb_idl_has_lock(txn->idl)) { + if (txn->db->lock_name && !txn->db->has_lock) { txn->status = TXN_NOT_LOCKED; goto disassemble_out; } operations = json_array_create_1( - json_string_create(txn->idl->class_->database)); + json_string_create(txn->db->class_->database)); /* Assert that we have the required lock (avoiding a race). */ - if (txn->idl->lock_name) { + if (txn->db->lock_name) { struct json *op = json_object_create(); json_array_add(operations, op); json_object_put_string(op, "op", "assert"); - json_object_put_string(op, "lock", txn->idl->lock_name); + json_object_put_string(op, "lock", txn->db->lock_name); } /* Add prerequisites and declarations of new rows. 
*/ @@ -3670,10 +3770,10 @@ ovsdb_idl_txn_commit(struct ovsdb_idl_txn *txn) txn->status = TXN_UNCHANGED; json_destroy(operations); } else if (!jsonrpc_session_send( - txn->idl->session, + txn->db->idl->session, jsonrpc_create_request( "transact", operations, &txn->request_id))) { - hmap_insert(&txn->idl->outstanding_txns, &txn->hmap_node, + hmap_insert(&txn->db->outstanding_txns, &txn->hmap_node, json_hash(txn->request_id, 0)); txn->status = TXN_INCOMPLETE; } else { @@ -3710,8 +3810,8 @@ ovsdb_idl_txn_commit_block(struct ovsdb_idl_txn *txn) fatal_signal_run(); while ((status = ovsdb_idl_txn_commit(txn)) == TXN_INCOMPLETE) { - ovsdb_idl_run(txn->idl); - ovsdb_idl_wait(txn->idl); + ovsdb_idl_run(txn->db->idl); + ovsdb_idl_wait(txn->db->idl); ovsdb_idl_txn_wait(txn); poll_block(); } @@ -3802,7 +3902,7 @@ ovsdb_idl_txn_complete(struct ovsdb_idl_txn *txn, enum ovsdb_idl_txn_status status) { txn->status = status; - hmap_remove(&txn->idl->outstanding_txns, &txn->hmap_node); + hmap_remove(&txn->db->outstanding_txns, &txn->hmap_node); } static void @@ -3828,7 +3928,7 @@ ovsdb_idl_txn_write__(const struct ovsdb_idl_row *row_, ovs_assert(row->old_datum == NULL || row->table->modes[column_idx] & OVSDB_IDL_MONITOR); - if (row->table->idl->verify_write_only && !write_only) { + if (row->table->db->verify_write_only && !write_only) { VLOG_ERR("Bug: Attempt to write to a read/write column (%s:%s) when" " explicitly configured not to.", class->name, column->name); goto discard_datum; @@ -3851,7 +3951,7 @@ ovsdb_idl_txn_write__(const struct ovsdb_idl_row *row_, } if (hmap_node_is_null(&row->txn_node)) { - hmap_insert(&row->table->idl->txn->txn_rows, &row->txn_node, + hmap_insert(&row->table->db->txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); } if (row->old_datum == row->new_datum) { @@ -3973,7 +4073,7 @@ ovsdb_idl_txn_verify(const struct ovsdb_idl_row *row_, } if (hmap_node_is_null(&row->txn_node)) { - hmap_insert(&row->table->idl->txn->txn_rows, &row->txn_node, + 
hmap_insert(&row->table->db->txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); } if (!row->prereqs) { @@ -4004,12 +4104,12 @@ ovsdb_idl_txn_delete(const struct ovsdb_idl_row *row_) ovsdb_idl_row_clear_new(row); ovs_assert(!row->prereqs); hmap_remove(&row->table->rows, &row->hmap_node); - hmap_remove(&row->table->idl->txn->txn_rows, &row->txn_node); + hmap_remove(&row->table->db->txn->txn_rows, &row->txn_node); free(row); return; } if (hmap_node_is_null(&row->txn_node)) { - hmap_insert(&row->table->idl->txn->txn_rows, &row->txn_node, + hmap_insert(&row->table->db->txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); } ovsdb_idl_row_clear_new(row); @@ -4042,7 +4142,7 @@ ovsdb_idl_txn_insert(struct ovsdb_idl_txn *txn, uuid_generate(&row->uuid); } - row->table = ovsdb_idl_table_from_class(txn->idl, class); + row->table = ovsdb_idl_db_table_from_class(txn->db, class); row->new_datum = xmalloc(class->n_columns * sizeof *row->new_datum); hmap_insert(&row->table->rows, &row->hmap_node, uuid_hash(&row->uuid)); hmap_insert(&txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); @@ -4054,18 +4154,18 @@ ovsdb_idl_txn_abort_all(struct ovsdb_idl *idl) { struct ovsdb_idl_txn *txn; - HMAP_FOR_EACH (txn, hmap_node, &idl->outstanding_txns) { + HMAP_FOR_EACH (txn, hmap_node, &idl->db.outstanding_txns) { ovsdb_idl_txn_complete(txn, TXN_TRY_AGAIN); } } static struct ovsdb_idl_txn * -ovsdb_idl_txn_find(struct ovsdb_idl *idl, const struct json *id) +ovsdb_idl_db_txn_find(struct ovsdb_idl_db *db, const struct json *id) { struct ovsdb_idl_txn *txn; HMAP_FOR_EACH_WITH_HASH (txn, hmap_node, - json_hash(id, 0), &idl->outstanding_txns) { + json_hash(id, 0), &db->outstanding_txns) { if (json_equal(id, txn->request_id)) { return txn; } @@ -4104,7 +4204,7 @@ ovsdb_idl_txn_process_inc_reply(struct ovsdb_idl_txn *txn, } /* We know that this is a JSON object because the loop in - * ovsdb_idl_txn_process_reply() checked. */ + * ovsdb_idl_db_txn_process_reply() checked. 
*/ mutate = json_object(results->elems[txn->inc_index]); count = shash_find_data(mutate, "count"); if (!check_json_type(count, JSON_INTEGER, "\"mutate\" reply \"count\"")) { @@ -4181,13 +4281,13 @@ ovsdb_idl_txn_process_insert_reply(struct ovsdb_idl_txn_insert *insert, } static bool -ovsdb_idl_txn_process_reply(struct ovsdb_idl *idl, - const struct jsonrpc_msg *msg) +ovsdb_idl_db_txn_process_reply(struct ovsdb_idl_db *db, + const struct jsonrpc_msg *msg) { struct ovsdb_idl_txn *txn; enum ovsdb_idl_txn_status status; - txn = ovsdb_idl_txn_find(idl, msg->id); + txn = ovsdb_idl_db_txn_find(db, msg->id); if (!txn) { return false; } @@ -4274,7 +4374,7 @@ ovsdb_idl_txn_process_reply(struct ovsdb_idl *idl, struct ovsdb_idl_txn * ovsdb_idl_txn_get(const struct ovsdb_idl_row *row) { - struct ovsdb_idl_txn *txn = row->table->idl->txn; + struct ovsdb_idl_txn *txn = row->table->db->txn; ovs_assert(txn != NULL); return txn; } @@ -4283,7 +4383,7 @@ ovsdb_idl_txn_get(const struct ovsdb_idl_row *row) struct ovsdb_idl * ovsdb_idl_txn_get_idl (struct ovsdb_idl_txn *txn) { - return txn->idl; + return txn->db->idl; } /* Blocks until 'idl' successfully connects to the remote database and @@ -4301,6 +4401,31 @@ ovsdb_idl_get_initial_snapshot(struct ovsdb_idl *idl) } } +static struct jsonrpc_msg * +ovsdb_idl_db_set_lock(struct ovsdb_idl_db *db, const char *lock_name) +{ + ovs_assert(!db->txn); + ovs_assert(hmap_is_empty(&db->outstanding_txns)); + + if (db->lock_name + && (!lock_name || strcmp(lock_name, db->lock_name))) { + /* Release previous lock. */ + struct jsonrpc_msg *msg = ovsdb_idl_db_compose_unlock_request(db); + free(db->lock_name); + db->lock_name = NULL; + db->is_lock_contended = false; + return msg; + } + + if (lock_name && !db->lock_name) { + /* Acquire new lock. 
*/ + db->lock_name = xstrdup(lock_name); + return ovsdb_idl_db_compose_lock_request(db); + } + + return NULL; +} + /* If 'lock_name' is nonnull, configures 'idl' to obtain the named lock from * the database server and to avoid modifying the database when the lock cannot * be acquired (that is, when another client has the same lock). @@ -4310,21 +4435,12 @@ ovsdb_idl_get_initial_snapshot(struct ovsdb_idl *idl) void ovsdb_idl_set_lock(struct ovsdb_idl *idl, const char *lock_name) { - ovs_assert(!idl->txn); - ovs_assert(hmap_is_empty(&idl->outstanding_txns)); - - if (idl->lock_name && (!lock_name || strcmp(lock_name, idl->lock_name))) { - /* Release previous lock. */ - ovsdb_idl_send_unlock_request(idl); - free(idl->lock_name); - idl->lock_name = NULL; - idl->is_lock_contended = false; - } - - if (lock_name && !idl->lock_name) { - /* Acquire new lock. */ - idl->lock_name = xstrdup(lock_name); - ovsdb_idl_send_lock_request(idl); + for (;;) { + struct jsonrpc_msg *msg = ovsdb_idl_db_set_lock(&idl->db, lock_name); + if (!msg) { + break; + } + jsonrpc_session_send(idl->session, msg); } } @@ -4337,7 +4453,7 @@ ovsdb_idl_set_lock(struct ovsdb_idl *idl, const char *lock_name) bool ovsdb_idl_has_lock(const struct ovsdb_idl *idl) { - return idl->has_lock; + return idl->db.has_lock; } /* Returns true if 'idl' is configured to obtain a lock but the database server @@ -4345,63 +4461,87 @@ ovsdb_idl_has_lock(const struct ovsdb_idl *idl) bool ovsdb_idl_is_lock_contended(const struct ovsdb_idl *idl) { - return idl->is_lock_contended; + return idl->db.is_lock_contended; } static void -ovsdb_idl_update_has_lock(struct ovsdb_idl *idl, bool new_has_lock) +ovsdb_idl_db_update_has_lock(struct ovsdb_idl_db *db, bool new_has_lock) { - if (new_has_lock && !idl->has_lock) { - if (idl->state == IDL_S_MONITORING || - idl->state == IDL_S_MONITORING_COND) { - idl->change_seqno++; + if (new_has_lock && !db->has_lock) { + if (db->idl->state == IDL_S_MONITORING || + db->idl->state == 
IDL_S_MONITORING_COND) { + db->change_seqno++; } else { /* We're setting up a session, so don't signal that the database * changed. Finalizing the session will increment change_seqno * anyhow. */ } - idl->is_lock_contended = false; + db->is_lock_contended = false; } - idl->has_lock = new_has_lock; + db->has_lock = new_has_lock; } -static void -ovsdb_idl_send_lock_request__(struct ovsdb_idl *idl, const char *method, - struct json **idp) -{ - ovsdb_idl_update_has_lock(idl, false); +static bool +ovsdb_idl_db_process_lock_replies(struct ovsdb_idl_db *db, + const struct jsonrpc_msg *msg) +{ + if (msg->type == JSONRPC_REPLY + && db->lock_request_id + && json_equal(db->lock_request_id, msg->id)) { + /* Reply to our "lock" request. */ + ovsdb_idl_db_parse_lock_reply(db, msg->result); + return true; + } - json_destroy(idl->lock_request_id); - idl->lock_request_id = NULL; + if (msg->type == JSONRPC_NOTIFY) { + if (!strcmp(msg->method, "locked")) { + /* We got our lock. */ + return ovsdb_idl_db_parse_lock_notify(db, msg->params, true); + } else if (!strcmp(msg->method, "stolen")) { + /* Someone else stole our lock. 
*/ + return ovsdb_idl_db_parse_lock_notify(db, msg->params, false); + } + } - if (jsonrpc_session_is_connected(idl->session)) { - struct json *params; + return false; +} - params = json_array_create_1(json_string_create(idl->lock_name)); - jsonrpc_session_send(idl->session, - jsonrpc_create_request(method, params, idp)); - } +static struct jsonrpc_msg * +ovsdb_idl_db_compose_lock_request__(struct ovsdb_idl_db *db, + const char *method) +{ + ovsdb_idl_db_update_has_lock(db, false); + + json_destroy(db->lock_request_id); + db->lock_request_id = NULL; + + struct json *params = json_array_create_1(json_string_create( + db->lock_name)); + return jsonrpc_create_request(method, params, NULL); } -static void -ovsdb_idl_send_lock_request(struct ovsdb_idl *idl) +static struct jsonrpc_msg * +ovsdb_idl_db_compose_lock_request(struct ovsdb_idl_db *db) { - ovsdb_idl_send_lock_request__(idl, "lock", &idl->lock_request_id); + struct jsonrpc_msg *msg = ovsdb_idl_db_compose_lock_request__(db, "lock"); + db->lock_request_id = json_clone(msg->id); + return msg; } -static void -ovsdb_idl_send_unlock_request(struct ovsdb_idl *idl) +static struct jsonrpc_msg * +ovsdb_idl_db_compose_unlock_request(struct ovsdb_idl_db *db) { - ovsdb_idl_send_lock_request__(idl, "unlock", NULL); + return ovsdb_idl_db_compose_lock_request__(db, "unlock"); } static void -ovsdb_idl_parse_lock_reply(struct ovsdb_idl *idl, const struct json *result) +ovsdb_idl_db_parse_lock_reply(struct ovsdb_idl_db *db, + const struct json *result) { bool got_lock; - json_destroy(idl->lock_request_id); - idl->lock_request_id = NULL; + json_destroy(db->lock_request_id); + db->lock_request_id = NULL; if (result->type == JSON_OBJECT) { const struct json *locked; @@ -4412,30 +4552,32 @@ ovsdb_idl_parse_lock_reply(struct ovsdb_idl *idl, const struct json *result) got_lock = false; } - ovsdb_idl_update_has_lock(idl, got_lock); + ovsdb_idl_db_update_has_lock(db, got_lock); if (!got_lock) { - idl->is_lock_contended = true; + 
db->is_lock_contended = true; } } -static void -ovsdb_idl_parse_lock_notify(struct ovsdb_idl *idl, - const struct json *params, - bool new_has_lock) +static bool +ovsdb_idl_db_parse_lock_notify(struct ovsdb_idl_db *db, + const struct json *params, + bool new_has_lock) { - if (idl->lock_name + if (db->lock_name && params->type == JSON_ARRAY && json_array(params)->n > 0 && json_array(params)->elems[0]->type == JSON_STRING) { const char *lock_name = json_string(json_array(params)->elems[0]); - if (!strcmp(idl->lock_name, lock_name)) { - ovsdb_idl_update_has_lock(idl, new_has_lock); + if (!strcmp(db->lock_name, lock_name)) { + ovsdb_idl_db_update_has_lock(db, new_has_lock); if (!new_has_lock) { - idl->is_lock_contended = true; + db->is_lock_contended = true; } + return true; } } + return false; } /* Inserts a new Map Operation into current transaction. */ @@ -4469,7 +4611,7 @@ ovsdb_idl_txn_add_map_op(struct ovsdb_idl_row *row, /* Add this row to transaction's list of rows. */ if (hmap_node_is_null(&row->txn_node)) { - hmap_insert(&row->table->idl->txn->txn_rows, &row->txn_node, + hmap_insert(&row->table->db->txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); } } @@ -4505,7 +4647,7 @@ ovsdb_idl_txn_add_set_op(struct ovsdb_idl_row *row, /* Add this row to the transactions's list of rows. 
*/ if (hmap_node_is_null(&row->txn_node)) { - hmap_insert(&row->table->idl->txn->txn_rows, &row->txn_node, + hmap_insert(&row->table->db->txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); } } diff --git a/lib/ovsdb-idl.h b/lib/ovsdb-idl.h index 67d48cf0c16b..975f9402b3b4 100644 --- a/lib/ovsdb-idl.h +++ b/lib/ovsdb-idl.h @@ -166,7 +166,7 @@ const struct ovsdb_idl_row *ovsdb_idl_track_get_first( const struct ovsdb_idl_row *ovsdb_idl_track_get_next(const struct ovsdb_idl_row *); bool ovsdb_idl_track_is_updated(const struct ovsdb_idl_row *row, const struct ovsdb_idl_column *column); -void ovsdb_idl_track_clear(const struct ovsdb_idl *); +void ovsdb_idl_track_clear(struct ovsdb_idl *); /* Reading the database replica. */ From patchwork Mon Jan 1 05:16:33 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854290 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z959n1h5Hz9t84 for ; Mon, 1 Jan 2018 16:20:13 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 9D6BACBA; Mon, 1 Jan 2018 05:17:01 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 9EAC2CB6 for ; Mon, 1 Jan 2018 05:17:00 +0000 (UTC) 
X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 2556E14B for ; Mon, 1 Jan 2018 05:17:00 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id EEFF2C5A4F; Mon, 1 Jan 2018 06:16:57 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:33 -0800 Message-Id: <20180101051640.13043-8-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 08/15] ovn-sbctl: Allow retries by default. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Most of the OVS database-manipulation utilities (ovn-sbctl, ovn-nbctl, ovs-vsctl, vtep-ctl) don't retry their connections by default because they assume that the database is either up or down and likely to stay that way. The OVN southbound database, however, is a likely candidate for high availability clustering, so that even if it appears to be down for a moment it will be available again soon. So, prepare for the clustering implementation by enabling retry by default in ovn-sbctl. 
Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- ovn/utilities/ovn-sbctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ovn/utilities/ovn-sbctl.c b/ovn/utilities/ovn-sbctl.c index 1c2e6843f23c..f16cefedd897 100644 --- a/ovn/utilities/ovn-sbctl.c +++ b/ovn/utilities/ovn-sbctl.c @@ -120,7 +120,7 @@ main(int argc, char *argv[]) } /* Initialize IDL. */ - idl = the_idl = ovsdb_idl_create(db, &sbrec_idl_class, false, false); + idl = the_idl = ovsdb_idl_create(db, &sbrec_idl_class, false, true); run_prerequisites(commands, n_commands, idl); /* Execute the commands. From patchwork Mon Jan 1 05:16:34 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854292 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95C04D9Mz9t84 for ; Mon, 1 Jan 2018 16:21:16 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id C41D8CD0; Mon, 1 Jan 2018 05:17:05 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 12A23CCF for ; Mon, 1 Jan 2018 05:17:05 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by 
smtp1.linuxfoundation.org (Postfix) with ESMTPS id B9A2D4EC for ; Mon, 1 Jan 2018 05:17:01 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 8C492C5A51; Mon, 1 Jan 2018 06:16:59 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:34 -0800 Message-Id: <20180101051640.13043-9-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.1 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW, URI_NOVOWEL autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 09/15] ovsdb-server: Add support for a built-in _Server database. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org The _Server database is valuable primarily because it provides database clients a way to find out the details of changes to databases, schemas, etc. in a granular, natural way. Until now, the only way that the server could notify clients about these kinds of changes was to close the session; when the client reconnects, it is expected to reassess the server's state. One way to provide this kind of granular information would be to add specific JSON-RPC requests to obtain notifications for different kinds of changes, but since ovsdb-server already provides granular and flexible notification support for databases, using a database for the purpose is convenient and avoids duplicating functionality. 
Initially this database only reports databases' names and schemas, but when clustering support is added in a later commit it will also report important aspects of clustering and cluster status. Thus, this database also reduces the need to add JSON-RPC calls to retrieve information about new features. Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- Makefile.am | 8 --- NEWS | 2 + build-aux/automake.mk | 12 ++++- build-aux/text2c | 16 ++++++ ovsdb/.gitignore | 3 ++ ovsdb/_server.ovsschema | 9 ++++ ovsdb/_server.xml | 31 ++++++++++++ ovsdb/automake.mk | 26 ++++++++++ ovsdb/ovsdb-server.1.in | 6 +++ ovsdb/ovsdb-server.c | 131 +++++++++++++++++++++++++++++++++++++++++++++--- ovsdb/ovsdb-util.c | 93 +++++++++++++++++++++++++++++++--- ovsdb/ovsdb-util.h | 9 ++++ tests/ovsdb-server.at | 70 +++++++++++++------------- 13 files changed, 359 insertions(+), 57 deletions(-) create mode 100755 build-aux/text2c create mode 100644 ovsdb/_server.ovsschema create mode 100644 ovsdb/_server.xml diff --git a/Makefile.am b/Makefile.am index d7cfdcd52a98..8eef4e3f5640 100644 --- a/Makefile.am +++ b/Makefile.am @@ -82,14 +82,6 @@ EXTRA_DIST = \ .travis/osx-prepare.sh \ appveyor.yml \ boot.sh \ - build-aux/cccl \ - build-aux/cksum-schema-check \ - build-aux/calculate-schema-cksum \ - build-aux/dist-docs \ - build-aux/dpdkstrip.py \ - build-aux/sodepends.py \ - build-aux/soexpand.py \ - build-aux/xml2nroff \ $(MAN_FRAGMENTS) \ $(MAN_ROOTS) \ Vagrantfile \ diff --git a/NEWS b/NEWS index af98c2f80f5a..b697e4968072 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,8 @@ Post-v2.8.0 * New high-level documentation in ovsdb(7). * New file format documentation for developers in ovsdb(5). * Protocol documentation moved from ovsdb-server(1) to ovsdb-server(7). + * ovsdb-server now always hosts a built-in database named _Server. See + ovsdb-server(5) for more details. * ovsdb-client: New "get-schema-cksum" and "query" commands. * ovsdb-client: New "backup" and "restore" commands. 
* ovsdb-tool: New "db-name" and "schema-name" commands. diff --git a/build-aux/automake.mk b/build-aux/automake.mk index 6baafab0e867..a1f2f856f939 100644 --- a/build-aux/automake.mk +++ b/build-aux/automake.mk @@ -1,4 +1,14 @@ -# This file is purely used for checking the style of the python build tools. +EXTRA_DIST += \ + build-aux/calculate-schema-cksum \ + build-aux/cccl \ + build-aux/cksum-schema-check \ + build-aux/dist-docs \ + build-aux/dpdkstrip.py \ + build-aux/sodepends.py \ + build-aux/soexpand.py \ + build-aux/text2c \ + build-aux/xml2nroff + FLAKE8_PYFILES += \ $(srcdir)/build-aux/xml2nroff \ build-aux/dpdkstrip.py \ diff --git a/build-aux/text2c b/build-aux/text2c new file mode 100755 index 000000000000..cb1f256f1775 --- /dev/null +++ b/build-aux/text2c @@ -0,0 +1,16 @@ +#! /usr/bin/python + +import re +import sys + +"""This utility reads its input, which should be plain text, and +prints it back transformed into quoted strings that may be #included +into C source code.""" + +while True: + line = sys.stdin.readline() + if not line: + break + + s = line.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n") + print('"' + s + '"') diff --git a/ovsdb/.gitignore b/ovsdb/.gitignore index d715dee925b9..fbcefafc6e97 100644 --- a/ovsdb/.gitignore +++ b/ovsdb/.gitignore @@ -1,3 +1,5 @@ +/_server.ovsschema.inc +/_server.ovsschema.stamp /ovsdb-client /ovsdb-client.1 /ovsdb-doc @@ -5,6 +7,7 @@ /ovsdb-idlc /ovsdb-server /ovsdb-server.1 +/ovsdb-server.5 /ovsdb-tool /ovsdb-tool.1 /libovsdb.pc diff --git a/ovsdb/_server.ovsschema b/ovsdb/_server.ovsschema new file mode 100644 index 000000000000..8997bae5fa36 --- /dev/null +++ b/ovsdb/_server.ovsschema @@ -0,0 +1,9 @@ +{"name": "_Server", + "version": "1.0.0", + "cksum": "3931859656 185", + "tables": { + "Database": { + "columns": { + "name": {"type": "string"}, + "schema": {"type": "string"}}, + "isRoot": true}}} diff --git a/ovsdb/_server.xml b/ovsdb/_server.xml new file mode 100644 index 
000000000000..a55beb9bd6de --- /dev/null +++ b/ovsdb/_server.xml @@ -0,0 +1,31 @@ + + +

+ Every ovsdb-server (version 2.9 or later) always hosts an + instance of this schema, which holds information on the status and + configuration of the server itself. This database is read-only. This + manpage describes the schema for this database. +

+ + +

+ This table describes the databases hosted by the database server, with + one row per database. As its database configuration and status change, + the server automatically and immediately updates the table to match. +

+

+ Clients can use the _uuid column in this table as a + generation number. The server generates a fresh _uuid every + time it adds a database, so that removing and then re-adding a database + to the server causes its row _uuid to change. +

+ + + The database's name, as specified in its schema. + + + + The database schema, as a JSON string. + +
+
diff --git a/ovsdb/automake.mk b/ovsdb/automake.mk index d040fc6de886..c90e2e5b77f9 100644 --- a/ovsdb/automake.mk +++ b/ovsdb/automake.mk @@ -109,3 +109,29 @@ EXTRA_DIST += ovsdb/ovsdb-dot.in ovsdb/dot2pic noinst_SCRIPTS += ovsdb/ovsdb-dot CLEANFILES += ovsdb/ovsdb-dot OVSDB_DOT = $(run_python) $(srcdir)/ovsdb/ovsdb-dot.in + +EXTRA_DIST += ovsdb/_server.ovsschema +CLEANFILES += ovsdb/_server.ovsschema.inc +ovsdb/ovsdb-server.o: ovsdb/_server.ovsschema.inc +ovsdb/_server.ovsschema.inc: ovsdb/_server.ovsschema $(srcdir)/build-aux/text2c + $(AM_V_GEN)$(run_python) $(srcdir)/build-aux/text2c < $< > $@.tmp + $(AM_V_at)mv $@.tmp $@ + +# Version checking for _server.ovsschema. +ALL_LOCAL += ovsdb/_server.ovsschema.stamp +ovsdb/_server.ovsschema.stamp: ovsdb/_server.ovsschema + $(srcdir)/build-aux/cksum-schema-check $? $@ +CLEANFILES += ovsdb/_server.ovsschema.stamp + +# _Server schema documentation +EXTRA_DIST += ovsdb/_server.xml +CLEANFILES += ovsdb/ovsdb-server.5 +man_MANS += ovsdb/ovsdb-server.5 +ovsdb/ovsdb-server.5: \ + ovsdb/ovsdb-doc ovsdb/_server.xml ovsdb/_server.ovsschema \ + $(VSWITCH_PIC) + $(AM_V_GEN)$(OVSDB_DOC) \ + --version=$(VERSION) \ + $(srcdir)/ovsdb/_server.ovsschema \ + $(srcdir)/ovsdb/_server.xml > $@.tmp && \ + mv $@.tmp $@ diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in index 15ff77fd28aa..dfca40d4ef79 100644 --- a/ovsdb/ovsdb-server.1.in +++ b/ovsdb/ovsdb-server.1.in @@ -45,6 +45,12 @@ example, \fBovsdb\-tool create\fR. This OVSDB implementation supports standalone and active-backup databases, as well as database replication. See the Service Models section of \fBovsdb\fR(7) for more information. +.PP +In addition to user-specified databases, \fBovsdb\-server\fR version +2.9 and later also always hosts a built-in database named +\fB_Server\fR. Please see \fBovsdb\-server\fR(5) for documentation on +this database's schema. +. .SH OPTIONS . 
.IP "\fB\-\-remote=\fIremote\fR" diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 1efb5552da5a..dd0cdfe6a38b 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -65,6 +65,7 @@ struct db { char *filename; struct ovsdb_file *file; struct ovsdb *db; + struct uuid row_uuid; }; /* SSL configuration. */ @@ -107,6 +108,7 @@ static unixctl_cb_func ovsdb_server_remove_database; static unixctl_cb_func ovsdb_server_list_databases; static char *open_db(struct server_config *config, const char *filename); +static void add_server_db(struct server_config *); static void close_db(struct db *db); static void parse_options(int *argc, char **argvp[], @@ -124,6 +126,7 @@ static void report_error_if_changed(char *error, char **last_errorp); static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, const struct sset *remotes, struct shash *all_dbs); +static void update_server_status(struct shash *all_dbs); static void save_config__(FILE *config_file, const struct sset *remotes, const struct sset *db_filenames, @@ -214,6 +217,8 @@ main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, update_remote_status(jsonrpc, remotes, all_dbs); } + update_server_status(all_dbs); + memory_wait(); if (*is_backup) { replication_wait(); @@ -328,6 +333,7 @@ main(int argc, char *argv[]) ovs_fatal(0, "%s", error); } } + add_server_db(&server_config); error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes); if (!error) { @@ -490,6 +496,16 @@ close_db(struct db *db) free(db); } +static void +add_db(struct server_config *config, const char *name, struct db *db) +{ + db->row_uuid = UUID_ZERO; + shash_add_assert(config->all_dbs, name, db); + bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc, + db->db); + ovs_assert(ok); +} + static char * open_db(struct server_config *config, const char *filename) { @@ -524,6 +540,27 @@ open_db(struct server_config *config, const char *filename) return error; } +/* Add the internal _Server database to 
the server configuration. */ +static void +add_server_db(struct server_config *config) +{ + struct json *schema_json = json_from_string( +#include "ovsdb/_server.ovsschema.inc" + ); + ovs_assert(schema_json->type == JSON_OBJECT); + + struct ovsdb_schema *schema; + struct ovsdb_error *error OVS_UNUSED = ovsdb_schema_from_json(schema_json, + &schema); + ovs_assert(!error); + json_destroy(schema_json); + + struct db *db = xzalloc(sizeof *db); + db->filename = xstrdup(""); + db->db = ovsdb_create(schema); + add_db(config, db->db->schema->name, db); +} + static char * OVS_WARN_UNUSED_RESULT parse_db_column__(const struct shash *all_dbs, const char *name_, char *name, @@ -888,6 +925,18 @@ update_remote_rows(const struct shash *all_dbs, const struct db *db_, } static void +commit_txn(struct ovsdb_txn *txn, const char *name) +{ + struct ovsdb_error *error = ovsdb_txn_commit(txn, false); + if (error) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + char *msg = ovsdb_error_to_string_free(error); + VLOG_ERR_RL(&rl, "Failed to update %s: %s", name, msg); + free(msg); + } +} + +static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, const struct sset *remotes, struct shash *all_dbs) @@ -903,14 +952,84 @@ update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, update_remote_rows(all_dbs, db, remote, jsonrpc, txn); } - struct ovsdb_error *error = ovsdb_txn_commit(txn, false); - if (error) { - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); - char *msg = ovsdb_error_to_string_free(error); - VLOG_ERR_RL(&rl, "Failed to update remote status: %s", msg); - free(msg); + commit_txn(txn, node->name); + } +} + +/* Updates 'row', a row in the _Server database's Database table, to match + * 'db'. 
*/ +static void +update_database_status(struct ovsdb_row *row, struct db *db) +{ + ovsdb_util_write_string_column(row, "name", db->db->schema->name); + + const struct uuid *row_uuid = ovsdb_row_get_uuid(row); + if (!uuid_equals(row_uuid, &db->row_uuid)) { + db->row_uuid = *row_uuid; + + /* The schema can only change if the row UUID changes, so only update + * it in that case. Presumably, this is worth optimizing because + * schemas are often kilobytes in size and nontrivial to serialize. */ + struct json *json_schema = ovsdb_schema_to_json(db->db->schema); + char *schema = json_to_string(json_schema, JSSF_SORT); + ovsdb_util_write_string_column(row, "schema", schema); + free(schema); + json_destroy(json_schema); + } +} + +/* Updates the Database table in the _Server database. */ +static void +update_server_status(struct shash *all_dbs) +{ + struct db *server_db = shash_find_data(all_dbs, "_Server"); + struct ovsdb_table *database_table = shash_find_data( + &server_db->db->tables, "Database"); + struct ovsdb_txn *txn = ovsdb_txn_create(server_db->db); + + /* Update rows for databases that still exist. + * Delete rows for databases that no longer exist. */ + const struct ovsdb_row *row, *next_row; + HMAP_FOR_EACH_SAFE (row, next_row, hmap_node, &database_table->rows) { + const char *name; + ovsdb_util_read_string_column(row, "name", &name); + struct db *db = shash_find_data(all_dbs, name); + if (!db || !db->db) { + ovsdb_txn_row_delete(txn, row); + } else { + update_database_status(ovsdb_txn_row_modify(txn, row), db); } } + + /* Add rows for new databases. + * + * This is O(n**2) but usually there are only 2 or 3 databases. */ + struct shash_node *node; + SHASH_FOR_EACH (node, all_dbs) { + struct db *db = node->data; + + if (!db->db) { + continue; + } + + HMAP_FOR_EACH (row, hmap_node, &database_table->rows) { + const char *name; + ovsdb_util_read_string_column(row, "name", &name); + if (!strcmp(name, node->name)) { + goto next; + } + } + + /* Add row. 
*/ + struct ovsdb_row *row = ovsdb_row_create(database_table); + uuid_generate(ovsdb_row_get_uuid_rw(row)); + update_database_status(row, db); + ovsdb_txn_row_insert(txn, row); + + next:; + } + + commit_txn(txn, "_Server"); } /* Reconfigures ovsdb-server's remotes based on information in the database. */ diff --git a/ovsdb/ovsdb-util.c b/ovsdb/ovsdb-util.c index 5ee5e4ddaf8d..06d25af49a18 100644 --- a/ovsdb/ovsdb-util.c +++ b/ovsdb/ovsdb-util.c @@ -22,6 +22,38 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_util); +static void +ovsdb_util_clear_column(struct ovsdb_row *row, const char *column_name) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + const struct ovsdb_table_schema *schema = row->table->schema; + const struct ovsdb_column *column; + + column = ovsdb_table_schema_get_column(schema, column_name); + if (!column) { + VLOG_DBG_RL(&rl, "Table `%s' has no `%s' column", + schema->name, column_name); + return; + } + + if (column->type.n_min) { + if (!VLOG_DROP_DBG(&rl)) { + char *type_name = ovsdb_type_to_english(&column->type); + VLOG_DBG("Table `%s' column `%s' has type %s, which requires " + "a value, when it was expected to be optional", + schema->name, column_name, type_name); + free(type_name); + } + return; + } + + struct ovsdb_datum *datum = &row->fields[column->index]; + if (datum->n) { + ovsdb_datum_destroy(datum, &column->type); + ovsdb_datum_init_empty(datum); + } +} + struct ovsdb_datum * ovsdb_util_get_datum(struct ovsdb_row *row, const char *column_name, const enum ovsdb_atomic_type key_type, @@ -164,29 +196,74 @@ ovsdb_util_read_bool_column(const struct ovsdb_row *row, return atom != NULL; } -void -ovsdb_util_write_bool_column(struct ovsdb_row *row, const char *column_name, - bool value) +bool +ovsdb_util_read_uuid_column(const struct ovsdb_row *row, + const char *column_name, struct uuid *uuid) +{ + const union ovsdb_atom *atom; + + atom = ovsdb_util_read_column(row, column_name, OVSDB_TYPE_UUID); + *uuid = atom ? 
atom->uuid : UUID_ZERO; + return atom != NULL; +} + +static void +ovsdb_util_write_singleton(struct ovsdb_row *row, const char *column_name, + const union ovsdb_atom *atom, + enum ovsdb_atomic_type type) { const struct ovsdb_column *column; struct ovsdb_datum *datum; column = ovsdb_table_schema_get_column(row->table->schema, column_name); - datum = ovsdb_util_get_datum(row, column_name, OVSDB_TYPE_BOOLEAN, - OVSDB_TYPE_VOID, 1); + datum = ovsdb_util_get_datum(row, column_name, type, OVSDB_TYPE_VOID, 1); if (!datum) { return; } - if (datum->n != 1) { + if (datum->n == 1) { + if (ovsdb_atom_equals(&datum->keys[0], atom, type)) { + return; + } + } else { ovsdb_datum_destroy(datum, &column->type); - datum->n = 1; datum->keys = xmalloc(sizeof *datum->keys); datum->values = NULL; } + ovsdb_atom_clone(&datum->keys[0], atom, type); +} - datum->keys[0].boolean = value; +void +ovsdb_util_write_bool_column(struct ovsdb_row *row, const char *column_name, + bool value) +{ + const union ovsdb_atom atom = { .boolean = value }; + ovsdb_util_write_singleton(row, column_name, &atom, OVSDB_TYPE_BOOLEAN); +} + +void +ovsdb_util_write_uuid_column(struct ovsdb_row *row, const char *column_name, + const struct uuid *uuid) +{ + if (uuid) { + const union ovsdb_atom atom = { .uuid = *uuid }; + ovsdb_util_write_singleton(row, column_name, &atom, OVSDB_TYPE_UUID); + } else { + ovsdb_util_clear_column(row, column_name); + } +} + +void +ovsdb_util_write_string_column(struct ovsdb_row *row, const char *column_name, + const char *string) +{ + if (string) { + const union ovsdb_atom atom = { .string = CONST_CAST(char *, string) }; + ovsdb_util_write_singleton(row, column_name, &atom, OVSDB_TYPE_STRING); + } else { + ovsdb_util_clear_column(row, column_name); + } } void diff --git a/ovsdb/ovsdb-util.h b/ovsdb/ovsdb-util.h index abd81ff38cd2..a0404a3a7ff0 100644 --- a/ovsdb/ovsdb-util.h +++ b/ovsdb/ovsdb-util.h @@ -38,6 +38,9 @@ bool ovsdb_util_read_integer_column(const struct ovsdb_row *row, bool 
ovsdb_util_read_string_column(const struct ovsdb_row *row, const char *column_name, const char **stringp); +void ovsdb_util_write_string_column(struct ovsdb_row *row, + const char *column_name, + const char *string); void ovsdb_util_write_string_string_column(struct ovsdb_row *row, const char *column_name, char **keys, char **values, @@ -48,5 +51,11 @@ bool ovsdb_util_read_bool_column(const struct ovsdb_row *row, void ovsdb_util_write_bool_column(struct ovsdb_row *row, const char *column_name, bool value); +bool ovsdb_util_read_uuid_column(const struct ovsdb_row *row, + const char *column_name, + struct uuid *); +void ovsdb_util_write_uuid_column(struct ovsdb_row *row, + const char *column_name, + const struct uuid *); #endif /* ovsdb/util.h */ diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 968356781604..07ceda92496d 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -137,20 +137,31 @@ AT_CHECK([uuidfilt output], [0], [test ! -e pid || kill `cat pid`]) AT_CLEANUP +dnl CHECK_DBS([databases]) +dnl +dnl Checks that ovsdb-server hosts the given 'databases', each of which +dnl needs to be followed by a newline. 
+m4_define([CHECK_DBS], + [AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], + [0], [_Server +$1]) +AT_CHECK([ovsdb-client --no-headings dump _Server Database name | sort], [0], [dnl +Database table +_Server +$1])]) + AT_SETUP([database multiplexing implementation]) AT_KEYWORDS([ovsdb server positive]) ordinal_schema > schema1 constraint_schema > schema2 AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db1 db2], [0], [ignore], [ignore]) -AT_CHECK( - [[ovsdb-client list-dbs unix:socket]], - [0], [constraints +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) +CHECK_DBS([constraints ordinals -], [ignore], [test ! -e pid || kill `cat pid`]) +]) AT_CHECK( - [[ovstest test-jsonrpc request unix:socket get_schema [\"nonexistent\"]]], [0], + [[ovstest test-jsonrpc request unix:db.sock get_schema [\"nonexistent\"]]], [0], [[{"error":{"details":"get_schema request specifies unknown database nonexistent","error":"unknown database","syntax":"[\"nonexistent\"]"},"id":0,"result":null} ]], [], [test ! -e pid || kill `cat pid`]) OVSDB_SERVER_SHUTDOWN @@ -165,21 +176,19 @@ AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) # Start ovsdb-server with just a single database - db1. -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db1], [0]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [ordinals +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0]) +CHECK_DBS([ordinals ]) # Add the second database. 
AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db2], [0]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [constraints +CHECK_DBS([constraints ordinals ]) # The databases are responsive. -AT_CHECK([ovsdb-client list-tables unix:socket constraints], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-client list-tables unix:socket ordinals], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-client list-tables unix:db.sock ordinals], [0], [ignore], [ignore]) # Add an already added database. if test $IS_WIN32 = "yes"; then @@ -205,25 +214,23 @@ ovs-appctl: ovsdb-server: server returned an error AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-remote db:ordinals,ordinals,name], [0]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], [0], [db:ordinals,ordinals,name -punix:socket +punix:db.sock ]) # Removing db1 has no effect on its remote. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db ordinals], [0]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [constraints +CHECK_DBS([constraints ]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], [0], [db:ordinals,ordinals,name -punix:socket +punix:db.sock ]) -AT_CHECK([ovsdb-client list-tables unix:socket ordinals], [1], [ignore], [ignore]) +AT_CHECK([ovsdb-client list-tables unix:db.sock ordinals], [1], [ignore], [ignore]) # Remove db2. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db constraints], [0]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], []) -AT_CHECK([ovsdb-client list-tables unix:socket constraints], [1], [ignore], [ignore]) +CHECK_DBS() +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [1], [ignore], [ignore]) # Remove a non-existent database. 
AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db ordinals], [2], @@ -233,10 +240,9 @@ ovs-appctl: ovsdb-server: server returned an error # Add a removed database. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db2], [0]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [constraints +CHECK_DBS([constraints ]) -AT_CHECK([ovsdb-client list-tables unix:socket constraints], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP @@ -247,14 +253,13 @@ AT_SKIP_IF([test "$IS_WIN32" = "yes"]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db1]) +AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) # Add the second database. constraint_schema > schema2 AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db2], [0]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [constraints +CHECK_DBS([constraints ordinals ]) @@ -266,8 +271,7 @@ OVS_WAIT_WHILE([kill -0 `cat old.pid`]) OVS_WAIT_UNTIL( [test -s ovsdb-server.pid && test `cat ovsdb-server.pid` != `cat old.pid`]) OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [constraints +CHECK_DBS([constraints ordinals ]) OVS_APP_EXIT_AND_WAIT([ovsdb-server]) @@ -282,12 +286,11 @@ AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) constraint_schema > schema2 AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db1 db2]) +AT_CHECK([ovsdb-server -v -vvlog:off --monitor 
--detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2]) # Remove the second database. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db constraints]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [ordinals +CHECK_DBS([ordinals ]) # Kill the daemon process, making it look like a segfault, @@ -298,8 +301,7 @@ OVS_WAIT_WHILE([kill -0 `cat old.pid`]) OVS_WAIT_UNTIL( [test -s ovsdb-server.pid && test `cat ovsdb-server.pid` != `cat old.pid`]) OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) -AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-dbs], - [0], [ordinals +CHECK_DBS([ordinals ]) OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP From patchwork Mon Jan 1 05:16:35 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854293 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95CX58fYz9t84 for ; Mon, 1 Jan 2018 16:21:44 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id C2374CE4; Mon, 1 Jan 2018 05:17:07 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 3F576CDB for ; Mon, 1 Jan 2018 05:17:06 +0000 (UTC) X-Greylist: domain auto-whitelisted by 
SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 5439A14B for ; Mon, 1 Jan 2018 05:17:03 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 2F80FC5A44; Mon, 1 Jan 2018 06:17:00 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:35 -0800 Message-Id: <20180101051640.13043-10-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 10/15] ovsdb-server: Add new RPC "set_db_change_aware". X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org The _Server database recently added to ovsdb-server can be used to dump out information about databases, but monitoring updates to _Server is not yet very useful because for historical reasons ovsdb-server drops all of its OVSDB connections whenever databases are added or removed or otherwise change in some major way. It is not a good idea to change this behavior for all clients, because some of them rely on it, but this commit introduces a new RPC that allows clients that understand _Server to suppress the connection-closing behavior. 
Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- Documentation/ref/ovsdb-server.7.rst | 43 ++++++++ ovsdb/_server.xml | 34 ++++++ ovsdb/jsonrpc-server.c | 195 +++++++++++++++++++++++++++-------- ovsdb/jsonrpc-server.h | 4 +- ovsdb/ovsdb-client.c | 28 ++++- ovsdb/ovsdb-server.c | 11 +- tests/ovsdb-server.at | 40 ++++++- 7 files changed, 299 insertions(+), 56 deletions(-) diff --git a/Documentation/ref/ovsdb-server.7.rst b/Documentation/ref/ovsdb-server.7.rst index cc625f6016fb..2ed392feed1f 100644 --- a/Documentation/ref/ovsdb-server.7.rst +++ b/Documentation/ref/ovsdb-server.7.rst @@ -146,6 +146,19 @@ notifications (see below) to the request, it must be unique among all active monitors. ``ovsdb-server`` rejects attempt to create two monitors with the same identifier. +4.1.7 Monitor Cancellation +-------------------------- + +When a database monitored by a session is removed, and database change +awareness is enabled for the session (see Section 4.1.16), the database server +spontaneously cancels all monitors (including conditional monitors described in +Section 4.1.12) for the removed database. For each canceled monitor, it issues +a notification in the following form:: + + "method": "monitor_canceled" + "params": [] + "id": null + 4.1.12 Monitor_cond ------------------- @@ -371,6 +384,36 @@ The response object contains the following members:: running OVSDB server process. A fresh UUID is generated when the process restarts. +4.1.16 Database Change Awareness +-------------------------------- + +RFC 7047 does not provide a way for a client to find out about some kinds of +configuration changes, such as about databases added or removed while a client +is connected to the server, or databases changing between read/write and +read-only due to a transition between active and backup roles. 
Traditionally, +``ovsdb-server`` disconnects all of its clients when this happens, because this +prompts a well-written client to reassess what is available from the server +when it reconnects. + +OVS 2.9 provides a way for clients to keep track of these kinds of changes, by +monitoring the ``Database`` table in the ``_Server`` database introduced in +this release (see ``ovsdb-server(5)`` for details). By itself, this does not +suppress ``ovsdb-server`` disconnection behavior, because a client might +monitor this database without understanding its special semantics. Instead, +``ovsdb-server`` provides a special request:: + + "method": "set_db_change_aware" + "params": [] + "id": + +If the boolean in the request is true, it suppresses the connection-closing +behavior for the current connection, and false restores the default behavior. +The reply is always the same:: + + "result": {} + "error": null + "id": same "id" as request + 5.1 Notation ------------ diff --git a/ovsdb/_server.xml b/ovsdb/_server.xml index a55beb9bd6de..8ef782fb97b2 100644 --- a/ovsdb/_server.xml +++ b/ovsdb/_server.xml @@ -13,6 +13,40 @@ one row per database. As its database configuration and status changes, the server automatically and immediately updates the table to match.

+ +

+ The OVSDB protocol specified in RFC 7047 does not provide a way for an + OVSDB client to find out about some kinds of configuration changes, such + as about databases added or removed while a client is connected to the + server, or databases changing between read/write and read-only due to a + transition between active and backup roles. This table provides a + solution: clients can monitor the table's contents to find out about + important changes. +

+ +

+ Traditionally, ovsdb-server disconnects all of its clients + when a significant configuration change occurs, because this prompts a + well-written client to reassess what is available from the server when it + reconnects. Because this table provides an alternative and more + efficient way to find out about those changes, OVS 2.9 also introduces + the set_db_change_aware RPC, documented in + ovsdb-server(7), to allow clients to suppress this + disconnection behavior. +

+ +

+ When a database is removed from the server, in addition to + Database table updates, the server sends cancel + messages, as described in RFC 7047 section 4.1.4, in reply to outstanding + transactions for the removed database. The server also cancels any + outstanding monitoring initiated by monitor or + monitor_cond requested on the removed database, sending the + monitor_canceled RPC described in + ovsdb-server(7). Only clients that disable disconnection + with set_db_change_aware receive these messages. +

+

Clients can use the _uuid column in this table as a generation number. The server generates a fresh _uuid every diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 27586cddd8b3..d51a56854517 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -57,12 +57,15 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); /* Sessions. */ static struct ovsdb_jsonrpc_session *ovsdb_jsonrpc_session_create( struct ovsdb_jsonrpc_remote *, struct jsonrpc_session *, bool); +static void ovsdb_jsonrpc_session_preremove_db(struct ovsdb_jsonrpc_remote *, + struct ovsdb *); static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *); static void ovsdb_jsonrpc_session_wait_all(struct ovsdb_jsonrpc_remote *); static void ovsdb_jsonrpc_session_get_memory_usage_all( const struct ovsdb_jsonrpc_remote *, struct simap *usage); static void ovsdb_jsonrpc_session_close_all(struct ovsdb_jsonrpc_remote *); -static void ovsdb_jsonrpc_session_reconnect_all(struct ovsdb_jsonrpc_remote *); +static void ovsdb_jsonrpc_session_reconnect_all(struct ovsdb_jsonrpc_remote *, + bool force); static void ovsdb_jsonrpc_session_set_all_options( struct ovsdb_jsonrpc_remote *, const struct ovsdb_jsonrpc_options *); static bool ovsdb_jsonrpc_active_session_get_status( @@ -83,6 +86,8 @@ static void ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *, static struct ovsdb_jsonrpc_trigger *ovsdb_jsonrpc_trigger_find( struct ovsdb_jsonrpc_session *, const struct json *id, size_t hash); static void ovsdb_jsonrpc_trigger_complete(struct ovsdb_jsonrpc_trigger *); +static void ovsdb_jsonrpc_trigger_preremove_db(struct ovsdb_jsonrpc_session *, + struct ovsdb *); static void ovsdb_jsonrpc_trigger_complete_all(struct ovsdb_jsonrpc_session *); static void ovsdb_jsonrpc_trigger_complete_done( struct ovsdb_jsonrpc_session *); @@ -99,6 +104,8 @@ static struct jsonrpc_msg *ovsdb_jsonrpc_monitor_cancel( struct ovsdb_jsonrpc_session *, struct json_array *params, const struct 
json *request_id); +static void ovsdb_jsonrpc_monitor_preremove_db(struct ovsdb_jsonrpc_session *, + struct ovsdb *); static void ovsdb_jsonrpc_monitor_remove_all(struct ovsdb_jsonrpc_session *); static void ovsdb_jsonrpc_monitor_flush_all(struct ovsdb_jsonrpc_session *); static bool ovsdb_jsonrpc_monitor_needs_flush(struct ovsdb_jsonrpc_session *); @@ -157,34 +164,25 @@ ovsdb_jsonrpc_server_create(bool read_only) bool ovsdb_jsonrpc_server_add_db(struct ovsdb_jsonrpc_server *svr, struct ovsdb *db) { - /* The OVSDB protocol doesn't have a way to notify a client that a - * database has been added. If some client tried to use the database - * that we're adding and failed, then forcing it to reconnect seems like - * a reasonable way to make it try again. - * - * If this is too big of a hammer in practice, we could be more selective, - * e.g. disconnect only connections that actually tried to use a database - * with 'db''s name. */ - ovsdb_jsonrpc_server_reconnect(svr); - + ovsdb_jsonrpc_server_reconnect(svr, false); return ovsdb_server_add_db(&svr->up, db); } -/* Removes 'db' from the set of databases served out by 'svr'. Returns - * true if successful, false if there is no database associated with 'db'. */ -bool +/* Removes 'db' from the set of databases served out by 'svr'. */ +void ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *svr, struct ovsdb *db) { - /* There might be pointers to 'db' from 'svr', such as monitors or - * outstanding transactions. Disconnect all JSON-RPC connections to avoid - * accesses to freed memory. - * - * If this is too big of a hammer in practice, we could be more selective, - * e.g. disconnect only connections that actually reference 'db'. 
*/ - ovsdb_jsonrpc_server_reconnect(svr); + struct shash_node *node; + SHASH_FOR_EACH (node, &svr->remotes) { + struct ovsdb_jsonrpc_remote *remote = node->data; + + ovsdb_jsonrpc_session_preremove_db(remote, db); + } + + ovsdb_jsonrpc_server_reconnect(svr, false); - return ovsdb_server_remove_db(&svr->up, db); + ovsdb_server_remove_db(&svr->up, db); } void @@ -333,17 +331,20 @@ ovsdb_jsonrpc_server_free_remote_status( free(status->locks_lost); } -/* Forces all of the JSON-RPC sessions managed by 'svr' to disconnect and - * reconnect. */ +/* Makes all of the JSON-RPC sessions managed by 'svr' disconnect. (They + * will then generally reconnect.) + * + * If 'force' is true, disconnects all sessions. Otherwise, disconnects only + * sessions that aren't database change aware. */ void -ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr) +ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr, bool force) { struct shash_node *node; SHASH_FOR_EACH (node, &svr->remotes) { struct ovsdb_jsonrpc_remote *remote = node->data; - ovsdb_jsonrpc_session_reconnect_all(remote); + ovsdb_jsonrpc_session_reconnect_all(remote, force); } } @@ -359,7 +360,7 @@ ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *svr, { if (svr->read_only != read_only) { svr->read_only = read_only; - ovsdb_jsonrpc_server_reconnect(svr); + ovsdb_jsonrpc_server_reconnect(svr, false); } } @@ -432,6 +433,20 @@ struct ovsdb_jsonrpc_session { struct ovsdb_session up; struct ovsdb_jsonrpc_remote *remote; + /* RFC 7047 does not contemplate how to alert clients to changes to the set + * of databases, e.g. databases that are added or removed while the + * database server is running. Traditionally, ovsdb-server disconnects all + * of its clients when this happens; a well-written client will reassess + * what is available from the server upon reconnection.
+ * + * OVS 2.9 introduces a way for clients to monitor changes to the databases + * being served, through the Database table in the _Server database that + * OVSDB adds in this version. ovsdb-server suppresses the connection + * close for clients that identify themselves as taking advantage of this + * mechanism. + */ + bool db_change_aware; + /* Triggers. */ struct hmap triggers; /* Hmap of "struct ovsdb_jsonrpc_trigger"s. */ @@ -478,6 +493,20 @@ ovsdb_jsonrpc_session_create(struct ovsdb_jsonrpc_remote *remote, return s; } +/* Database 'db' is about to be removed from the database server. To prepare, + * this function removes all references to 'db' from all sessions in 'remote'. */ +static void +ovsdb_jsonrpc_session_preremove_db(struct ovsdb_jsonrpc_remote *remote, + struct ovsdb *db) +{ + struct ovsdb_jsonrpc_session *s; + + LIST_FOR_EACH (s, node, &remote->sessions) { + ovsdb_jsonrpc_monitor_preremove_db(s, db); + ovsdb_jsonrpc_trigger_preremove_db(s, db); + } +} + static void ovsdb_jsonrpc_session_close(struct ovsdb_jsonrpc_session *s) { @@ -606,17 +635,23 @@ ovsdb_jsonrpc_session_close_all(struct ovsdb_jsonrpc_remote *remote) } } -/* Forces all of the JSON-RPC sessions managed by 'remote' to disconnect and - * reconnect. */ +/* Makes all of the JSON-RPC sessions managed by 'remote' disconnect. (They + * will then generally reconnect.) + * + * If 'force' is true, disconnects all sessions. Otherwise, disconnects only + * sessions that aren't database change aware. 
*/ static void -ovsdb_jsonrpc_session_reconnect_all(struct ovsdb_jsonrpc_remote *remote) +ovsdb_jsonrpc_session_reconnect_all(struct ovsdb_jsonrpc_remote *remote, + bool force) { struct ovsdb_jsonrpc_session *s, *next; LIST_FOR_EACH_SAFE (s, next, node, &remote->sessions) { - jsonrpc_session_force_reconnect(s->js); - if (!jsonrpc_session_is_alive(s->js)) { - ovsdb_jsonrpc_session_close(s); + if (force || !s->db_change_aware) { + jsonrpc_session_force_reconnect(s->js); + if (!jsonrpc_session_is_alive(s->js)) { + ovsdb_jsonrpc_session_close(s); + } } } } @@ -859,6 +894,17 @@ ovsdb_jsonrpc_session_unlock__(struct ovsdb_lock_waiter *waiter) } static struct jsonrpc_msg * +syntax_error_reply(const struct jsonrpc_msg *request, const char *details) +{ + struct ovsdb_error *error = ovsdb_syntax_error( + request->params, NULL, "%s: %s", request->method, details); + struct jsonrpc_msg *msg = jsonrpc_create_error(ovsdb_error_to_json(error), + request->id); + ovsdb_error_destroy(error); + return msg; +} + +static struct jsonrpc_msg * ovsdb_jsonrpc_session_unlock(struct ovsdb_jsonrpc_session *s, struct jsonrpc_msg *request) { @@ -872,24 +918,21 @@ ovsdb_jsonrpc_session_unlock(struct ovsdb_jsonrpc_session *s, error = ovsdb_jsonrpc_session_parse_lock_name(request, &lock_name); if (error) { - goto error; + return jsonrpc_create_error(ovsdb_error_to_json_free(error), + request->id); } /* Report error if this session has not issued a "lock" or "steal" for this * lock. 
*/ waiter = ovsdb_session_get_lock_waiter(&s->up, lock_name); if (!waiter) { - error = ovsdb_syntax_error( - request->params, NULL, "\"unlock\" without \"lock\" or \"steal\""); - goto error; + return syntax_error_reply(request, + "\"unlock\" without \"lock\" or \"steal\""); } ovsdb_jsonrpc_session_unlock__(waiter); return jsonrpc_create_reply(json_object_create(), request->id); - -error: - return jsonrpc_create_error(ovsdb_error_to_json_free(error), request->id); } static struct jsonrpc_msg * @@ -903,6 +946,21 @@ execute_transaction(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, return NULL; } +static struct jsonrpc_msg * +ovsdb_jsonrpc_session_set_db_change_aware(struct ovsdb_jsonrpc_session *s, + const struct jsonrpc_msg *request) +{ + const struct json_array *params = json_array(request->params); + if (params->n != 1 + || (params->elems[0]->type != JSON_TRUE && + params->elems[0]->type != JSON_FALSE)) { + return syntax_error_reply(request, "true or false parameter expected"); + } + + s->db_change_aware = json_boolean(params->elems[0]); + return jsonrpc_create_reply(json_object_create(), request->id); +} + static void ovsdb_jsonrpc_session_got_request(struct ovsdb_jsonrpc_session *s, struct jsonrpc_msg *request) @@ -963,6 +1021,8 @@ ovsdb_jsonrpc_session_got_request(struct ovsdb_jsonrpc_session *s, reply = ovsdb_jsonrpc_session_lock(s, request, OVSDB_LOCK_STEAL); } else if (!strcmp(request->method, "unlock")) { reply = ovsdb_jsonrpc_session_unlock(s, request); + } else if (!strcmp(request->method, "set_db_change_aware")) { + reply = ovsdb_jsonrpc_session_set_db_change_aware(s, request); } else if (!strcmp(request->method, "echo")) { reply = jsonrpc_create_reply(json_clone(request->params), request->id); } else { @@ -1098,14 +1158,34 @@ ovsdb_jsonrpc_trigger_complete(struct ovsdb_jsonrpc_trigger *t) } static void -ovsdb_jsonrpc_trigger_complete_all(struct ovsdb_jsonrpc_session *s) +ovsdb_jsonrpc_trigger_remove__(struct ovsdb_jsonrpc_session *s, + struct ovsdb 
*db) { struct ovsdb_jsonrpc_trigger *t, *next; HMAP_FOR_EACH_SAFE (t, next, hmap_node, &s->triggers) { - ovsdb_jsonrpc_trigger_complete(t); + if (!db || t->trigger.db == db) { + ovsdb_jsonrpc_trigger_complete(t); + } } } +/* Database 'db' is about to be removed from the database server. To prepare, + * this function removes all references from triggers in 's' to 'db'. */ +static void +ovsdb_jsonrpc_trigger_preremove_db(struct ovsdb_jsonrpc_session *s, + struct ovsdb *db) +{ + ovs_assert(db); + ovsdb_jsonrpc_trigger_remove__(s, db); +} + +/* Removes all triggers from 's'. */ +static void +ovsdb_jsonrpc_trigger_complete_all(struct ovsdb_jsonrpc_session *s) +{ + ovsdb_jsonrpc_trigger_remove__(s, NULL); +} + static void ovsdb_jsonrpc_trigger_complete_done(struct ovsdb_jsonrpc_session *s) { @@ -1525,15 +1605,42 @@ ovsdb_jsonrpc_monitor_cancel(struct ovsdb_jsonrpc_session *s, } static void -ovsdb_jsonrpc_monitor_remove_all(struct ovsdb_jsonrpc_session *s) +ovsdb_jsonrpc_monitor_remove__(struct ovsdb_jsonrpc_session *s, + struct ovsdb *db) { struct ovsdb_jsonrpc_monitor *m, *next; HMAP_FOR_EACH_SAFE (m, next, node, &s->monitors) { - ovsdb_jsonrpc_monitor_destroy(m); + if (!db || m->db == db) { + if (db && jsonrpc_session_is_connected(s->js) + && s->db_change_aware) { + struct jsonrpc_msg *notify = jsonrpc_create_notify( + "monitor_canceled", + json_array_create_1(json_clone(m->monitor_id))); + ovsdb_jsonrpc_session_send(s, notify); + } + ovsdb_jsonrpc_monitor_destroy(m); + } } } +/* Database 'db' is about to be removed from the database server. To prepare, + * this function removes all references from monitors in 's' to 'db'. */ +static void +ovsdb_jsonrpc_monitor_preremove_db(struct ovsdb_jsonrpc_session *s, + struct ovsdb *db) +{ + ovs_assert(db); + ovsdb_jsonrpc_monitor_remove__(s, db); +} + +/* Cancels all monitors in 's'. 
*/ +static void +ovsdb_jsonrpc_monitor_remove_all(struct ovsdb_jsonrpc_session *s) +{ + ovsdb_jsonrpc_monitor_remove__(s, NULL); +} + static struct json * ovsdb_jsonrpc_monitor_compose_update(struct ovsdb_jsonrpc_monitor *m, bool initial) diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index a3acc75f8d4f..50a8b879c5a9 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -27,7 +27,7 @@ struct uuid; struct ovsdb_jsonrpc_server *ovsdb_jsonrpc_server_create(bool read_only); bool ovsdb_jsonrpc_server_add_db(struct ovsdb_jsonrpc_server *, struct ovsdb *); -bool ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *, +void ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *, struct ovsdb *); void ovsdb_jsonrpc_server_destroy(struct ovsdb_jsonrpc_server *); @@ -64,7 +64,7 @@ bool ovsdb_jsonrpc_server_get_remote_status( void ovsdb_jsonrpc_server_free_remote_status( struct ovsdb_jsonrpc_remote_status *); -void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *); +void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool force); void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *); void ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *); diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index 0ab4d66f1b29..600c5070db78 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -74,6 +74,13 @@ struct ovsdb_client_command { /* --timestamp: Print a timestamp before each update on "monitor" command? */ static bool timestamp; +/* --db-change-aware, --no-db-change-aware: Enable db_change_aware feature for + * "monitor" command? + * + * (This option is undocumented because it is expected to be useful only for + * testing that the db_change_aware feature actually works.) */ +static int db_change_aware; + /* --force: Ignore schema differences for "restore" command? 
*/ static bool force; @@ -199,6 +206,8 @@ parse_options(int argc, char *argv[]) {"version", no_argument, NULL, 'V'}, {"timestamp", no_argument, NULL, OPT_TIMESTAMP}, {"force", no_argument, NULL, OPT_FORCE}, + {"db-change-aware", no_argument, &db_change_aware, 1}, + {"no-db-change-aware", no_argument, &db_change_aware, 0}, VLOG_LONG_OPTIONS, DAEMON_LONG_OPTIONS, #ifdef HAVE_OPENSSL @@ -1021,7 +1030,6 @@ do_monitor__(struct jsonrpc *rpc, const char *database, const char *table_name = argv[0]; struct unixctl_server *unixctl; struct ovsdb_schema *schema; - struct jsonrpc_msg *request; struct json *monitor, *monitor_requests, *request_id; bool exiting = false; bool blocked = false; @@ -1089,11 +1097,29 @@ do_monitor__(struct jsonrpc *rpc, const char *database, free(nodes); } + if (db_change_aware) { + struct jsonrpc_msg *request = jsonrpc_create_request( + "set_db_change_aware", + json_array_create_1(json_boolean_create(true)), + NULL); + struct jsonrpc_msg *reply; + int error = jsonrpc_transact_block(rpc, request, &reply); + if (error) { + ovs_fatal(error, "%s: error setting db_change_aware", server); + } + if (reply->type == JSONRPC_ERROR) { + ovs_fatal(0, "%s: set_db_change_aware failed (%s)", + server, json_to_string(reply->error, 0)); + } + jsonrpc_msg_destroy(reply); + } + monitor = json_array_create_3(json_string_create(database), json_null_create(), monitor_requests); const char *method = version == OVSDB_MONITOR_V2 ? 
"monitor_cond" : "monitor"; + struct jsonrpc_msg *request; request = jsonrpc_create_request(method, monitor, NULL); request_id = json_clone(request->id); jsonrpc_send(rpc, request); diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index dd0cdfe6a38b..1e36b27958f8 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -1240,7 +1240,7 @@ ovsdb_server_disable_monitor_cond(struct unixctl_conn *conn, struct ovsdb_jsonrpc_server *jsonrpc = jsonrpc_; ovsdb_jsonrpc_disable_monitor_cond(); - ovsdb_jsonrpc_server_reconnect(jsonrpc); + ovsdb_jsonrpc_server_reconnect(jsonrpc, true); unixctl_command_reply(conn, NULL); } @@ -1298,7 +1298,7 @@ ovsdb_server_reconnect(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *jsonrpc_) { struct ovsdb_jsonrpc_server *jsonrpc = jsonrpc_; - ovsdb_jsonrpc_server_reconnect(jsonrpc); + ovsdb_jsonrpc_server_reconnect(jsonrpc, true); unixctl_command_reply(conn, NULL); } @@ -1400,12 +1400,9 @@ ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED, static void remove_db(struct server_config *config, struct shash_node *node) { - struct db *db; - bool ok; + struct db *db = node->data; - db = node->data; - ok = ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db); - ovs_assert(ok); + ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db); close_db(db); shash_delete(config->all_dbs, node); diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 07ceda92496d..2e3d8ad14636 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -169,14 +169,31 @@ AT_CLEANUP AT_SETUP([ovsdb-server/add-db and remove-db]) AT_KEYWORDS([ovsdb server positive]) -on_exit 'kill `cat ovsdb-server.pid`' +on_exit 'kill `cat *.pid`' ordinal_schema > schema1 constraint_schema > schema2 AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) # Start ovsdb-server with just a single database - db1. 
-AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0]) +CHECK_DBS([ordinals +]) + +# Remove the database. +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db ordinals], [0]) +CHECK_DBS([]) + +# Start monitoring processes. +AT_CHECK([ovsdb-client --detach --pidfile=ovsdb-client-1.pid --no-db-change-aware --no-headings monitor _Server Database name > db-change-unaware.stdout 2> db-change-unaware.stderr]) +AT_CHECK([ovsdb-client --detach --pidfile=ovsdb-client-2.pid --db-change-aware --no-headings monitor _Server Database name > db-change-aware.stdout 2> db-change-aware.stderr]) +AT_CAPTURE_FILE([db-change-unaware.stdout]) +AT_CAPTURE_FILE([db-change-unaware.stderr]) +AT_CAPTURE_FILE([db-change-aware.stdout]) +AT_CAPTURE_FILE([db-change-aware.stderr]) + +# Add the first database back. +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db1], [0]) CHECK_DBS([ordinals ]) @@ -243,6 +260,25 @@ AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db2], [0]) CHECK_DBS([constraints ]) AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) + +# Check the monitoring results. 
+AT_CHECK([uuidfilt db-change-aware.stdout], [0], [dnl +<0> initial _Server + +<1> insert ordinals + +<2> insert constraints + +<1> delete ordinals + +<2> delete constraints + +<3> insert constraints +]) +AT_CHECK([uuidfilt db-change-unaware.stdout], [0], [dnl +<0> initial _Server +]) + OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP From patchwork Mon Jan 1 05:16:36 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854296 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95F83fBhz9t84 for ; Mon, 1 Jan 2018 16:23:08 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 21A8DCFB; Mon, 1 Jan 2018 05:17:13 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id AF96FCE0 for ; Mon, 1 Jan 2018 05:17:09 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 53EAB14D for ; Mon, 1 Jan 2018 05:17:05 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated 
sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id A8D8FC5A46; Mon, 1 Jan 2018 06:17:02 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:36 -0800 Message-Id: <20180101051640.13043-11-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 11/15] ovsdb: Add support for online schema conversion. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org With this change, "ovsdb-client convert" can be used to convert a database from one schema to another without taking the database offline. This can be useful to minimize downtime for a database during a software upgrade. 
Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- Documentation/ref/ovsdb-server.7.rst | 35 +++++ Documentation/ref/ovsdb.7.rst | 11 +- NEWS | 2 + lib/ovsdb-data.c | 13 ++ lib/ovsdb-data.h | 6 + ovsdb/file.c | 135 +++++++++++++++++- ovsdb/file.h | 4 + ovsdb/jsonrpc-server.c | 120 +++++++--------- ovsdb/jsonrpc-server.h | 3 +- ovsdb/monitor.c | 19 ++- ovsdb/monitor.h | 2 + ovsdb/ovsdb-client.1.in | 41 +++++- ovsdb/ovsdb-client.c | 95 ++++++++++--- ovsdb/ovsdb-server.c | 56 ++++---- ovsdb/ovsdb.c | 26 +++- ovsdb/ovsdb.h | 3 +- ovsdb/transaction.c | 37 +++-- ovsdb/transaction.h | 5 + ovsdb/trigger.c | 142 +++++++++++++++---- ovsdb/trigger.h | 17 +-- tests/ovsdb-monitor.at | 4 +- tests/ovsdb-server.at | 263 +++++++++++++++++++++++++++++++++++ tests/test-ovsdb.c | 14 +- 23 files changed, 870 insertions(+), 183 deletions(-) diff --git a/Documentation/ref/ovsdb-server.7.rst b/Documentation/ref/ovsdb-server.7.rst index 2ed392feed1f..e3a8ccc61399 100644 --- a/Documentation/ref/ovsdb-server.7.rst +++ b/Documentation/ref/ovsdb-server.7.rst @@ -414,6 +414,41 @@ The reply is always the same:: "error": null "id": same "id" as request +4.1.17 Schema Conversion +------------------------ + +Open vSwitch 2.9 adds a new JSON-RPC request to convert an online database from +one schema to another. The request contains the following members:: + + "method": "convert" + "params": [] + "id": + +Upon receipt, the server converts the database named in to +that schema. The conversion is atomic, consistent, isolated, and durable. The +data in the database must be valid when interpreted under , +with only one exception: data for tables and columns that do not exist in the +new schema are ignored. Columns that exist in but not in the +database are set to their default values. All of the new schema's constraints +apply in full. 
+ +If the conversion is successful, the server notifies clients that use the +``set_db_change_aware`` RPC introduced in Open vSwitch 2.9 and cancels their +outstanding transactions and monitors. The server disconnects other clients, +enabling them to notice the change when they reconnect. The server sends the +following reply:: + + "result": {} + "error": null + "id": same "id" as request + +If the conversion fails, then the server sends an error reply in the following +form:: + + "result": null + "error": [] + "id": same "id" as request + 5.1 Notation ------------ diff --git a/Documentation/ref/ovsdb.7.rst b/Documentation/ref/ovsdb.7.rst index 25a6e5fc36ed..6adef73826e8 100644 --- a/Documentation/ref/ovsdb.7.rst +++ b/Documentation/ref/ovsdb.7.rst @@ -367,10 +367,17 @@ active-backup database, first stop the database server or servers, then use ``ovsdb-tool convert`` to convert it to the new schema, and then restart the database server. +OVSDB also supports online database schema conversion. +To convert a database online, use ``ovsdb-client convert``. +The conversion is atomic, consistent, isolated, and durable. ``ovsdb-server`` +disconnects any clients connected when the conversion takes place (except +clients that use the ``set_db_change_aware`` Open vSwitch extension RPC). Upon +reconnection, clients will discover that the schema has changed. + Schema versions and checksums (see Schemas_ above) can give hints about whether a database needs to be converted to a new schema. If there is any question, -though, the ``needs-conversion`` command on ``ovsdb-tool`` can provide a -definitive answer. +though, the ``needs-conversion`` command on ``ovsdb-tool`` and ``ovsdb-client`` +can provide a definitive answer. Working with Database History ----------------------------- diff --git a/NEWS b/NEWS index b697e4968072..dfc2fb7728a4 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,8 @@ Post-v2.8.0 * New high-level documentation in ovsdb(7). 
* New file format documentation for developers in ovsdb(5). * Protocol documentation moved from ovsdb-server(1) to ovsdb-server(7). + * ovsdb-server now supports online schema conversion via + "ovsdb-client convert". * ovsdb-server now always hosts a built-in database named _Server. See ovsdb-server(5) for more details. * ovsdb-client: New "get-schema-cksum" and "query" commands. diff --git a/lib/ovsdb-data.c b/lib/ovsdb-data.c index 87d8effd1d67..69122dc10432 100644 --- a/lib/ovsdb-data.c +++ b/lib/ovsdb-data.c @@ -1684,6 +1684,19 @@ ovsdb_datum_from_smap(struct ovsdb_datum *datum, const struct smap *smap) ovsdb_datum_sort_unique(datum, OVSDB_TYPE_STRING, OVSDB_TYPE_STRING); } +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_datum_convert(struct ovsdb_datum *dst, + const struct ovsdb_type *dst_type, + const struct ovsdb_datum *src, + const struct ovsdb_type *src_type) +{ + struct json *json = ovsdb_datum_to_json(src, src_type); + struct ovsdb_error *error = ovsdb_datum_from_json(dst, dst_type, json, + NULL); + json_destroy(json); + return error; +} + static uint32_t hash_atoms(enum ovsdb_atomic_type type, const union ovsdb_atom *atoms, unsigned int n, uint32_t basis) diff --git a/lib/ovsdb-data.h b/lib/ovsdb-data.h index c842fe28fc1a..c5a80ee39fd6 100644 --- a/lib/ovsdb-data.h +++ b/lib/ovsdb-data.h @@ -192,6 +192,12 @@ void ovsdb_datum_to_bare(const struct ovsdb_datum *, void ovsdb_datum_from_smap(struct ovsdb_datum *, const struct smap *); +struct ovsdb_error *ovsdb_datum_convert(struct ovsdb_datum *dst, + const struct ovsdb_type *dst_type, + const struct ovsdb_datum *src, + const struct ovsdb_type *src_type) + OVS_WARN_UNUSED_RESULT; + /* Comparison. 
*/ uint32_t ovsdb_datum_hash(const struct ovsdb_datum *, const struct ovsdb_type *, uint32_t basis); diff --git a/ovsdb/file.c b/ovsdb/file.c index 4aafb3be8ab4..dadb988d3088 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -566,22 +566,31 @@ ovsdb_file_txn_annotate(struct json *json, const char *comment) return json; } -struct ovsdb_error * -ovsdb_file_commit(struct ovsdb_file *file, - const struct ovsdb_txn *txn, bool durable) +/* Returns 'txn' transformed into the JSON format that is used in OVSDB files. + * (But the caller must use ovsdb_file_txn_annotate() to add the _comment + * and _date members.) If 'txn' doesn't actually change anything, returns NULL. */ +static struct json * +ovsdb_file_txn_to_json(const struct ovsdb_txn *txn) { struct ovsdb_file_txn ftxn; - struct ovsdb_error *error; ovsdb_file_txn_init(&ftxn); ovsdb_txn_for_each_change(txn, ovsdb_file_change_cb, &ftxn); - if (!ftxn.json) { + return ftxn.json; +} + +struct ovsdb_error * +ovsdb_file_commit(struct ovsdb_file *file, + const struct ovsdb_txn *txn, bool durable) +{ + struct json *txn_json = ovsdb_file_txn_to_json(txn); + if (!txn_json) { /* Nothing to commit.
*/ return NULL; } - error = ovsdb_file_txn_commit(ftxn.json, ovsdb_txn_get_comment(txn), - durable, file->log); + struct ovsdb_error *error = ovsdb_file_txn_commit( + txn_json, ovsdb_txn_get_comment(txn), durable, file->log); if (error) { return error; } @@ -844,3 +853,115 @@ ovsdb_file_txn_commit(struct json *json, const char *comment, return NULL; } + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_convert_table(struct ovsdb_txn *txn, + const struct ovsdb_table *src_table, + struct ovsdb_table *dst_table) +{ + const struct ovsdb_row *src_row; + HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) { + struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); + *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); + + struct shash_node *node; + SHASH_FOR_EACH (node, &src_table->schema->columns) { + const struct ovsdb_column *src_column = node->data; + if (src_column->index == OVSDB_COL_UUID || + src_column->index == OVSDB_COL_VERSION) { + continue; + } + + const struct ovsdb_column *dst_column + = shash_find_data(&dst_table->schema->columns, + src_column->name); + if (!dst_column) { + continue; + } + + struct ovsdb_error *error = ovsdb_datum_convert( + &dst_row->fields[dst_column->index], &dst_column->type, + &src_row->fields[src_column->index], &src_column->type); + if (error) { + ovsdb_row_destroy(dst_row); + return error; + } + } + + ovsdb_txn_row_insert(txn, dst_row); + } + return NULL; +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_file_convert(const struct ovsdb_file *file, + const struct ovsdb_schema *new_schema) +{ + struct ovsdb *new_db = ovsdb_create(ovsdb_schema_clone(new_schema)); + struct ovsdb_txn *txn = ovsdb_txn_create(new_db); + struct ovsdb_error *error = NULL; + + struct shash_node *node; + SHASH_FOR_EACH (node, &file->db->tables) { + const char *table_name = node->name; + const struct ovsdb_table *src_table = node->data; + struct ovsdb_table *dst_table = shash_find_data(&new_db->tables, + table_name); + if 
(!dst_table) { + continue; + } + + error = ovsdb_convert_table(txn, src_table, dst_table); + if (error) { + goto error; + } + } + + error = ovsdb_txn_start_commit(txn); + if (error) { + goto error; + } + + struct ovsdb_log *new; + error = ovsdb_log_replace_start(file->log, &new); + if (error) { + goto error; + } + + /* Write schema. */ + struct json *schema_json = ovsdb_schema_to_json(new_schema); + error = ovsdb_log_write(new, schema_json); + json_destroy(schema_json); + if (error) { + goto error; + } + + /* Write data. */ + struct json *txn_json = ovsdb_file_txn_to_json(txn); + if (txn_json) { + error = ovsdb_log_write(new, txn_json); + json_destroy(txn_json); + if (error) { + goto error; + } + } + + error = ovsdb_log_replace_commit(file->log, new); + if (error) { + goto error; + } + + error = ovsdb_txn_finish_commit(txn, true); + ovs_assert(!error); /* Can't happen. */ + + ovsdb_replace(file->db, new_db); + + return NULL; + +error: + ovsdb_destroy(new_db); + if (txn) { + ovsdb_txn_abort(txn); + } + return error; +} diff --git a/ovsdb/file.h b/ovsdb/file.h index a9ef0585b261..bc9b32cf6c33 100644 --- a/ovsdb/file.h +++ b/ovsdb/file.h @@ -51,4 +51,8 @@ void ovsdb_file_destroy(struct ovsdb_file *); struct json *ovsdb_file_txn_annotate(struct json *, const char *comment); +struct ovsdb_error *ovsdb_file_convert(const struct ovsdb_file *, + const struct ovsdb_schema *) + OVS_WARN_UNUSED_RESULT; + #endif /* ovsdb/file.h */ diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index d51a56854517..df268cd4eedc 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -82,7 +82,7 @@ static void ovsdb_jsonrpc_session_send(struct ovsdb_jsonrpc_session *, /* Triggers. 
*/ static void ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *, struct ovsdb *, - struct json *id, struct json *params); + struct jsonrpc_msg *request); static struct ovsdb_jsonrpc_trigger *ovsdb_jsonrpc_trigger_find( struct ovsdb_jsonrpc_session *, const struct json *id, size_t hash); static void ovsdb_jsonrpc_trigger_complete(struct ovsdb_jsonrpc_trigger *); @@ -936,17 +936,6 @@ ovsdb_jsonrpc_session_unlock(struct ovsdb_jsonrpc_session *s, } static struct jsonrpc_msg * -execute_transaction(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, - struct jsonrpc_msg *request) -{ - ovsdb_jsonrpc_trigger_create(s, db, request->id, request->params); - request->id = NULL; - request->params = NULL; - jsonrpc_msg_destroy(request); - return NULL; -} - -static struct jsonrpc_msg * ovsdb_jsonrpc_session_set_db_change_aware(struct ovsdb_jsonrpc_session *s, const struct jsonrpc_msg *request) { @@ -967,10 +956,11 @@ ovsdb_jsonrpc_session_got_request(struct ovsdb_jsonrpc_session *s, { struct jsonrpc_msg *reply; - if (!strcmp(request->method, "transact")) { + if (!strcmp(request->method, "transact") || + !strcmp(request->method, "convert")) { struct ovsdb *db = ovsdb_jsonrpc_lookup_db(s, request, &reply); if (!reply) { - reply = execute_transaction(s, db, request); + ovsdb_jsonrpc_trigger_create(s, db, request); } } else if (!strcmp(request->method, "monitor") || (monitor_cond_enable__ && !strcmp(request->method, @@ -1082,37 +1072,35 @@ struct ovsdb_jsonrpc_trigger { static void ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, - struct json *id, struct json *params) + struct jsonrpc_msg *request) { - struct ovsdb_jsonrpc_trigger *t; - size_t hash; - /* Check for duplicate ID. 
*/ - hash = json_hash(id, 0); - t = ovsdb_jsonrpc_trigger_find(s, id, hash); + size_t hash = json_hash(request->id, 0); + struct ovsdb_jsonrpc_trigger *t + = ovsdb_jsonrpc_trigger_find(s, request->id, hash); if (t) { - struct jsonrpc_msg *msg; - - msg = jsonrpc_create_error(json_string_create("duplicate request ID"), - id); - ovsdb_jsonrpc_session_send(s, msg); - json_destroy(id); - json_destroy(params); + ovsdb_jsonrpc_session_send( + s, syntax_error_reply(request, "duplicate request ID")); + jsonrpc_msg_destroy(request); return; } /* Insert into trigger table. */ t = xmalloc(sizeof *t); - ovsdb_trigger_init(&s->up, db, &t->trigger, params, time_msec(), - s->read_only, s->remote->role, - jsonrpc_session_get_id(s->js)); - t->id = id; + bool disconnect_all = ovsdb_trigger_init( + &s->up, db, &t->trigger, request, time_msec(), s->read_only, + s->remote->role, jsonrpc_session_get_id(s->js)); + t->id = json_clone(request->id); hmap_insert(&s->triggers, &t->hmap_node, hash); /* Complete early if possible. 
*/ if (ovsdb_trigger_is_complete(&t->trigger)) { ovsdb_jsonrpc_trigger_complete(t); } + + if (disconnect_all) { + ovsdb_jsonrpc_server_reconnect(s->remote->server, false); + } } static struct ovsdb_jsonrpc_trigger * @@ -1139,12 +1127,9 @@ ovsdb_jsonrpc_trigger_complete(struct ovsdb_jsonrpc_trigger *t) if (jsonrpc_session_is_connected(s->js)) { struct jsonrpc_msg *reply; - struct json *result; - result = ovsdb_trigger_steal_result(&t->trigger); - if (result) { - reply = jsonrpc_create_reply(result, t->id); - } else { + reply = ovsdb_trigger_steal_reply(&t->trigger); + if (!reply) { reply = jsonrpc_create_error(json_string_create("canceled"), t->id); } @@ -1159,7 +1144,7 @@ ovsdb_jsonrpc_trigger_complete(struct ovsdb_jsonrpc_trigger *t) static void ovsdb_jsonrpc_trigger_remove__(struct ovsdb_jsonrpc_session *s, - struct ovsdb *db) + struct ovsdb *db) { struct ovsdb_jsonrpc_trigger *t, *next; HMAP_FOR_EACH_SAFE (t, next, hmap_node, &s->triggers) { @@ -1189,11 +1174,9 @@ ovsdb_jsonrpc_trigger_complete_all(struct ovsdb_jsonrpc_session *s) static void ovsdb_jsonrpc_trigger_complete_done(struct ovsdb_jsonrpc_session *s) { - while (!ovs_list_is_empty(&s->up.completions)) { - struct ovsdb_jsonrpc_trigger *t - = CONTAINER_OF(s->up.completions.next, - struct ovsdb_jsonrpc_trigger, trigger.node); - ovsdb_jsonrpc_trigger_complete(t); + struct ovsdb_jsonrpc_trigger *trigger, *next; + LIST_FOR_EACH_SAFE (trigger, next, trigger.node, &s->up.completions) { + ovsdb_jsonrpc_trigger_complete(trigger); } } @@ -1441,7 +1424,7 @@ ovsdb_jsonrpc_monitor_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, error: if (m) { - ovsdb_jsonrpc_monitor_destroy(m); + ovsdb_jsonrpc_monitor_destroy(m, false); } return jsonrpc_create_error(ovsdb_error_to_json_free(error), request_id); @@ -1598,32 +1581,12 @@ ovsdb_jsonrpc_monitor_cancel(struct ovsdb_jsonrpc_session *s, return jsonrpc_create_error(json_string_create("unknown monitor"), request_id); } else { - ovsdb_jsonrpc_monitor_destroy(m); + 
ovsdb_jsonrpc_monitor_destroy(m, false); return jsonrpc_create_reply(json_object_create(), request_id); } } } -static void -ovsdb_jsonrpc_monitor_remove__(struct ovsdb_jsonrpc_session *s, - struct ovsdb *db) -{ - struct ovsdb_jsonrpc_monitor *m, *next; - - HMAP_FOR_EACH_SAFE (m, next, node, &s->monitors) { - if (!db || m->db == db) { - if (db && jsonrpc_session_is_connected(s->js) - && s->db_change_aware) { - struct jsonrpc_msg *notify = jsonrpc_create_notify( - "monitor_canceled", - json_array_create_1(json_clone(m->monitor_id))); - ovsdb_jsonrpc_session_send(s, notify); - } - ovsdb_jsonrpc_monitor_destroy(m); - } - } -} - /* Database 'db' is about to be removed from the database server. To prepare, * this function removes all references from monitors in 's' to 'db'. */ static void @@ -1631,14 +1594,24 @@ ovsdb_jsonrpc_monitor_preremove_db(struct ovsdb_jsonrpc_session *s, struct ovsdb *db) { ovs_assert(db); - ovsdb_jsonrpc_monitor_remove__(s, db); + + struct ovsdb_jsonrpc_monitor *m, *next; + HMAP_FOR_EACH_SAFE (m, next, node, &s->monitors) { + if (m->db == db) { + ovsdb_jsonrpc_monitor_destroy(m, true); + } + } } /* Cancels all monitors in 's'. 
*/ static void ovsdb_jsonrpc_monitor_remove_all(struct ovsdb_jsonrpc_session *s) { - ovsdb_jsonrpc_monitor_remove__(s, NULL); + struct ovsdb_jsonrpc_monitor *m, *next; + + HMAP_FOR_EACH_SAFE (m, next, node, &s->monitors) { + ovsdb_jsonrpc_monitor_destroy(m, false); + } } static struct json * @@ -1669,8 +1642,19 @@ ovsdb_jsonrpc_monitor_needs_flush(struct ovsdb_jsonrpc_session *s) } void -ovsdb_jsonrpc_monitor_destroy(struct ovsdb_jsonrpc_monitor *m) -{ +ovsdb_jsonrpc_monitor_destroy(struct ovsdb_jsonrpc_monitor *m, + bool notify_cancellation) +{ + if (notify_cancellation) { + struct ovsdb_jsonrpc_session *s = m->session; + if (jsonrpc_session_is_connected(s->js) && s->db_change_aware) { + struct jsonrpc_msg *notify = jsonrpc_create_notify( + "monitor_canceled", + json_array_create_1(json_clone(m->monitor_id))); + ovsdb_jsonrpc_session_send(s, notify); + } + } + json_destroy(m->monitor_id); hmap_remove(&m->session->monitors, &m->node); ovsdb_monitor_remove_jsonrpc_monitor(m->dbmon, m, m->unflushed); diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index 50a8b879c5a9..0fc16f21b2d9 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -80,7 +80,8 @@ const struct uuid *ovsdb_jsonrpc_server_get_uuid( const struct ovsdb_jsonrpc_server *); struct ovsdb_jsonrpc_monitor; -void ovsdb_jsonrpc_monitor_destroy(struct ovsdb_jsonrpc_monitor *); +void ovsdb_jsonrpc_monitor_destroy(struct ovsdb_jsonrpc_monitor *, + bool notify_cancellation); void ovsdb_jsonrpc_disable_monitor_cond(void); #endif /* ovsdb/jsonrpc-server.h */ diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 3e58c3fbd274..97706932614c 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -1613,7 +1613,7 @@ ovsdb_monitors_remove(struct ovsdb *db) * end monitor will also destroy the corresponding 'ovsdb_monitor'. * ovsdb monitor will also be destroied. 
*/ LIST_FOR_EACH_SAFE (jm, next_jm, node, &m->jsonrpc_monitors) { - ovsdb_jsonrpc_monitor_destroy(jm->jsonrpc_monitor); + ovsdb_jsonrpc_monitor_destroy(jm->jsonrpc_monitor, false); } } } @@ -1630,3 +1630,20 @@ ovsdb_monitor_get_memory_usage(struct simap *usage) simap_increase(usage, "json-caches", hmap_count(&dbmon->json_cache)); } } + +void +ovsdb_monitor_prereplace_db(struct ovsdb *db) +{ + struct ovsdb_monitor *m, *next_m; + + LIST_FOR_EACH_SAFE (m, next_m, list_node, &db->monitors) { + struct jsonrpc_monitor_node *jm, *next_jm; + + /* Delete all front end monitors. Removing the last front + * end monitor will also destroy the corresponding 'ovsdb_monitor'. + * ovsdb monitor will also be destroyed. */ + LIST_FOR_EACH_SAFE (jm, next_jm, node, &m->jsonrpc_monitors) { + ovsdb_jsonrpc_monitor_destroy(jm->jsonrpc_monitor, true); + } + } +} diff --git a/ovsdb/monitor.h b/ovsdb/monitor.h index 99d43c45dff9..eb3ff270c9f3 100644 --- a/ovsdb/monitor.h +++ b/ovsdb/monitor.h @@ -49,6 +49,8 @@ struct ovsdb_monitor *ovsdb_monitor_create(struct ovsdb *db, void ovsdb_monitors_remove(struct ovsdb *); void ovsdb_monitors_commit(struct ovsdb *, const struct ovsdb_txn *); +void ovsdb_monitor_prereplace_db(struct ovsdb *); + struct ovsdb_monitor *ovsdb_monitor_add(struct ovsdb_monitor *dbmon); void ovsdb_monitor_add_jsonrpc_monitor(struct ovsdb_monitor *dbmon, diff --git a/ovsdb/ovsdb-client.1.in b/ovsdb/ovsdb-client.1.in index 5dbd49f25263..56d4797e933c 100644 --- a/ovsdb/ovsdb-client.1.in +++ b/ovsdb/ovsdb-client.1.in @@ -22,6 +22,9 @@ ovsdb\-client \- command-line interface to \fBovsdb-server\fR(1) .br \fBovsdb\-client\fR [\fIoptions\fR] \fBlist\-columns\fR [\fIserver\fR] [\fIdatabase\fR] [\fItable\fR] .IP "Database Version Management Commands:" +\fBovsdb\-client \fR[\fIoptions\fR] \fBconvert \fR[\fIserver\fR] \fIschema\fR +.br +\fBovsdb\-client \fR[\fIoptions\fR] \fBneeds\-conversion \fR[\fIserver\fR] \fIschema\fR .br \fBovsdb\-client\fR [\fIoptions\fR] 
\fBget\-schema\-version\fR [\fIserver\fR] [\fIdatabase\fR] .IP "Data Management Commands:" @@ -117,7 +120,43 @@ listed; otherwise, the tables include columns in all tables. These commands work with different versions of OVSDB schemas and databases. . -.IP "\fBget\-schema\-version\fR [\fIserver\fR] [\fIdatabase\fR]" +.IP "\fBconvert \fR[\fIserver\fR] \fIschema\fR" +Reads an OVSDB schema in JSON format, as specified in the OVSDB +specification, from \fIschema\fR, then connects to \fIserver\fR and +requests the server to convert the database whose name is specified in +\fIschema\fR to the schema also specified in \fIschema\fR. +.IP +The conversion is atomic, consistent, isolated, and durable. +Following the schema change, the server notifies clients that use the +\fBset_db_change_aware\fR RPC introduced in Open vSwitch 2.9 and +cancels their outstanding transactions and monitors. The server +disconnects other clients, enabling them to notice the change when +they reconnect. +.IP +This command can do simple ``upgrades'' and ``downgrades'' on a +database's schema. The data in the database must be valid when +interpreted under \fIschema\fR, with only one exception: data for +tables and columns that do not exist in \fIschema\fR are ignored. +Columns that exist in \fIschema\fR but not in the database are set to +their default values. All of \fIschema\fR's constraints apply in +full. +.IP +Some uses of this command can cause unrecoverable data loss. For +example, converting a database from a schema that has a given column +or table to one that does not will delete all data in that column or +table. Back up critical databases before converting them. +.IP +This command works with clustered and standalone databases. +Standalone databases may also be converted (offline) with +\fBovsdb\-tool\fR's \fBconvert\fR command. +. 
+.IP "\fBneeds\-conversion \fR[\fIserver\fR] \fIschema\fR" +Reads the schema from \fIschema\fR, then connects to \fIserver\fR and +requests the schema from the database whose name is specified in +\fIschema\fR. If the two schemas are the same, prints \fBno\fR on +stdout; if they differ, prints \fByes\fR. +. +.IP "\fBget\-schema\-version \fR[\fIserver\fR] [\fIdatabase\fR]" Connects to \fIserver\fR, retrieves the schema for \fIdatabase\fR, and prints its version number on stdout. If \fIdatabase\fR was created before schema versioning was introduced, diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index 600c5070db78..a7cab600c98b 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -77,9 +77,14 @@ static bool timestamp; /* --db-change-aware, --no-db-change-aware: Enable db_change_aware feature for * "monitor" command? * - * (This option is undocumented because it is expected to be useful only for - * testing that the db_change_aware feature actually works.) */ -static int db_change_aware; + * -1 (default): Use db_change_aware if available. + * 0: Disable db_change_aware. + * 1: Require db_change_aware. + * + * (This option is undocumented because anything other than the default is + * expected to be useful only for testing that the db_change_aware feature + * actually works.) */ +static int db_change_aware = -1; /* --force: Ignore schema differences for "restore" command? 
*/ static bool force; @@ -303,6 +308,8 @@ usage(void) " DATABASE on SERVER.\n" " COLUMNs may include !initial, !insert, !delete, !modify\n" " to avoid seeing the specified kinds of changes.\n" + "\n convert [SERVER] SCHEMA\n" + " convert database on SERVER named in SCHEMA to SCHEMA.\n" "\n monitor [SERVER] [DATABASE] ALL\n" " monitor all changes to all columns in all tables\n" " in DATBASE on SERVER.\n" @@ -557,11 +564,40 @@ do_list_columns(struct jsonrpc *rpc, const char *database, table_destroy(&t); } +static void +send_db_change_aware(struct jsonrpc *rpc) +{ + if (db_change_aware != 0) { + struct jsonrpc_msg *request = jsonrpc_create_request( + "set_db_change_aware", + json_array_create_1(json_boolean_create(true)), + NULL); + struct jsonrpc_msg *reply; + int error = jsonrpc_transact_block(rpc, request, &reply); + if (error) { + ovs_fatal(error, "%s: error setting db_change_aware", + jsonrpc_get_name(rpc)); + } + if (reply->type == JSONRPC_ERROR && db_change_aware == 1) { + ovs_fatal(0, "%s: set_db_change_aware failed (%s)", + jsonrpc_get_name(rpc), json_to_string(reply->error, 0)); + } + jsonrpc_msg_destroy(reply); + } +} + static struct json * do_transact__(struct jsonrpc *rpc, struct json *transaction) { struct jsonrpc_msg *request, *reply; + if (db_change_aware == 1) { + send_db_change_aware(rpc); + } + daemon_save_fd(STDOUT_FILENO); + daemon_save_fd(STDERR_FILENO); + daemonize(); + request = jsonrpc_create_request("transact", transaction, NULL); check_txn(jsonrpc_transact_block(rpc, request, &reply), &reply); struct json *result = json_clone(reply->result); @@ -1040,6 +1076,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database, ovs_assert(version < OVSDB_MONITOR_VERSION_MAX); daemon_save_fd(STDOUT_FILENO); + daemon_save_fd(STDERR_FILENO); daemonize_start(false); if (get_detach()) { int error; @@ -1097,22 +1134,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database, free(nodes); } - if (db_change_aware) { - struct jsonrpc_msg *request = 
jsonrpc_create_request( - "set_db_change_aware", - json_array_create_1(json_boolean_create(true)), - NULL); - struct jsonrpc_msg *reply; - int error = jsonrpc_transact_block(rpc, request, &reply); - if (error) { - ovs_fatal(error, "%s: error setting db_change_aware", server); - } - if (reply->type == JSONRPC_ERROR) { - ovs_fatal(0, "%s: set_db_change_aware failed (%s)", - server, json_to_string(reply->error, 0)); - } - jsonrpc_msg_destroy(reply); - } + send_db_change_aware(rpc); monitor = json_array_create_3(json_string_create(database), json_null_create(), monitor_requests); @@ -1174,6 +1196,10 @@ do_monitor__(struct jsonrpc *rpc, const char *database, monitor2_print(params->u.array.elems[1], mts, n_mts); fflush(stdout); } + } else if (msg->type == JSONRPC_NOTIFY + && !strcmp(msg->method, "monitor_canceled")) { + ovs_fatal(0, "%s: %s database was removed", + server, database); } jsonrpc_msg_destroy(msg); } @@ -1229,6 +1255,35 @@ do_monitor_cond(struct jsonrpc *rpc, const char *database, ovsdb_schema_destroy(schema); } +static void +do_convert(struct jsonrpc *rpc, const char *database OVS_UNUSED, + int argc OVS_UNUSED, char *argv[]) +{ + struct ovsdb_schema *new_schema; + check_ovsdb_error(ovsdb_schema_from_file(argv[0], &new_schema)); + + struct jsonrpc_msg *request, *reply; + request = jsonrpc_create_request( + "convert", + json_array_create_2(json_string_create(new_schema->name), + ovsdb_schema_to_json(new_schema)), NULL); + check_txn(jsonrpc_transact_block(rpc, request, &reply), &reply); + jsonrpc_msg_destroy(reply); +} + +static void +do_needs_conversion(struct jsonrpc *rpc, const char *database OVS_UNUSED, + int argc OVS_UNUSED, char *argv[]) +{ + struct ovsdb_schema *schema1; + check_ovsdb_error(ovsdb_schema_from_file(argv[0], &schema1)); + + struct ovsdb_schema *schema2 = fetch_schema(rpc, schema1->name); + puts(ovsdb_schema_equal(schema1, schema2) ? 
"no" : "yes"); + ovsdb_schema_destroy(schema1); + ovsdb_schema_destroy(schema2); +} + struct dump_table_aux { struct ovsdb_datum **data; const struct ovsdb_column **columns; @@ -1910,6 +1965,8 @@ static const struct ovsdb_client_command all_commands[] = { { "query", NEED_RPC, 1, 1, do_query }, { "monitor", NEED_DATABASE, 1, INT_MAX, do_monitor }, { "monitor-cond", NEED_DATABASE, 2, 3, do_monitor_cond }, + { "convert", NEED_RPC, 1, 1, do_convert }, + { "needs-conversion", NEED_RPC, 1, 1, do_needs_conversion }, { "dump", NEED_DATABASE, 0, INT_MAX, do_dump }, { "backup", NEED_DATABASE, 0, 0, do_backup }, { "restore", NEED_DATABASE, 0, 0, do_restore }, diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 1e36b27958f8..f7bf1e270120 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -111,10 +111,11 @@ static char *open_db(struct server_config *config, const char *filename); static void add_server_db(struct server_config *); static void close_db(struct db *db); -static void parse_options(int *argc, char **argvp[], - struct sset *remotes, char **unixctl_pathp, - char **run_command, char **sync_from, - char **sync_exclude, bool *is_backup); +static void parse_options(int argc, char *argvp[], + struct sset *db_filenames, struct sset *remotes, + char **unixctl_pathp, char **run_command, + char **sync_from, char **sync_exclude, + bool *is_backup); OVS_NO_RETURN static void usage(void); static char *reconfigure_remotes(struct ovsdb_jsonrpc_server *, @@ -202,7 +203,9 @@ main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, SHASH_FOR_EACH(node, all_dbs) { struct db *db = node->data; - ovsdb_trigger_run(db->db, time_msec()); + if (ovsdb_trigger_run(db->db, time_msec())) { + ovsdb_jsonrpc_server_reconnect(jsonrpc, false); + } } if (run_process) { process_run(); @@ -265,7 +268,6 @@ main(int argc, char *argv[]) struct shash all_dbs; struct shash_node *node, *next; char *error; - int i; ovs_cmdl_proctitle_init(argc, argv); 
set_program_name(argv[0]); @@ -274,8 +276,8 @@ main(int argc, char *argv[]) process_init(); bool active = false; - parse_options(&argc, &argv, &remotes, &unixctl_path, &run_command, - &sync_from, &sync_exclude, &active); + parse_options(argc, argv, &db_filenames, &remotes, &unixctl_path, + &run_command, &sync_from, &sync_exclude, &active); is_backup = sync_from && !active; daemon_become_new_user(false); @@ -290,17 +292,6 @@ main(int argc, char *argv[]) ovs_fatal(errno, "failed to create temporary file"); } - sset_init(&db_filenames); - if (argc > 0) { - for (i = 0; i < argc; i++) { - sset_add(&db_filenames, argv[i]); - } - } else { - char *default_db = xasprintf("%s/conf.db", ovs_dbdir()); - sset_add(&db_filenames, default_db); - free(default_db); - } - server_config.remotes = &remotes; server_config.config_tmpfile = config_tmpfile; @@ -1477,8 +1468,9 @@ ovsdb_server_get_sync_status(struct unixctl_conn *conn, int argc OVS_UNUSED, } static void -parse_options(int *argcp, char **argvp[], - struct sset *remotes, char **unixctl_pathp, char **run_command, +parse_options(int argc, char *argv[], + struct sset *db_filenames, struct sset *remotes, + char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *active) { enum { @@ -1490,10 +1482,12 @@ parse_options(int *argcp, char **argvp[], OPT_SYNC_FROM, OPT_SYNC_EXCLUDE, OPT_ACTIVE, + OPT_NO_DBS, VLOG_OPTION_ENUMS, DAEMON_OPTION_ENUMS, SSL_OPTION_ENUMS, }; + static const struct option long_options[] = { {"remote", required_argument, NULL, OPT_REMOTE}, {"unixctl", required_argument, NULL, OPT_UNIXCTL}, @@ -1510,14 +1504,15 @@ parse_options(int *argcp, char **argvp[], {"sync-from", required_argument, NULL, OPT_SYNC_FROM}, {"sync-exclude-tables", required_argument, NULL, OPT_SYNC_EXCLUDE}, {"active", no_argument, NULL, OPT_ACTIVE}, + {"no-dbs", no_argument, NULL, OPT_NO_DBS}, {NULL, 0, NULL, 0}, }; char *short_options = ovs_cmdl_long_options_to_short_options(long_options); - int argc = *argcp; - 
char **argv = *argvp; + bool add_default_db = true; *sync_from = NULL; *sync_exclude = NULL; + sset_init(db_filenames); sset_init(remotes); for (;;) { int c; @@ -1596,6 +1591,10 @@ parse_options(int *argcp, char **argvp[], *active = true; break; + case OPT_NO_DBS: + add_default_db = false; + break; + case '?': exit(EXIT_FAILURE); @@ -1605,8 +1604,15 @@ parse_options(int *argcp, char **argvp[], } free(short_options); - *argcp -= optind; - *argvp += optind; + argc -= optind; + argv += optind; + if (argc > 0) { + for (int i = 0; i < argc; i++) { + sset_add(db_filenames, argv[i]); + } + } else if (add_default_db) { + sset_add_and_free(db_filenames, xasprintf("%s/conf.db", ovs_dbdir())); + } } static void diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 19755e673861..89f530bcccfb 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -27,6 +27,7 @@ #include "simap.h" #include "table.h" #include "transaction.h" +#include "trigger.h" struct ovsdb_schema * ovsdb_schema_create(const char *name, const char *version, const char *cksum) @@ -162,7 +163,7 @@ root_set_size(const struct ovsdb_schema *schema) } struct ovsdb_error * -ovsdb_schema_from_json(struct json *json, struct ovsdb_schema **schemap) +ovsdb_schema_from_json(const struct json *json, struct ovsdb_schema **schemap) { struct ovsdb_schema *schema; const struct json *name, *tables, *version_json, *cksum; @@ -361,6 +362,29 @@ ovsdb_create(struct ovsdb_schema *schema) } void +ovsdb_replace(struct ovsdb *dst, struct ovsdb *src) +{ + /* Cancel monitors. */ + ovsdb_monitor_prereplace_db(dst); + + /* Cancel triggers. 
*/ + struct ovsdb_trigger *trigger, *next; + LIST_FOR_EACH_SAFE (trigger, next, node, &dst->triggers) { + ovsdb_trigger_prereplace_db(trigger); + } + + struct ovsdb_schema *tmp_schema = dst->schema; + dst->schema = src->schema; + src->schema = tmp_schema; + + shash_swap(&dst->tables, &src->tables); + + dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role"); + + ovsdb_destroy(src); +} + +void ovsdb_destroy(struct ovsdb *db) { if (db) { diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index 9d915f0f15ae..c3e8f2091e35 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -45,7 +45,7 @@ void ovsdb_schema_destroy(struct ovsdb_schema *); struct ovsdb_error *ovsdb_schema_from_file(const char *file_name, struct ovsdb_schema **) OVS_WARN_UNUSED_RESULT; -struct ovsdb_error *ovsdb_schema_from_json(struct json *, +struct ovsdb_error *ovsdb_schema_from_json(const struct json *, struct ovsdb_schema **) OVS_WARN_UNUSED_RESULT; struct json *ovsdb_schema_to_json(const struct ovsdb_schema *); @@ -68,6 +68,7 @@ struct ovsdb { }; struct ovsdb *ovsdb_create(struct ovsdb_schema *); +void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src); void ovsdb_destroy(struct ovsdb *); void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage); diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index f1502ffb398c..893ea1152c5a 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -806,8 +806,14 @@ update_version(struct ovsdb_txn *txn OVS_UNUSED, struct ovsdb_txn_row *txn_row) return NULL; } -static struct ovsdb_error * -ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) +static bool +ovsdb_txn_is_empty(const struct ovsdb_txn *txn) +{ + return ovs_list_is_empty(&txn->txn_tables); +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_txn_start_commit(struct ovsdb_txn *txn) { struct ovsdb_error *error; @@ -818,29 +824,25 @@ ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) ovsdb_txn_abort(txn); return OVSDB_WRAP_BUG("can't happen", error); } - if 
(ovs_list_is_empty(&txn->txn_tables)) { - ovsdb_txn_abort(txn); + if (ovsdb_txn_is_empty(txn)) { return NULL; } /* Update reference counts and check referential integrity. */ error = update_ref_counts(txn); if (error) { - ovsdb_txn_abort(txn); return error; } /* Delete unreferenced, non-root rows. */ error = for_each_txn_row(txn, collect_garbage); if (error) { - ovsdb_txn_abort(txn); return OVSDB_WRAP_BUG("can't happen", error); } /* Check maximum rows table constraints. */ error = check_max_rows(txn); if (error) { - ovsdb_txn_abort(txn); return error; } @@ -848,14 +850,12 @@ ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) * integrity. */ error = for_each_txn_row(txn, assess_weak_refs); if (error) { - ovsdb_txn_abort(txn); return error; } /* Verify that the indexes will still be unique post-transaction. */ error = for_each_txn_row(txn, check_index_uniqueness); if (error) { - ovsdb_txn_abort(txn); return error; } @@ -865,9 +865,16 @@ ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) return OVSDB_WRAP_BUG("can't happen", error); } + return NULL; +} + +struct ovsdb_error * +ovsdb_txn_finish_commit(struct ovsdb_txn *txn, bool durable) +{ /* Send the commit to each replica. 
*/ if (txn->db->file) { - error = ovsdb_file_commit(txn->db->file, txn, durable); + struct ovsdb_error *error = ovsdb_file_commit(txn->db->file, txn, + durable); if (error) { ovsdb_txn_abort(txn); return error; @@ -887,10 +894,12 @@ ovsdb_txn_commit_(struct ovsdb_txn *txn, bool durable) struct ovsdb_error * ovsdb_txn_commit(struct ovsdb_txn *txn, bool durable) { - struct ovsdb_error *err; - - PERF(__func__, err = ovsdb_txn_commit_(txn, durable)); - return err; + struct ovsdb_error *error = ovsdb_txn_start_commit(txn); + if (error || ovsdb_txn_is_empty(txn)) { + ovsdb_txn_abort(txn); + return error; + } + return ovsdb_txn_finish_commit(txn, durable); } void diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h index 1ecd15a56a8d..f9b886411bf4 100644 --- a/ovsdb/transaction.h +++ b/ovsdb/transaction.h @@ -26,6 +26,11 @@ struct uuid; struct ovsdb_txn *ovsdb_txn_create(struct ovsdb *); void ovsdb_txn_abort(struct ovsdb_txn *); + +struct ovsdb_error *ovsdb_txn_start_commit(struct ovsdb_txn *) + OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *ovsdb_txn_finish_commit(struct ovsdb_txn *, bool durable) + OVS_WARN_UNUSED_RESULT; struct ovsdb_error *ovsdb_txn_commit(struct ovsdb_txn *, bool durable) OVS_WARN_UNUSED_RESULT; diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 165cd6ebbdd1..346db7b5fb28 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -19,42 +19,48 @@ #include +#include "file.h" +#include "log.h" #include "openvswitch/json.h" #include "jsonrpc.h" #include "ovsdb.h" +#include "ovsdb-error.h" #include "openvswitch/poll-loop.h" #include "server.h" #include "util.h" + static bool ovsdb_trigger_try(struct ovsdb_trigger *, long long int now); -static void ovsdb_trigger_complete(struct ovsdb_trigger *); +static void trigger_error(struct ovsdb_trigger *, struct ovsdb_error *); +static void trigger_success(struct ovsdb_trigger *, struct json *result); -void +bool ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db, struct ovsdb_trigger *trigger, - 
struct json *request, long long int now, - bool read_only, const char *role, - const char *id) + struct jsonrpc_msg *request, long long int now, + bool read_only, const char *role, const char *id) { + ovs_assert(!strcmp(request->method, "transact") || + !strcmp(request->method, "convert")); trigger->session = session; trigger->db = db; ovs_list_push_back(&trigger->db->triggers, &trigger->node); trigger->request = request; - trigger->result = NULL; + trigger->reply = NULL; trigger->created = now; trigger->timeout_msec = LLONG_MAX; trigger->read_only = read_only; trigger->role = nullable_xstrdup(role); trigger->id = nullable_xstrdup(id); - ovsdb_trigger_try(trigger, now); + return ovsdb_trigger_try(trigger, now); } void ovsdb_trigger_destroy(struct ovsdb_trigger *trigger) { ovs_list_remove(&trigger->node); - json_destroy(trigger->request); - json_destroy(trigger->result); + jsonrpc_msg_destroy(trigger->request); + jsonrpc_msg_destroy(trigger->reply); free(trigger->role); free(trigger->id); } @@ -62,30 +68,53 @@ ovsdb_trigger_destroy(struct ovsdb_trigger *trigger) bool ovsdb_trigger_is_complete(const struct ovsdb_trigger *trigger) { - return trigger->result != NULL; + return trigger->reply != NULL; } -struct json * -ovsdb_trigger_steal_result(struct ovsdb_trigger *trigger) +struct jsonrpc_msg * +ovsdb_trigger_steal_reply(struct ovsdb_trigger *trigger) { - struct json *result = trigger->result; - trigger->result = NULL; - return result; + struct jsonrpc_msg *reply = trigger->reply; + trigger->reply = NULL; + return reply; } void +ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger) +{ + if (!strcmp(trigger->request->method, "transact")) { + trigger_error(trigger, ovsdb_error("canceled", NULL)); + } else if (!strcmp(trigger->request->method, "convert")) { + /* We don't cancel "convert" requests when a database is being replaced + * for two reasons. 
First, we expect the administrator to do some kind + * of sensible synchronization on conversion requests, that is, it only + * really makes sense for the admin to do a single conversion at a time + * at a scheduled point. Second, if we did then every "convert" + * request would end up getting canceled since "convert" itself causes + * the database to be replaced. */ + } else { + OVS_NOT_REACHED(); + } +} + +bool ovsdb_trigger_run(struct ovsdb *db, long long int now) { struct ovsdb_trigger *t, *next; - bool run_triggers; - run_triggers = db->run_triggers; + bool run_triggers = db->run_triggers; db->run_triggers = false; + + bool disconnect_all = false; + LIST_FOR_EACH_SAFE (t, next, node, &db->triggers) { if (run_triggers || now - t->created >= t->timeout_msec) { - ovsdb_trigger_try(t, now); + if (ovsdb_trigger_try(t, now)) { + disconnect_all = true; + } } } + return disconnect_all; } void @@ -118,22 +147,81 @@ ovsdb_trigger_wait(struct ovsdb *db, long long int now) static bool ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) { - t->result = ovsdb_execute(t->db, t->session, - t->request, t->read_only, - t->role, t->id, - now - t->created, &t->timeout_msec); - if (t->result) { - ovsdb_trigger_complete(t); + if (!strcmp(t->request->method, "transact")) { + struct json *result = ovsdb_execute(t->db, t->session, + t->request->params, t->read_only, + t->role, t->id, now - t->created, + &t->timeout_msec); + if (result) { + trigger_success(t, result); + } + return false; + } else if (!strcmp(t->request->method, "convert")) { + /* Permission check. */ + if (t->role && *t->role) { + trigger_error(t, ovsdb_perm_error( + "RBAC rules for client \"%s\" role \"%s\" " + "prohibit \"convert\" of database %s " + "(only the root role may convert databases)", + t->id, t->role, t->db->schema->name)); + return false; + } + + /* Validate parameters. 
*/ + const struct json *params = t->request->params; + if (params->type != JSON_ARRAY || params->u.array.n != 2) { + trigger_error(t, ovsdb_syntax_error(params, NULL, + "array expected")); + return false; + } + + /* Parse new schema and make a converted copy. */ + const struct json *new_schema_json = params->u.array.elems[1]; + struct ovsdb_schema *new_schema; + struct ovsdb_error *error = ovsdb_schema_from_json(new_schema_json, + &new_schema); + if (!error && strcmp(new_schema->name, t->db->schema->name)) { + error = ovsdb_error( + "invalid parameters", + "new schema name (%s) does not match database name (%s)", + new_schema->name, t->db->schema->name); + } + if (!error) { + error = ovsdb_file_convert(t->db->file, new_schema); + } + ovsdb_schema_destroy(new_schema); + if (error) { + trigger_error(t, error); + return false; + } + + trigger_success(t, json_object_create()); return true; } else { - return false; + OVS_NOT_REACHED(); } } static void -ovsdb_trigger_complete(struct ovsdb_trigger *t) +ovsdb_trigger_complete(struct ovsdb_trigger *t, struct jsonrpc_msg *reply) { - ovs_assert(t->result != NULL); + ovs_assert(reply && !t->reply); + t->reply = reply; ovs_list_remove(&t->node); ovs_list_push_back(&t->session->completions, &t->node); } + +static void +trigger_error(struct ovsdb_trigger *t, struct ovsdb_error *error) +{ + struct jsonrpc_msg *reply = jsonrpc_create_error( + ovsdb_error_to_json_free(error), t->request->id); + ovsdb_trigger_complete(t, reply); +} + +static void +trigger_success(struct ovsdb_trigger *t, struct json *result) +{ + struct jsonrpc_msg *reply = jsonrpc_create_reply(result, t->request->id); + ovsdb_trigger_complete(t, reply); +} diff --git a/ovsdb/trigger.h b/ovsdb/trigger.h index 90246a4a42bd..d9df97f31222 100644 --- a/ovsdb/trigger.h +++ b/ovsdb/trigger.h @@ -25,8 +25,8 @@ struct ovsdb_trigger { struct ovsdb *db; /* Database on which trigger acts. */ struct ovs_list node; /* !result: in db->triggers; * result: in session->completions. 
*/ - struct json *request; /* Database request. */ - struct json *result; /* Result (null if none yet). */ + struct jsonrpc_msg *request; /* Database request. */ + struct jsonrpc_msg *reply; /* Result (null if none yet). */ long long int created; /* Time created. */ long long int timeout_msec; /* Max wait duration. */ bool read_only; /* Database is in read only mode. */ @@ -34,17 +34,18 @@ struct ovsdb_trigger { char *id; /* ID, for role-based access controls. */ }; -void ovsdb_trigger_init(struct ovsdb_session *, struct ovsdb *, +bool ovsdb_trigger_init(struct ovsdb_session *, struct ovsdb *, struct ovsdb_trigger *, - struct json *request, long long int now, - bool read_only, const char *role, - const char *id); + struct jsonrpc_msg *request, long long int now, + bool read_only, const char *role, const char *id); void ovsdb_trigger_destroy(struct ovsdb_trigger *); bool ovsdb_trigger_is_complete(const struct ovsdb_trigger *); -struct json *ovsdb_trigger_steal_result(struct ovsdb_trigger *); +struct jsonrpc_msg *ovsdb_trigger_steal_reply(struct ovsdb_trigger *); -void ovsdb_trigger_run(struct ovsdb *, long long int now); +void ovsdb_trigger_prereplace_db(struct ovsdb_trigger *); + +bool ovsdb_trigger_run(struct ovsdb *, long long int now); void ovsdb_trigger_wait(struct ovsdb *, long long int now); #endif /* ovsdb/trigger.h */ diff --git a/tests/ovsdb-monitor.at b/tests/ovsdb-monitor.at index 2434f43cb761..917a5cc09ace 100644 --- a/tests/ovsdb-monitor.at +++ b/tests/ovsdb-monitor.at @@ -29,11 +29,11 @@ m4_define([OVSDB_CHECK_MONITOR], on_exit 'kill `cat ovsdb-server.pid`' AT_CAPTURE_FILE([ovsdb-client-log]) if test "$IS_WIN32" = "yes"; then - AT_CHECK([ovsdb-client -vjsonrpc --pidfile --log-file="`pwd`"/ovsdb-client-log -d json monitor --format=csv unix:socket $4 $5 $8 > output 2>/dev/null &], + AT_CHECK([ovsdb-client -vjsonrpc --detach --pidfile --log-file="`pwd`"/ovsdb-client-log -d json monitor --format=csv unix:socket $4 $5 $8 > output], [0], [ignore], 
[ignore]) sleep 1 else - AT_CHECK([ovsdb-client -vjsonrpc --detach --no-chdir --pidfile --log-file="`pwd`"/ovsdb-client-log -d json monitor --format=csv unix:socket $4 $5 $8 > output], + AT_CHECK([ovsdb-client -vjsonrpc --detach --no-chdir --pidfile --log-file="`pwd`"/ovsdb-client-log -d json monitor --format=csv unix:socket $4 $5 $8 > output 2>/dev/null], [0], [ignore], [ignore]) fi on_exit 'kill `cat ovsdb-client.pid`' diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 2e3d8ad14636..54ff04ef3146 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -790,6 +790,269 @@ _uuid name number OVSDB_SERVER_SHUTDOWN AT_CLEANUP +AT_SETUP([schema conversion online]) +AT_KEYWORDS([ovsdb server convert needs-conversion]) +on_exit 'kill `cat *.pid`' +ordinal_schema > schema +AT_DATA([new-schema], + [[{"name": "ordinals", + "tables": { + "ordinals": { + "columns": { + "number": {"type": "integer"}}}}} +]]) +dnl Make sure that "ovsdb-tool create" works with a dangling symlink for +dnl the database and the lockfile, creating the target of each symlink rather +dnl than replacing the symlinks with regular files. +mkdir dir +if test "$IS_WIN32" = "no"; then + ln -s dir/db db + ln -s dir/.db.~lock~ .db.~lock~ + AT_SKIP_IF([test ! -h db || test ! -h .db.~lock~]) +fi +AT_CHECK([ovsdb-tool create db schema]) +dnl Put some data in the database. +AT_CHECK( + [[for pair in 'zero 0' 'one 1' 'two 2' 'three 3' 'four 4' 'five 5'; do + set -- $pair + ovsdb-tool transact db ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"name": "'$1'", "number": '$2'}}, + {"op": "comment", + "comment": "add row for '"$pair"'"}]' + done | uuidfilt]], [0], +[[[{"uuid":["uuid","<0>"]},{}] +[{"uuid":["uuid","<1>"]},{}] +[{"uuid":["uuid","<2>"]},{}] +[{"uuid":["uuid","<3>"]},{}] +[{"uuid":["uuid","<4>"]},{}] +[{"uuid":["uuid","<5>"]},{}] +]], [ignore]) + +dnl Start the database server. 
+AT_CHECK([ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0]) +AT_CAPTURE_FILE([ovsdb-server.log]) + +dnl Try "needs-conversion". +AT_CHECK([ovsdb-client needs-conversion schema], [0], [no +]) +AT_CHECK([ovsdb-client needs-conversion new-schema], [0], [yes +]) + +dnl Start two monitors on the 'ordinals' db, one that is database +dnl change aware and one that is not. +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-ordinals-aware.pid --log-file=monitor-ordinals-aware.log --db-change-aware --no-headings monitor ordinals ordinals number name > monitor-ordinals-aware.stdout 2> monitor-ordinals-aware.stderr]) +AT_CAPTURE_FILE([monitor-ordinals-aware.stdout]) +AT_CAPTURE_FILE([monitor-ordinals-aware.log]) +AT_CAPTURE_FILE([monitor-ordinals-aware.stderr]) + +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-ordinals-unaware.pid --log-file=monitor-ordinals-unaware.log --no-db-change-aware --no-headings monitor ordinals ordinals number name > monitor-ordinals-unaware.stdout 2> monitor-ordinals-unaware.stderr]) +AT_CAPTURE_FILE([monitor-ordinals-unaware.stdout]) +AT_CAPTURE_FILE([monitor-ordinals-unaware.log]) +AT_CAPTURE_FILE([monitor-ordinals-unaware.stderr]) + +dnl Start two monitors on the '_Server' db, one that is database +dnl change aware and one that is not. 
+AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-server-aware.pid --log-file=monitor-server-aware.log --db-change-aware --no-headings monitor _Server Database name > monitor-server-aware.stdout 2> monitor-server-aware.stderr]) +AT_CAPTURE_FILE([monitor-server-aware.stdout]) +AT_CAPTURE_FILE([monitor-server-aware.log]) +AT_CAPTURE_FILE([monitor-server-aware.stderr]) + +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-server-unaware.pid --log-file=monitor-server-unaware.log --no-db-change-aware --no-headings monitor _Server Database name > monitor-server-unaware.stdout 2> monitor-server-unaware.stderr]) +AT_CAPTURE_FILE([monitor-server-unaware.stdout]) +AT_CAPTURE_FILE([monitor-server-unaware.log]) +AT_CAPTURE_FILE([monitor-server-unaware.stderr]) + +dnl Start two long-running transactions (triggers) on the 'ordinals' db, +dnl one that is database change aware and one that is not. +ordinals_txn='[["ordinals", + {"op": "wait", + "table": "ordinals", + "where": [["name", "==", "seven"]], + "columns": ["name", "number"], + "rows": [], + "until": "!="}]]' +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-ordinals-aware.pid --log-file=trigger-ordinals-aware.log --db-change-aware transact "$ordinals_txn" > trigger-ordinals-aware.stdout 2> trigger-ordinals-aware.stderr]) +AT_CAPTURE_FILE([trigger-ordinals-aware.stdout]) +AT_CAPTURE_FILE([trigger-ordinals-aware.log]) +AT_CAPTURE_FILE([trigger-ordinals-aware.stderr]) + +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-ordinals-unaware.pid --log-file=trigger-ordinals-unaware.log --no-db-change-aware transact "$ordinals_txn" > trigger-ordinals-unaware.stdout 2> trigger-ordinals-unaware.stderr]) +AT_CAPTURE_FILE([trigger-ordinals-unaware.stdout]) +AT_CAPTURE_FILE([trigger-ordinals-unaware.log]) +AT_CAPTURE_FILE([trigger-ordinals-unaware.stderr]) + +dnl Start two long-running transactions (triggers) on the _Server db, +dnl one that is database change 
aware and one that is not. +server_txn='[["_Server", + {"op": "wait", + "table": "Database", + "where": [["name", "==", "xyzzy"]], + "columns": ["name"], + "rows": [], + "until": "!="}]]' +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-server-aware.pid --log-file=trigger-server-aware.log --db-change-aware transact "$server_txn" > trigger-server-aware.stdout 2> trigger-server-aware.stderr]) +AT_CAPTURE_FILE([trigger-server-aware.stdout]) +AT_CAPTURE_FILE([trigger-server-aware.log]) +AT_CAPTURE_FILE([trigger-server-aware.stderr]) + +AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-server-unaware.pid --log-file=trigger-server-unaware.log --no-db-change-aware transact "$server_txn" > trigger-server-unaware.stdout 2> trigger-server-unaware.stderr]) +AT_CAPTURE_FILE([trigger-server-unaware.stdout]) +AT_CAPTURE_FILE([trigger-server-unaware.log]) +AT_CAPTURE_FILE([trigger-server-unaware.stderr]) + +dnl Dump out and check the actual database contents. +AT_CHECK([ovsdb-client dump unix:db.sock ordinals], [0], [stdout]) +AT_CHECK([uuidfilt stdout], [0], [dnl +ordinals table +_uuid name number +------------------------------------ ----- ------ +<0> five 5 +<1> four 4 +<2> one 1 +<3> three 3 +<4> two 2 +<5> zero 0 +]) + +dnl Convert the database. +AT_CHECK([ovsdb-client convert new-schema]) + +dnl Try "needs-conversion". +AT_CHECK([ovsdb-client needs-conversion schema], [0], [yes +]) +AT_CHECK([ovsdb-client needs-conversion new-schema], [0], [no +]) + +dnl Verify that the "ordinals" monitors behaved as they should have. +dnl Both should have exited, for different reasons. +dnl The db-aware _Server monitor should still be running, but not the unaware +dnl one. 
+for x in unaware aware; do + OVS_WAIT_WHILE([test -e monitor-ordinals-$x.pid]) + AT_CHECK([sort -k 3 monitor-ordinals-$x.stdout | uuidfilt], [0], +[<0> initial 0 zero +<1> initial 1 one +<2> initial 2 two +<3> initial 3 three +<4> initial 4 four +<5> initial 5 five +]) +done +AT_CHECK([sed 's/.*: //' monitor-ordinals-unaware.stderr], [0], [receive failed (End of file) +]) +AT_CHECK([sed 's/.*: //' monitor-ordinals-aware.stderr], [0], [ordinals database was removed +]) + +dnl Verify that the _Server monitors behaved as they should have. +dnl The db-aware monitor should still be running, but not the unaware one. +for x in aware unaware; do + AT_CHECK([sort -k 3 monitor-server-$x.stdout | uuidfilt], [0], +[<0> initial _Server +<1> initial ordinals +]) +done +OVS_WAIT_WHILE([test -e monitor-server-unaware.pid]) +AT_CHECK([sed 's/.*: //' monitor-ordinals-unaware.stderr], [0], [receive failed (End of file) +]) +AT_CHECK([test -e monitor-server-aware.pid]) + +dnl Verify that the "ordinals" triggers behaved as they should have: +dnl Both should have exited, for different reasons. +for x in unaware aware; do + OVS_WAIT_WHILE([test -e trigger-ordinals-$x.pid]) + AT_CHECK([cat trigger-ordinals-$x.stdout]) +done +AT_CHECK([cat trigger-ordinals-unaware.stderr], [0], [ovsdb-client: transaction failed (End of file) +]) +AT_CHECK([cat trigger-ordinals-aware.stderr], [0], [ovsdb-client: transaction returned error: {"error":"canceled"} +]) + +dnl Verify that the _Server triggers behaved as they should have: +dnl The db-aware trigger should still be waiting, but not the unaware one. 
+for x in aware unaware; do + AT_CHECK([cat trigger-server-$x.stdout]) +done +OVS_WAIT_WHILE([test -e trigger-server-unaware.pid]) +AT_CHECK([sed 's/.*: //' trigger-ordinals-unaware.stderr], [0], [transaction failed (End of file) +]) +AT_CHECK([test -e trigger-server-aware.pid]) + +dnl We can't fully re-check the contents of the database log, because the +dnl order of the records is not predictable, but there should only be 4 lines +dnl in it now. +AT_CAPTURE_FILE([db]) +AT_CHECK([test `wc -l < db` -eq 4]) +dnl And check that the dumped data is the same except for the removed column: +AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl +ordinals table +_uuid number +------------------------------------ ------ +<0> 0 +<1> 1 +<2> 2 +<3> 3 +<4> 4 +<5> 5 +]) +dnl Now check that the converted database is still online and can be modified, +dnl then check that the database log has one more record and that the data +dnl is as expected. +AT_CHECK( + [[ovsdb-client transact ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 6}}, + {"op": "comment", + "comment": "add row for 6"}]' | uuidfilt]], [0], + [[[{"uuid":["uuid","<0>"]},{}] +]]) +AT_CHECK([test `wc -l < db` -eq 6]) +AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl +ordinals table +_uuid number +------------------------------------ ------ +<0> 0 +<1> 1 +<2> 2 +<3> 3 +<4> 4 +<5> 5 +<6> 6 +]) +dnl Now kill and restart the database server to ensure that the data is +dnl correct on disk as well as in memory. 
+OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +AT_CHECK([[ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db]], + [0]) +AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl +ordinals table +_uuid number +------------------------------------ ------ +<0> 0 +<1> 1 +<2> 2 +<3> 3 +<4> 4 +<5> 5 +<6> 6 +]) + +dnl Make sure that "db" is still a symlink to dir/db instead of getting +dnl replaced by a regular file, ditto for .db.~lock~. +if test "$IS_WIN32" = "no"; then + AT_CHECK([test -h db]) + AT_CHECK([test -h .db.~lock~]) + AT_CHECK([test -f dir/db]) + AT_CHECK([test -f dir/.db.~lock~]) +fi + +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +AT_CLEANUP + AT_SETUP([ovsdb-server combines updates on backlogged connections]) on_exit 'kill `cat *.pid`' diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index c0c5a4df51af..8502ad73ff69 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -1522,14 +1522,14 @@ struct test_trigger { static void do_trigger_dump(struct test_trigger *t, long long int now, const char *title) { - struct json *result; + struct jsonrpc_msg *reply; char *s; - result = ovsdb_trigger_steal_result(&t->trigger); - s = json_to_string(result, JSSF_SORT); + reply = ovsdb_trigger_steal_reply(&t->trigger); + s = json_to_string(reply->result, JSSF_SORT); printf("t=%lld: trigger %d (%s): %s\n", now, t->number, title, s); free(s); - json_destroy(result); + jsonrpc_msg_destroy(reply); ovsdb_trigger_destroy(&t->trigger); free(t); } @@ -1569,8 +1569,10 @@ do_trigger(struct ovs_cmdl_context *ctx) json_destroy(params); } else { struct test_trigger *t = xmalloc(sizeof *t); - ovsdb_trigger_init(&session, db, &t->trigger, params, now, false, - NULL, NULL); + ovsdb_trigger_init(&session, db, &t->trigger, + jsonrpc_create_request("transact", params, + NULL), + now, false, NULL, NULL); t->number = number++; if (ovsdb_trigger_is_complete(&t->trigger)) { do_trigger_dump(t, now, "immediate"); From patchwork Mon Jan 1 
05:16:37 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854294 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95D469rbz9t84 for ; Mon, 1 Jan 2018 16:22:12 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id DB826CED; Mon, 1 Jan 2018 05:17:08 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id EB28ECE8 for ; Mon, 1 Jan 2018 05:17:07 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id E792C14B for ; Mon, 1 Jan 2018 05:17:06 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id B3A2DC5A53; Mon, 1 Jan 2018 06:17:04 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:37 -0800 Message-Id: <20180101051640.13043-12-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: 
<20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 12/15] ovsdb-client: Add --timeout option. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Signed-off-by: Ben Pfaff Reviewed-by: Yifeng Sun Acked-by: Justin Pettit --- NEWS | 1 + ovsdb/ovsdb-client.1.in | 6 ++++++ ovsdb/ovsdb-client.c | 12 ++++++++++++ tests/ovs-macros.at | 4 ++++ 4 files changed, 23 insertions(+) diff --git a/NEWS b/NEWS index dfc2fb7728a4..646879c61677 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,7 @@ Post-v2.8.0 ovsdb-server(5) for more details. * ovsdb-client: New "get-schema-cksum" and "query" commands. * ovsdb-client: New "backup" and "restore" commands. + * ovsdb-client: New --timeout option. * ovsdb-tool: New "db-name" and "schema-name" commands. - ovs-vsctl and other commands that display data in tables now support a --max-column-width option to limit column width. diff --git a/ovsdb/ovsdb-client.1.in b/ovsdb/ovsdb-client.1.in index 56d4797e933c..727e9c6e0651 100644 --- a/ovsdb/ovsdb-client.1.in +++ b/ovsdb/ovsdb-client.1.in @@ -313,6 +313,12 @@ table update. Most output formats add the timestamp on a line of its own just above the table. The JSON output format puts the timestamp in a member of the top-level JSON object named \fBtime\fR. . +.IP "\fB\-t\fR" +.IQ "\fB\-\-timeout=\fIsecs\fR" +Limits \fBovsdb\-client\fR runtime to approximately \fIsecs\fR +seconds. If the timeout expires, \fBovsdb\-client\fR will exit with a +\fBSIGALRM\fR signal. +. .SS "Daemon Options" The daemon options apply only to the \fBmonitor\fR and \fBmonitor\-cond\fR commands. 
With any other command, they have no effect. diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index a7cab600c98b..b00f04147d39 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -213,6 +213,7 @@ parse_options(int argc, char *argv[]) {"force", no_argument, NULL, OPT_FORCE}, {"db-change-aware", no_argument, &db_change_aware, 1}, {"no-db-change-aware", no_argument, &db_change_aware, 0}, + {"timeout", required_argument, NULL, 't'}, VLOG_LONG_OPTIONS, DAEMON_LONG_OPTIONS, #ifdef HAVE_OPENSSL @@ -227,6 +228,7 @@ parse_options(int argc, char *argv[]) table_style.format = TF_TABLE; for (;;) { + unsigned long int timeout; int c; c = getopt_long(argc, argv, short_options, long_options, NULL); @@ -259,6 +261,16 @@ parse_options(int argc, char *argv[]) force = true; break; + case 't': + timeout = strtoul(optarg, NULL, 10); + if (timeout <= 0) { + ovs_fatal(0, "value %s on -t or --timeout is not at least 1", + optarg); + } else { + time_alarm(timeout); + } + break; + case '?': exit(EXIT_FAILURE); diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at index 56d0a3bca86d..82df193871b3 100644 --- a/tests/ovs-macros.at +++ b/tests/ovs-macros.at @@ -122,6 +122,7 @@ if [ $? -eq 0 ]; then alias ovn-sbctl='OVS_SBCTL_TIMEOUT' alias ovn-nbctl='OVN_NBCTL_TIMEOUT' alias vtep-ctl='VTEP_CTL_TIMEOUT' + alias ovsdb-client='OVSDB_CLIENT_TIMEOUT' OVS_OFCTL_TIMEOUT () { command ovs-ofctl --timeout=$OVS_TIMEOUT "$@" } @@ -137,6 +138,9 @@ if [ $? 
-eq 0 ]; then VTEP_CTL_TIMEOUT () { command vtep-ctl --timeout=$OVS_TIMEOUT "$@" } + OVSDB_CLIENT_TIMEOUT () { + command ovsdb-client --timeout=$OVS_TIMEOUT "$@" + } fi # parent_pid PID From patchwork Mon Jan 1 05:16:38 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854295 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95Dk4p1Sz9t84 for ; Mon, 1 Jan 2018 16:22:46 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 1E217CF5; Mon, 1 Jan 2018 05:17:10 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp2.linuxfoundation.org (smtp2.linux-foundation.org [172.17.192.36]) by mail.linuxfoundation.org (Postfix) with ESMTPS id CC5F6CEB for ; Mon, 1 Jan 2018 05:17:08 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp2.linuxfoundation.org (Postfix) with ESMTPS id 813921DB35 for ; Mon, 1 Jan 2018 05:17:08 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 4D7FBC5A54; Mon, 1 Jan 2018 06:17:05 +0100 (CET) From: Ben Pfaff To: 
dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:38 -0800 Message-Id: <20180101051640.13043-13-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp2.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 13/15] jsonrpc: Add comment for jsonrpc_msg_to_json(). X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org From a glance at the prototype it wasn't obvious that it destroyed its argument. Signed-off-by: Ben Pfaff Acked-by: Justin Pettit --- lib/jsonrpc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c index a8e5bc8434ad..f8786f909ac8 100644 --- a/lib/jsonrpc.c +++ b/lib/jsonrpc.c @@ -716,6 +716,9 @@ exit: return error; } +/* Returns 'm' converted to JSON suitable for sending as a JSON-RPC message. + * + * Consumes and destroys 'm'. 
*/ struct json * jsonrpc_msg_to_json(struct jsonrpc_msg *m) { From patchwork Mon Jan 1 05:16:39 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854297 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95Fh6pmkz9t84 for ; Mon, 1 Jan 2018 16:23:36 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 0B68FD06; Mon, 1 Jan 2018 05:17:14 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp2.linuxfoundation.org (smtp2.linux-foundation.org [172.17.192.36]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 2B6CBCF6 for ; Mon, 1 Jan 2018 05:17:10 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp2.linuxfoundation.org (Postfix) with ESMTPS id D7FFB1DB35 for ; Mon, 1 Jan 2018 05:17:09 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id 93E5AC5A51; Mon, 1 Jan 2018 06:17:07 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:39 -0800 Message-Id: <20180101051640.13043-14-blp@ovn.org> X-Mailer: git-send-email 
2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> X-Spam-Status: No, score=-2.6 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp2.linux-foundation.org Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 14/15] json: Make it safe to pass null pointers to json_equal(). X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Signed-off-by: Ben Pfaff Reviewed-by: Yifeng Sun Acked-by: Justin Pettit --- lib/json.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/json.c b/lib/json.c index 5e93190b8a03..07ca87b2130f 100644 --- a/lib/json.c +++ b/lib/json.c @@ -576,9 +576,9 @@ json_equal(const struct json *a, const struct json *b) { if (a == b) { return true; - } - - if (a->type != b->type) { + } else if (!a || !b) { + return false; + } else if (a->type != b->type) { return false; } From patchwork Mon Jan 1 05:16:40 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Ben Pfaff X-Patchwork-Id: 854298 X-Patchwork-Delegate: jpettit@nicira.com Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z95GJ4lsFz9t84 for ; Mon, 1 Jan 2018 16:24:08 +1100 (AEDT) Received: from 
mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 629E6CCF; Mon, 1 Jan 2018 05:17:22 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp2.linuxfoundation.org (smtp2.linux-foundation.org [172.17.192.36]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 62CB3C79 for ; Mon, 1 Jan 2018 05:17:21 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from relay2-d.mail.gandi.net (relay2-d.mail.gandi.net [217.70.183.194]) by smtp2.linuxfoundation.org (Postfix) with ESMTPS id CBC4D1DB35 for ; Mon, 1 Jan 2018 05:17:19 +0000 (UTC) X-Originating-IP: 173.228.112.64 Received: from sigabrt.gateway.sonic.net (173-228-112-64.dsl.dynamic.fusionbroadband.com [173.228.112.64]) (Authenticated sender: blp@ovn.org) by relay2-d.mail.gandi.net (Postfix) with ESMTPSA id F06FCC5A49; Mon, 1 Jan 2018 06:17:08 +0100 (CET) From: Ben Pfaff To: dev@openvswitch.org Date: Sun, 31 Dec 2017 21:16:40 -0800 Message-Id: <20180101051640.13043-15-blp@ovn.org> X-Mailer: git-send-email 2.10.2 In-Reply-To: <20180101051640.13043-1-blp@ovn.org> References: <20180101051640.13043-1-blp@ovn.org> MIME-Version: 1.0 Cc: Ben Pfaff Subject: [ovs-dev] [PATCH 15/15] ovsdb: Introduce experimental support for clustered databases. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org This commit adds support for OVSDB clustering via Raft. Please read ovsdb(7) for information on how to set up a clustered database. It is simple and boils down to running "ovsdb-tool create-cluster" on one server and "ovsdb-tool join-cluster" on each of the others and then starting ovsdb-server in the usual way on all of them. 
One you have a clustered database, you configure ovn-controller and ovn-northd to use it by pointing them to all of the servers, e.g. where previously you might have said "tcp:1.2.3.4" was the database server, now you say that it is "tcp:1.2.3.4,tcp:5.6.7.8,tcp:9.10.11.12". This adds support for database clustering to ovs-sandbox and ovn-ctl also. Numan Siddique contributed the clustering support for ovn-ctl. Signed-off-by: Ben Pfaff Co-authored-by: Numan Siddique Signed-off-by: Numan Siddique --- Documentation/ref/ovsdb.5.rst | 208 +- Documentation/ref/ovsdb.7.rst | 229 ++- NEWS | 17 +- lib/.gitignore | 3 + lib/automake.mk | 10 + lib/jsonrpc.c | 29 + lib/jsonrpc.h | 6 + lib/ovsdb-idl.c | 603 ++++-- lib/ovsdb-idl.h | 2 + lib/ovsdb-server-idl.ann | 9 + lib/ovsdb-session.c | 76 + lib/ovsdb-session.h | 25 + lib/uuid.h | 12 + ovn/controller/ovn-controller.c | 1 + ovn/utilities/ovn-ctl | 123 +- ovn/utilities/ovn-nbctl.8.xml | 15 + ovn/utilities/ovn-nbctl.c | 9 + ovn/utilities/ovn-sbctl.8.in | 13 + ovn/utilities/ovn-sbctl.c | 9 + ovsdb/TODO.rst | 61 + ovsdb/_server.ovsschema | 18 +- ovsdb/_server.xml | 71 +- ovsdb/automake.mk | 10 + ovsdb/execution.c | 95 +- ovsdb/file.c | 767 ++----- ovsdb/file.h | 40 +- ovsdb/jsonrpc-server.c | 69 +- ovsdb/jsonrpc-server.h | 5 +- ovsdb/log.c | 8 + ovsdb/log.h | 2 + ovsdb/ovsdb-client.1.in | 45 +- ovsdb/ovsdb-client.c | 594 +++++- ovsdb/ovsdb-server.1.in | 78 +- ovsdb/ovsdb-server.c | 356 +++- ovsdb/ovsdb-tool.1.in | 145 +- ovsdb/ovsdb-tool.c | 992 ++++++++- ovsdb/ovsdb-util.c | 11 +- ovsdb/ovsdb-util.h | 4 + ovsdb/ovsdb.c | 174 +- ovsdb/ovsdb.h | 31 +- ovsdb/raft-private.c | 735 +++++++ ovsdb/raft-private.h | 208 ++ ovsdb/raft-rpc.c | 1022 +++++++++ ovsdb/raft-rpc.h | 292 +++ ovsdb/raft.c | 4321 +++++++++++++++++++++++++++++++++++++++ ovsdb/raft.h | 181 ++ ovsdb/replication.c | 6 +- ovsdb/row.c | 3 + ovsdb/server.c | 14 +- ovsdb/server.h | 2 +- ovsdb/storage.c | 574 ++++++ ovsdb/storage.h | 95 + ovsdb/transaction.c | 210 +- 
ovsdb/transaction.h | 19 +- ovsdb/trigger.c | 277 ++- ovsdb/trigger.h | 27 +- tests/.gitignore | 1 + tests/automake.mk | 1 + tests/ovs-macros.at | 29 + tests/ovsdb-cluster.at | 281 +++ tests/ovsdb-idl.at | 2 +- tests/ovsdb-monitor.at | 87 +- tests/ovsdb-server.at | 630 +++--- tests/ovsdb-tool.at | 118 +- tests/ovsdb.at | 1 + tests/test-ovsdb.c | 9 +- tutorial/ovs-sandbox | 165 +- utilities/ovs-lib.in | 47 + 68 files changed, 12588 insertions(+), 1744 deletions(-) create mode 100644 lib/ovsdb-server-idl.ann create mode 100644 lib/ovsdb-session.c create mode 100644 lib/ovsdb-session.h create mode 100644 ovsdb/TODO.rst create mode 100644 ovsdb/raft-private.c create mode 100644 ovsdb/raft-private.h create mode 100644 ovsdb/raft-rpc.c create mode 100644 ovsdb/raft-rpc.h create mode 100644 ovsdb/raft.c create mode 100644 ovsdb/raft.h create mode 100644 ovsdb/storage.c create mode 100644 ovsdb/storage.h create mode 100644 tests/ovsdb-cluster.at diff --git a/Documentation/ref/ovsdb.5.rst b/Documentation/ref/ovsdb.5.rst index f3e50976b5c7..0ab888996eac 100644 --- a/Documentation/ref/ovsdb.5.rst +++ b/Documentation/ref/ovsdb.5.rst @@ -30,9 +30,11 @@ ovsdb Description =========== -OVSDB, the Open vSwitch Database, is a database system whose network -protocol is specified by RFC 7047. The RFC does not specify an on-disk -storage format. This manpage documents the format used by Open vSwitch. +OVSDB, the Open vSwitch Database, is a database system whose network protocol +is specified by RFC 7047. The RFC does not specify an on-disk storage format. +The OVSDB implementation in Open vSwitch implements two storage formats: one +for standalone (and active-backup) databases, and the other for clustered +databases. This manpage documents both of these formats. Most users do not need to be concerned with this specification. Instead, to manipulate OVSDB files, refer to `ovsdb-tool(1)`. For an @@ -47,14 +49,16 @@ infer it. OVSDB files do not include the values of ephemeral columns. 
-Database files are text files encoded in UTF-8 with LF (U+000A) line ends, -organized as append-only series of records. Each record consists of 2 -lines of text. +Standalone and clustered database files share the common structure described +here. They are text files encoded in UTF-8 with LF (U+000A) line ends, +organized as append-only series of records. Each record consists of 2 lines of +text. -The first line in each record has the format ``OVSDB JSON`` *length* *hash*, -where *length* is a positive decimal integer and *hash* is a SHA-1 checksum -expressed as 40 hexadecimal digits. Words in the first line must be separated -by exactly one space. +The first line in each record has the format ``OVSDB <magic> <length> <hash>``, +where <magic> is ``JSON`` for standalone databases or ``CLUSTER`` for clustered +databases, <length> is a positive decimal integer, and <hash> is a SHA-1 +checksum expressed as 40 hexadecimal digits. Words in the first line must be +separated by exactly one space. The second line must be exactly *length* bytes long (including the LF) and its SHA-1 checksum (including the LF) must match *hash* exactly. The line's @@ -102,8 +106,7 @@ looking through a database log with ``ovsdb-tool show-log``: operations, OVSDB concatenates them into a single ``_comment`` member, separated by a new-line. - OVSDB only writes a ``_comment`` member if it would be - a nonempty string. + OVSDB only writes a ``_comment`` member if it would be a nonempty string. Each of these records also has one or more additional members, each of which maps from the name of a database table to a <table-txn>: @@ -123,3 +126,184 @@ maps from the name of a database table to a <table-txn>: default values for their types defined in RFC 7047 section 5.2.1; for modified rows, the OVSDB implementation omits columns whose values are unchanged. + +Clustered Format +---------------- + +The clustered format has the following additional notation: + +<uint64> + A JSON integer that represents a 64-bit unsigned integer. 
The OVS JSON + implementation only supports integers in the range -2**63 through 2**63-1, + so 64-bit unsigned integer values from 2**63 through 2**64-1 are expressed + as negative numbers. + +

<address> + A JSON string that represents a network address to support clustering, in + the ``<protocol>:<ip>:<port>`` syntax described in ``ovsdb-tool(1)``. + +<servers> + A JSON object whose names are <raw-uuid>s that identify servers and + whose values are
<address>es that specify those servers' addresses. + +<cluster-txn> + A JSON array with two elements: + + 1. The first element is either a <database-schema> or ``null``. It is + always present in the first record of a clustered database to indicate + the database's initial schema. If it is present in a later record, it + indicates a change of schema for the database. + + 2. The second element is either a transaction record in the format + described under "Transaction Records" above, or ``null``. + + When a schema is present, the transaction record is relative to an empty + database. That is, a schema change effectively resets the database to + empty and the transaction record represents the full database contents. + This allows readers to be ignorant of the full semantics of schema change. + +The first record in a clustered database contains the following members, +all of which are required: + +``"server_id": <raw-uuid>`` + The server's own UUID, which must be unique within the cluster. + +``"local_address": 
<address>`` + The address on which the server listens for connections from other + servers in the cluster. + +``"name": <id>`` + The database schema name. It is only important when a server is in the + process of joining a cluster: a server will only join a cluster if the + name matches. (If the database schema name were unique, then we would + not also need a cluster ID.) + +``"cluster_id": <raw-uuid>`` + The cluster's UUID. The all-zeros UUID is not a valid cluster ID. + +``"prev_term": <uint64>`` and ``"prev_index": <uint64>`` + The Raft term and index just before the beginning of the log. + +``"prev_servers": <servers>`` + The set of one or more servers in the cluster at index "prev_index" and + term "prev_term". It might not include this server, if it was not the + initial server in the cluster. + +``"prev_data": <json-value>`` and ``"prev_eid": <raw-uuid>`` + A snapshot of the data in the database at index "prev_index" and term + "prev_term", and the entry ID for that data. The snapshot must contain a + schema. + +The second and subsequent records, if present, in a clustered database +represent changes to the database, to the cluster state, or both. There are +several types of these records. The most important types of records directly +represent persistent state described in the Raft specification: + +Entry + A Raft log entry. + +Term + The start of a new term. + +Vote + The server's vote for a leader in the current term. + +The following additional types of records aid debugging and troubleshooting, +but they do not affect correctness. + +Note + A human-readable description of some event. + +Commit Index + An update to the server's ``commit_index``. + +Leader + Identifies a newly elected leader for the current term. + +The table below identifies the members that each type of record contains. +"yes" indicates that a member is required, "?" that it is optional, blank that +it is forbidden, and [1] that ``data`` and ``eid`` must be either both present +or both absent. 
+ +============ ==== ===== ==== ====== ============ ==== +member Term Entry Vote Leader Commit Index Note +============ ==== ===== ==== ====== ============ ==== +comment ? ? ? ? ? ? +term yes yes yes yes +index yes +servers ? +data [1] +eid [1] +vote yes +leader yes +commit_index yes +note yes +============ ==== ===== ==== ====== ============ ==== + +The members are: + +``"comment": `` + A human-readable string giving an administrator more information about + the reason a record was emitted. + +``"term": `` + The term in which the activity occurred. + +``"index": `` + The index of a log entry. + +``"servers": `` + Server configuration in a log entry. + +``"data": `` + The data in a log entry. + +``"eid": `` + Entry ID in a log entry. + +``"vote": `` + The server ID for which this server voted. + +``"leader": `` + The server ID of the server. Emitted by both leaders and followers when a + leader is elected. + +``"commit_index": `` + Updated ``commit_index`` value. + +``"note": `` + One of a few special strings indicating important events. The currently + defined strings are: + + ``"transfer leadership"`` + This server transferred leadership to a different server (with details + included in ``comment``). + + ``"left"`` + This server finished leaving the cluster. (This lets subsequent + readers know that the server is not part of the cluster and should not + attempt to connect to it.) + +Joining a Cluster +~~~~~~~~~~~~~~~~~ + +In addition to general format for a clustered database, there is also a special +case for a database file created by ``ovsdb-tool join-cluster``. Such a file +contains exactly one record, which conveys the information passed to the +``join-cluster`` command. It has the following members: + +``"server_id": `` and ``"local_address":
<address>`` and ``"name": <id>`` + These have the same semantics described above in the general description + of the format. + +``"cluster_id": <raw-uuid>`` + This is provided only if the user gave the ``--cid`` option to + ``join-cluster``. It has the same semantics described above. + + +``"remote_addresses": [
<address>*]`` + One or more remote servers to contact for joining the cluster. + +When the server successfully joins the cluster, the database file is replaced +by one in the general format described earlier. + diff --git a/Documentation/ref/ovsdb.7.rst b/Documentation/ref/ovsdb.7.rst index 6adef73826e8..75c2542cdffc 100644 --- a/Documentation/ref/ovsdb.7.rst +++ b/Documentation/ref/ovsdb.7.rst @@ -123,9 +123,13 @@ schema checksum from a schema or database file, respectively. Service Models ============== -OVSDB supports two service models for databases: **standalone**, and -**active-backup**. The service models provide different compromises -among consistency and availability. +OVSDB supports three service models for databases: **standalone**, +**active-backup**, and **clustered**. The service models provide different +compromises among consistency, availability, and partition tolerance. They +also differ in the number of servers required and in terms of performance. The +standalone and active-backup database service models share one on-disk format, +and clustered databases use a different format, but the OVSDB programs work +with both formats. ``ovsdb(5)`` documents these file formats. RFC 7047, which specifies the OVSDB protocol, does not mandate or specify any particular service model. @@ -147,6 +151,11 @@ To set up a standalone database, use ``ovsdb-tool create`` to create a database file, then run ``ovsdb-server`` to start the database service. +To configure a client, such as ``ovs-vswitchd`` or ``ovs-vsctl``, to use a +standalone database, configure the server to listen on a "connection method" +that the client can reach, then point the client to that connection method. +See `Connection Methods`_ below for information about connection methods. + Active-Backup Database Service Model ------------------------------------ @@ -189,10 +198,149 @@ for server pairs. 
Compared to a standalone server, the active-backup service model somewhat increases availability, at a risk of split-brain. It adds -generally insignificant performance overhead. +generally insignificant performance overhead. On the other hand, the +clustered service model, discussed below, requires at least 3 servers +and has greater performance overhead, but it avoids the need for +external management software and eliminates the possibility of +split-brain. Open vSwitch 2.6 introduced support for the active-backup service model. +Clustered Database Service Model +-------------------------------- + +A **clustered** database runs across 3 or 5 database servers (the **cluster**) +on different hosts. Servers in a cluster automatically synchronize writes +within the cluster. A 3-server cluster can remain available in the face of at +most 1 server failure; a 5-server cluster tolerates up to 2 failures. +Clusters larger than 5 servers will also work, with every 2 added servers +allowing the cluster to tolerate 1 more failure, but write performance +decreases. The number of servers should be odd: a 4- or 6-server cluster +cannot tolerate more failures than a 3- or 5-server cluster, respectively. + +To set up a clustered database, first initialize it on a single node by running +``ovsdb-tool create-cluster``, then start ``ovsdb-server``. Depending on its +arguments, the ``create-cluster`` command can create an empty database or copy +a standalone database's contents into the new database. + +To configure a client, such as ``ovn-controller`` or ``ovn-sbctl``, to use a +clustered database, first configure all of the servers to listen on a +connection method that the client can reach, then point the client to all of +the connection methods, comma-separated. See `Connection Methods`_, below, for +more detail. + +Open vSwitch 2.9 introduced support for the clustered service model. 
+ +How to Maintain a Clustered Database +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To add a server to a cluster, run ``ovsdb-tool join-cluster`` on the new server +and start ``ovsdb-server``. To remove a running server from a cluster, use +``ovs-appctl`` to invoke the ``cluster/leave`` command. When a server fails +and cannot be recovered, e.g. because its hard disk crashed, or to otherwise +remove a server that is down from a cluster, use ``ovs-appctl`` to invoke +``cluster/kick`` to make the remaining servers kick it out of the cluster. + +The above methods for adding and removing servers only work for healthy +clusters, that is, for clusters with no more failures than their maximum +tolerance. For example, in a 3-server cluster, the failure of 2 servers +prevents servers joining or leaving the cluster (as well as database access). +To prevent data loss or inconsistency, the preferred solution to this problem +is to bring up enough of the failed servers to make the cluster healthy again, +then if necessary remove any remaining failed servers and add new ones. If +this cannot be done, though, use ``ovs-appctl`` to invoke ``cluster/leave +--force`` on a running server. This command forces the server to which it is +directed to leave its cluster and form a new single-node cluster that contains +only itself. The data in the new cluster may be inconsistent with the former +cluster: transactions not yet replicated to the server will be lost, and +transactions not yet applied to the cluster may be committed. Afterward, any +servers in its former cluster will regard the server to have failed. + +The servers in a cluster synchronize data over a cluster management protocol +that is specific to Open vSwitch; it is not the same as the OVSDB protocol +specified in RFC 7047. For this purpose, a server in a cluster is tied to a +particular IP address and TCP port, which is specified in the ``ovsdb-tool`` +command that creates or joins the cluster. 
The TCP port used for clustering +must be different from that used for OVSDB clients. To change the port or +address of a server in a cluster, first remove it from the cluster, then add it +back with the new address. + +To upgrade the ``ovsdb-server`` processes in a cluster from one version of Open +vSwitch to another, upgrading them one at a time will keep the cluster healthy +during the upgrade process. (This is different from upgrading a database +schema, which is covered later under `Upgrading or Downgrading a Database`_.) + +Clustered OVSDB does not support the OVSDB "ephemeral columns" feature. +``ovsdb-tool`` and ``ovsdb-client`` change ephemeral columns into persistent +ones when they work with schemas for clustered databases. Future versions of +OVSDB might add support for this feature. + +Understanding Cluster Consistency +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To ensure consistency, clustered OVSDB uses the Raft algorithm described in +Diego Ongaro's Ph.D. thesis, "Consensus: Bridging Theory and Practice". In an +operational Raft cluster, at any given time a single server is the "leader" and +the other nodes are "followers". Only the leader processes transactions, but a +transaction is only committed when a majority of the servers confirm to the +leader that they have written it to persistent storage. + +In most database systems, read and write access to the database happens through +transactions. In such a system, Raft allows a cluster to present a strongly +consistent transactional interface. OVSDB uses conventional transactions for +writes, but clients often effectively do reads a different way, by asking the +server to "monitor" a database or a subset of one on the client's behalf. +Whenever monitored data changes, the server automatically tells the client what +changed, which allows the client to maintain an accurate snapshot of the +database in its memory. 
Of course, at any given time, the snapshot may be +somewhat dated since some of it could have changed without the change +notification yet being received and processed by the client. + +Given this unconventional usage model, OVSDB also adopts an unconventional +clustering model. Each server in a cluster acts independently for the purpose +of monitors and read-only transactions, without verifying that data is +up-to-date with the leader. Servers forward transactions that write to the +database to the leader for execution, ensuring consistency. This has the +following consequences: + +* Transactions that involve writes, against any server in the cluster, are + linearizable if clients take care to use correct prerequisites, which is the + same condition required for linearizability in a standalone OVSDB. + (Actually, "at-least-once" consistency, because OVSDB does not have a session + mechanism to drop duplicate transactions if a connection drops after the + server commits it but before the client receives the result.) + +* Read-only transactions can yield results based on a stale version of the + database, if they are executed against a follower. Transactions on the + leader always yield fresh results. (With monitors, as explained above, a + client can always see stale data even without clustering, so clustering does + not change the consistency model for monitors.) + +* Monitor-based (or read-heavy) workloads scale well across a cluster, because + clustering OVSDB adds no additional work or communication for reads and + monitors. + +* A write-heavy client should connect to the leader, to avoid the overhead of + followers forwarding transactions to the leader. + +* When a client conducts a mix of read and write transactions across more than + one server in a cluster, it can see inconsistent results because a read + transaction might read stale data whose updates have not yet propagated from + the leader. 
By default, ``ovn-sbctl`` and similar utilities connect to the + cluster leader to avoid this issue. + + The same might occur for transactions against a single follower except that + the OVSDB server ensures that the results of a write forwarded to the leader + by a given server are visible at that server before it replies to the + requesting client. + +* If a client uses a database on one server in a cluster, then another server + in the cluster (perhaps because the first server failed), the client could + observe stale data. Clustered OVSDB clients, however, can use a column in the + ``_Server`` database to detect that data on a server is older than data that + the client previously read. The OVSDB client library in Open vSwitch uses + this feature to avoid servers with stale data. + Database Replication ==================== @@ -245,6 +393,18 @@ unix:<file> On Windows, connect to a local named pipe that is represented by a file created in the path <file> to mimic the behavior of a Unix domain socket. +<method1>,<method2>,...,<methodN> + For a clustered database service to be highly available, a client must be + able to connect to any of the servers in the cluster. To do so, specify + connection methods for each of the servers separated by commas (and + optional spaces). + + In theory, if machines go up and down and IP addresses change in the right + way, a client could talk to the wrong instance of a database. To avoid + this possibility, add ``cid:<uuid>`` to the list of methods, where <uuid> + is the cluster ID of the desired database cluster, as printed by + ``ovsdb-tool get-cid``. This feature is optional. + OVSDB supports the following passive connection methods: pssl:<port>[:<ip>] @@ -314,27 +474,42 @@ A more common backup strategy is to periodically take and store a snapshot. For the standalone and active-backup service models, making a copy of the database file, e.g. 
using ``cp``, effectively makes a snapshot, and because OVSDB database files are append-only, it works even if the database is being -modified when the snapshot takes place. +modified when the snapshot takes place. This approach does not work for +clustered databases. -Another way to make a backup is to use ``ovsdb-client backup``, which -connects to a running database server and outputs an atomic snapshot of its -schema and content, in the same format used for on-disk databases. +Another way to make a backup, which works with all OVSDB service models, is to +use ``ovsdb-client backup``, which connects to a running database server and +outputs an atomic snapshot of its schema and content, in the same format used +for standalone and active-backup databases. Multiple options are also available when the time comes to restore a database -from a backup. One option is to stop the database server or servers, overwrite -the database file with the backup (e.g. with ``cp``), and then restart the -servers. Another way is to use ``ovsdb-client restore``, which connects to a -running database server and replaces the data in one of its databases by a -provided snapshot. The advantage of ``ovsdb-client restore`` is that it causes -zero downtime for the database and its server. It has the downside that UUIDs -of rows in the restored database will differ from those in the snapshot, -because the OVSDB protocol does not allow clients to specify row UUIDs. +from a backup. For the standalone and active-backup service models, one option +is to stop the database server or servers, overwrite the database file with the +backup (e.g. with ``cp``), and then restart the servers. Another way, which +works with any service model, is to use ``ovsdb-client restore``, which +connects to a running database server and replaces the data in one of its +databases by a provided snapshot. The advantage of ``ovsdb-client restore`` is +that it causes zero downtime for the database and its server. 
It has the +downside that UUIDs of rows in the restored database will differ from those in +the snapshot, because the OVSDB protocol does not allow clients to specify row +UUIDs. None of these approaches saves and restores data in columns that the schema designates as ephemeral. This is by design: the designer of a schema only marks a column as ephemeral if it is acceptable for its data to be lost when a database server restarts. +Clustering and backup serve different purposes. Clustering increases +availability, but it does not protect against data loss if, for example, a +malicious or malfunctioning OVSDB client deletes or tampers with data. + +Changing Database Service Model +------------------------------- + +Use ``ovsdb-tool create-cluster`` to create a clustered database from the +contents of a standalone database. Use ``ovsdb-client backup`` to create a +standalone database from the contents of a running clustered database. + Upgrading or Downgrading a Database ----------------------------------- @@ -367,8 +542,8 @@ active-backup database, first stop the database server or servers, then use ``ovsdb-tool convert`` to convert it to the new schema, and then restart the database server. -OVSDB also supports online database schema conversion. -To convert a database online, use ``ovsdb-client convert``. +OVSDB also supports online database schema conversion, for any of its database +service models. To convert a database online, use ``ovsdb-client convert``. The conversion is atomic, consistent, isolated, and durable. ``ovsdb-server`` disconnects any clients connected when the conversion takes place (except clients that use the ``set_db_change_aware`` Open vSwitch extension RPC). Upon @@ -405,9 +580,9 @@ First, ``ovsdb-tool compact`` can compact a standalone or active-backup database that is not currently being served by ``ovsdb-server`` (or otherwise locked for writing by another process). 
To compact any database that is currently being served by ``ovsdb-server``, use ``ovs-appctl`` to send the -``ovsdb-server/compact`` command. Each server in an active-backup database -maintains its database file independently, so to compact all of them, issue -this command separately on each server. +``ovsdb-server/compact`` command. Each server in an active-backup or clustered +database maintains its database file independently, so to compact all of them, +issue this command separately on each server. Viewing History --------------- @@ -421,8 +596,10 @@ client. The comments can be helpful for quickly understanding a transaction; for example, ``ovs-vsctl`` adds its command line to the transactions that it makes. -For active-backup databases, the sequence of transactions in each server's log -will differ, even at points when they reflect the same data. +The ``show-log`` command works with both OVSDB file formats, but the details of +the output format differ. For active-backup and clustered databases, the +sequence of transactions in each server's log will differ, even at points when +they reflect the same data. Truncating History ------------------ @@ -449,9 +626,9 @@ cryptography, it is acceptable for this purpose because it is not used to defend against malicious attackers. The first record in a standalone or active-backup database file specifies the -schema. ``ovsdb-server`` will refuse to work with a database whose first -record is corrupted. Delete and recreate such a database, or restore it from a -backup. +schema. ``ovsdb-server`` will refuse to work with a database where this record +is corrupted, or with a clustered database file with corruption in the first +few records. Delete and recreate such a database, or restore it from a backup. When ``ovsdb-server`` adds records to a database file in which it detected corruption, it first truncates the file just after the last good record. 
diff --git a/NEWS b/NEWS index 646879c61677..26d2bdb26763 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,7 @@ Post-v2.8.0 -------------------- - NSH implementation now conforms to latest draft (draft-ietf-sfc-nsh-28). - - OVSDB: + - OVSDB has new, experimental support for database clustering: * New high-level documentation in ovsdb(7). * New file format documentation for developers in ovsdb(5). * Protocol documentation moved from ovsdb-server(1) to ovsdb-server(7). @@ -9,10 +9,13 @@ Post-v2.8.0 "ovsdb-client convert". * ovsdb-server now always hosts a built-in database named _Server. See ovsdb-server(5) for more details. - * ovsdb-client: New "get-schema-cksum" and "query" commands. - * ovsdb-client: New "backup" and "restore" commands. - * ovsdb-client: New --timeout option. - * ovsdb-tool: New "db-name" and "schema-name" commands. + * ovsdb-client: New "get-schema-cksum", "query", "backup", "restore", + and "wait" commands. New --timeout option. + * ovsdb-tool: New "create-cluster", "join-cluster", "db-cid", "db-sid", + "db-local-address", "db-name", "schema-name", and "check-cluster" + commands. + * ovsdb-server: New ovs-appctl commands for managing clusters. + * ovs-sandbox: New support for clustered databases. - ovs-vsctl and other commands that display data in tables now support a --max-column-width option to limit column width. - OVN: @@ -24,7 +27,9 @@ Post-v2.8.0 - Added support to generate Neighbor Solicitation packets using the OVN action 'nd_ns' to resolve unknown next hop MAC addresses for the IPv6 packets. - * ovn-ctl: New commands run_nb_ovsdb and run_sb_ovsdb. + * ovn-ctl: New commands run_nb_ovsdb and run_sb_ovsdb. New support for + clustered databases. + * ovn-sbctl, ovn-nbctl: New options --leader-only, --no-leader-only. 
- Linux kernel 4.13 * Add support for compiling OVS with the latest Linux 4.13 kernel - "flush-conntrack" in ovs-dpctl and ovs-appctl now accept a 5-tuple to diff --git a/lib/.gitignore b/lib/.gitignore index 0680af657b37..7d7f4271b4f2 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -9,6 +9,9 @@ /ofp-actions.inc2 /ofp-errors.inc /ofp-msgs.inc +/ovsdb-server-idl.c +/ovsdb-server-idl.h +/ovsdb-server-idl.ovsidl /ovs-fields.7 /stdio.h /string.h diff --git a/lib/automake.mk b/lib/automake.mk index effe5b5c2940..f8b8c08c33f1 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -197,6 +197,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/ovsdb-condition.c \ lib/ovsdb-parser.c \ lib/ovsdb-parser.h \ + lib/ovsdb-session.c \ + lib/ovsdb-session.h \ lib/ovsdb-types.c \ lib/ovsdb-types.h \ lib/packets.c \ @@ -325,6 +327,8 @@ EXTRA_DIST += \ nodist_lib_libopenvswitch_la_SOURCES = \ lib/dirs.c \ + lib/ovsdb-server-idl.c \ + lib/ovsdb-server-idl.h \ lib/vswitch-idl.c \ lib/vswitch-idl.h CLEANFILES += $(nodist_lib_libopenvswitch_la_SOURCES) @@ -541,6 +545,12 @@ lib/ofp-msgs.lo: lib/ofp-msgs.inc CLEANFILES += lib/ofp-msgs.inc EXTRA_DIST += build-aux/extract-ofp-msgs +# _server IDL +OVSIDL_BUILT += lib/ovsdb-server-idl.c lib/ovsdb-server-idl.h lib/ovsdb-server-idl.ovsidl +EXTRA_DIST += lib/ovsdb-server-idl.ann +lib/ovsdb-server-idl.ovsidl: ovsdb/_server.ovsschema lib/ovsdb-server-idl.ann + $(AM_V_GEN)$(OVSDB_IDLC) annotate $(srcdir)/ovsdb/_server.ovsschema $(srcdir)/lib/ovsdb-server-idl.ann > $@.tmp && mv $@.tmp $@ + INSTALL_DATA_LOCAL += lib-install-data-local lib-install-data-local: $(MKDIR_P) $(DESTDIR)$(PKIDIR) diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c index f8786f909ac8..0b8c1468b8d1 100644 --- a/lib/jsonrpc.c +++ b/lib/jsonrpc.c @@ -563,6 +563,16 @@ jsonrpc_create_error(struct json *error, const struct json *id) json_clone(id)); } +struct jsonrpc_msg * +jsonrpc_msg_clone(const struct jsonrpc_msg *old) +{ + return jsonrpc_create(old->type, old->method, + 
json_nullable_clone(old->params), + json_nullable_clone(old->result), + json_nullable_clone(old->error), + json_nullable_clone(old->id)); +} + const char * jsonrpc_msg_type_to_string(enum jsonrpc_msg_type type) { @@ -754,6 +764,16 @@ jsonrpc_msg_to_json(struct jsonrpc_msg *m) return json; } + +char * +jsonrpc_msg_to_string(const struct jsonrpc_msg *m) +{ + struct jsonrpc_msg *copy = jsonrpc_msg_clone(m); + struct json *json = jsonrpc_msg_to_json(copy); + char *s = json_to_string(json, JSSF_SORT); + json_destroy(json); + return s; +} /* A JSON-RPC session with reconnection. */ @@ -878,6 +898,15 @@ jsonrpc_session_close(struct jsonrpc_session *s) } } +struct jsonrpc * +jsonrpc_session_steal(struct jsonrpc_session *s) +{ + struct jsonrpc *rpc = s->rpc; + s->rpc = NULL; + jsonrpc_session_close(s); + return rpc; +} + static void jsonrpc_session_disconnect(struct jsonrpc_session *s) { diff --git a/lib/jsonrpc.h b/lib/jsonrpc.h index 969a6ed38cd6..a44114e8dcd9 100644 --- a/lib/jsonrpc.h +++ b/lib/jsonrpc.h @@ -90,12 +90,16 @@ struct jsonrpc_msg *jsonrpc_create_reply(struct json *result, struct jsonrpc_msg *jsonrpc_create_error(struct json *error, const struct json *id); +struct jsonrpc_msg *jsonrpc_msg_clone(const struct jsonrpc_msg *); + const char *jsonrpc_msg_type_to_string(enum jsonrpc_msg_type); char *jsonrpc_msg_is_valid(const struct jsonrpc_msg *); void jsonrpc_msg_destroy(struct jsonrpc_msg *); char *jsonrpc_msg_from_json(struct json *, struct jsonrpc_msg **); struct json *jsonrpc_msg_to_json(struct jsonrpc_msg *); + +char *jsonrpc_msg_to_string(const struct jsonrpc_msg *); /* A JSON-RPC session with reconnection. 
*/ @@ -106,6 +110,8 @@ struct jsonrpc_session *jsonrpc_session_open_unreliably(struct jsonrpc *, uint8_t); void jsonrpc_session_close(struct jsonrpc_session *); +struct jsonrpc *jsonrpc_session_steal(struct jsonrpc_session *); + void jsonrpc_session_run(struct jsonrpc_session *); void jsonrpc_session_wait(struct jsonrpc_session *); diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c index 24ba5b50fddc..c88d93b14a61 100644 --- a/lib/ovsdb-idl.c +++ b/lib/ovsdb-idl.c @@ -36,10 +36,13 @@ #include "ovsdb-error.h" #include "ovsdb-idl-provider.h" #include "ovsdb-parser.h" +#include "ovsdb-server-idl.h" +#include "ovsdb-session.h" #include "openvswitch/poll-loop.h" #include "openvswitch/shash.h" #include "skiplist.h" #include "sset.h" +#include "svec.h" #include "util.h" #include "uuid.h" #include "openvswitch/vlog.h" @@ -81,40 +84,93 @@ struct ovsdb_idl_arc { /* Connection state machine. * - * When a JSON-RPC session connects, the IDL sends a "get_schema" request and - * transitions to IDL_S_SCHEMA_REQUESTED. If the session drops and reconnects, - * the IDL starts over again in the same way. */ + * When a JSON-RPC session connects, the IDL sends a "get_schema" request for + * the _Server database and transitions to the + * IDL_S_SERVER_SCHEMA_REQUESTED state. If the session drops and + * reconnects, or if the IDL receives a "monitor_canceled" notification for a + * table it is monitoring, the IDL starts over again in the same way. */ +#define OVSDB_IDL_STATES \ + /* Waits for "get_schema" reply, then sends "monitor_cond" \ + * request for the Database table in the _Server database, whose \ + * details are informed by the schema, and transitions to \ + * IDL_S_SERVER_MONITOR_COND_REQUESTED. 
*/ \ + OVSDB_IDL_STATE(SERVER_SCHEMA_REQUESTED) \ + \ + /* Waits for "monitor_cond" reply for the Database table: \ + * \ + * - If the reply indicates success, and the Database table has a \ + * row for the IDL database: \ + * \ + * * If the row indicates that this is a clustered database \ + * that is not connected to the cluster, closes the \ + * connection. The next connection attempt has a chance at \ + * picking a connected server. \ + * \ + * * Otherwise, sends a "monitor_cond" request for the IDL \ + * database whose details are informed by the schema \ + * (obtained from the row), and transitions to \ + * IDL_S_DATA_MONITOR_COND_REQUESTED. \ + * \ + * - If the reply indicates success, but the Database table does \ + * not have a row for the IDL database, transitions to \ + * IDL_S_ERROR. \ + * \ + * - If the reply indicates failure, sends a "get_schema" request \ + * for the IDL database and transitions to \ + * IDL_S_DATA_SCHEMA_REQUESTED. */ \ + OVSDB_IDL_STATE(SERVER_MONITOR_COND_REQUESTED) \ + \ + /* Waits for "get_schema" reply, then sends "monitor_cond" \ + * request whose details are informed by the schema, and \ + * transitions to IDL_S_DATA_MONITOR_COND_REQUESTED. */ \ + OVSDB_IDL_STATE(DATA_SCHEMA_REQUESTED) \ + \ + /* Waits for "monitor_cond" reply. If successful, replaces the \ + * IDL contents by the data carried in the reply and transitions \ + * to IDL_S_MONITORING. On failure, sends a "monitor" request \ + * and transitions to IDL_S_DATA_MONITOR_REQUESTED. */ \ + OVSDB_IDL_STATE(DATA_MONITOR_COND_REQUESTED) \ + \ + /* Waits for "monitor" reply. If successful, replaces the IDL \ + * contents by the data carried in the reply and transitions to \ + * IDL_S_MONITORING. On failure, transitions to IDL_S_ERROR. */ \ + OVSDB_IDL_STATE(DATA_MONITOR_REQUESTED) \ + \ + /* State that processes "update" or "update2" notifications for \ + * the main database (and the Database table in _Server if \ + * available). 
\ + * \ + * If we're monitoring the Database table and we get notified \ + * that the IDL database has been deleted, we close the \ + * connection (which will restart the state machine). */ \ + OVSDB_IDL_STATE(MONITORING) \ + \ + /* Terminal error state that indicates that nothing useful can be \ + * done, for example because the database server doesn't actually \ + * have the desired database. We maintain the session with the \ + * database server anyway. If it starts serving the database \ + * that we want, or if someone fixes and restarts the database, \ + * then it will kill the session and we will automatically \ + * reconnect and try again. */ \ + OVSDB_IDL_STATE(ERROR) \ + \ + /* Terminal state that indicates we connected to a useless server \ + * in a cluster, e.g. one that is partitioned from the rest of \ + * the cluster. We're waiting to retry. */ \ + OVSDB_IDL_STATE(RETRY) + enum ovsdb_idl_state { - /* Waits for "get_schema" reply, then sends a "monitor_cond" request whose - * details are informed by the schema and transitions to - * IDL_S_MONITOR_COND_REQUESTED. */ - IDL_S_SCHEMA_REQUESTED, +#define OVSDB_IDL_STATE(NAME) IDL_S_##NAME, + OVSDB_IDL_STATES +#undef OVSDB_IDL_STATE +}; - /* Waits for "monitor_cond" reply: - * - * - If the reply indicates success, replaces the IDL contents by the - * data carried in the reply and transitions to IDL_S_MONITORING_COND. - * - * - If the reply indicates failure because the database is too old to - * support monitor_cond, sends a "monitor" request and transitions to - * IDl_S_MONITOR_REQUESTED. */ - IDL_S_MONITOR_COND_REQUESTED, - - /* Waits for "monitor" reply, then replaces the IDL contents by the data - * carried in the reply and transitions to IDL_S_MONITORING. */ - IDL_S_MONITOR_REQUESTED, - - /* Terminal states that process "update2" (IDL_S_MONITORING_COND) or - * "update" (IDL_S_MONITORING) notifications. 
*/ - IDL_S_MONITORING_COND, - IDL_S_MONITORING, - - /* Terminal error state that indicates that nothing useful can be done. - * The most likely reason is that the database server doesn't have the - * desired database. We maintain the session with the database server - * anyway. If it starts serving the database that we want, then it will - * kill the session and we will automatically reconnect and try again. */ - IDL_S_NO_SCHEMA +static const char *ovsdb_idl_state_to_string(enum ovsdb_idl_state); + +enum ovsdb_idl_monitoring { + OVSDB_IDL_NOT_MONITORING, /* Database is not being monitored. */ + OVSDB_IDL_MONITORING, /* Database has "monitor" outstanding. */ + OVSDB_IDL_MONITORING_COND, /* Database has "monitor_cond" outstanding. */ }; struct ovsdb_idl_db { @@ -130,6 +186,7 @@ struct ovsdb_idl_db { struct hmap outstanding_txns; bool verify_write_only; struct json *schema; + enum ovsdb_idl_monitoring monitoring; /* True if any of the tables' monitoring conditions has changed. */ bool cond_changed; @@ -159,6 +216,8 @@ static unsigned int ovsdb_idl_db_set_condition( static void ovsdb_idl_send_schema_request(struct ovsdb_idl *, struct ovsdb_idl_db *); +static void ovsdb_idl_send_db_change_aware(struct ovsdb_idl *); +static bool ovsdb_idl_check_server_db(struct ovsdb_idl *); static void ovsdb_idl_send_monitor_request(struct ovsdb_idl *, struct ovsdb_idl_db *, bool use_monitor_cond); @@ -178,9 +237,20 @@ struct ovsdb_idl { unsigned int state_seqno; /* See above. */ struct json *request_id; /* JSON ID for request awaiting reply. 
*/ - bool use_monitor_cond; + struct uuid cid; + + uint64_t min_index; + bool leader_only; }; +static void ovsdb_idl_transition_at(struct ovsdb_idl *, enum ovsdb_idl_state, + const char *where); +#define ovsdb_idl_transition(IDL, STATE) \ + ovsdb_idl_transition_at(IDL, STATE, OVS_SOURCE_LOCATOR) + +static void ovsdb_idl_retry_at(struct ovsdb_idl *, const char *where); +#define ovsdb_idl_retry(IDL) ovsdb_idl_retry_at(IDL, OVS_SOURCE_LOCATOR) + struct ovsdb_idl_txn { struct hmap_node hmap_node; struct json *request_id; @@ -220,6 +290,9 @@ static void ovsdb_idl_db_parse_monitor_reply(struct ovsdb_idl_db *, bool is_monitor_cond); static bool ovsdb_idl_db_parse_update_rpc(struct ovsdb_idl_db *, const struct jsonrpc_msg *); +static bool ovsdb_idl_handle_monitor_canceled(struct ovsdb_idl *, + struct ovsdb_idl_db *, + const struct jsonrpc_msg *); static void ovsdb_idl_db_parse_update(struct ovsdb_idl_db *, const struct json *table_updates, bool is_monitor_cond); @@ -296,6 +369,18 @@ static void ovsdb_idl_add_to_indexes(const struct ovsdb_idl_row *); static void ovsdb_idl_remove_from_indexes(const struct ovsdb_idl_row *); static void +ovsdb_idl_open_session(struct ovsdb_idl *idl, const char *remote, bool retry) +{ + ovs_assert(!idl->db.txn); + jsonrpc_session_close(idl->session); + + struct svec remotes = SVEC_EMPTY_INITIALIZER; + ovsdb_session_parse_remote(remote, &remotes, &idl->cid); + idl->session = jsonrpc_session_open_multiple(&remotes, retry); + svec_destroy(&remotes); +} + +static void ovsdb_idl_db_init(struct ovsdb_idl_db *db, const struct ovsdb_idl_class *class, struct ovsdb_idl *parent, bool monitor_everything_by_default) { @@ -365,10 +450,26 @@ ovsdb_idl_create(const char *remote, const struct ovsdb_idl_class *class, struct ovsdb_idl *idl; idl = xzalloc(sizeof *idl); + ovsdb_idl_db_init(&idl->server, &serverrec_idl_class, idl, true); ovsdb_idl_db_init(&idl->db, class, idl, monitor_everything_by_default); - idl->session = jsonrpc_session_open(remote, retry); + 
ovsdb_idl_open_session(idl, remote, retry); idl->state_seqno = UINT_MAX; idl->request_id = NULL; + idl->leader_only = true; + + /* Monitor the Database table in the _Server database. + * + * We monitor only the row for 'class', or the row that has the + * desired 'cid'. */ + struct ovsdb_idl_condition cond; + ovsdb_idl_condition_init(&cond); + if (!uuid_is_zero(&idl->cid)) { + serverrec_database_add_clause_cid(&cond, OVSDB_F_EQ, &idl->cid, 1); + } else { + serverrec_database_add_clause_name(&cond, OVSDB_F_EQ, class->database); + } + ovsdb_idl_db_set_condition(&idl->server, &serverrec_table_database, &cond); + ovsdb_idl_condition_destroy(&cond); return idl; } @@ -379,7 +480,7 @@ ovsdb_idl_set_remote(struct ovsdb_idl *idl, const char *remote, bool retry) { if (idl) { - idl->session = jsonrpc_session_open(remote, retry); + ovsdb_idl_open_session(idl, remote, retry); /* XXX update condition */ idl->state_seqno = UINT_MAX; } @@ -419,6 +520,15 @@ ovsdb_idl_destroy(struct ovsdb_idl *idl) } } +void +ovsdb_idl_set_leader_only(struct ovsdb_idl *idl, bool leader_only) +{ + idl->leader_only = leader_only; + if (leader_only && idl->server.monitoring) { + ovsdb_idl_check_server_db(idl); + } +} + static void ovsdb_idl_db_clear(struct ovsdb_idl_db *db) { @@ -465,6 +575,40 @@ ovsdb_idl_db_clear(struct ovsdb_idl_db *db) } } +static const char * +ovsdb_idl_state_to_string(enum ovsdb_idl_state state) +{ + switch (state) { +#define OVSDB_IDL_STATE(NAME) case IDL_S_##NAME: return #NAME; + OVSDB_IDL_STATES +#undef OVSDB_IDL_STATE + default: return ""; + } +} + +static void +ovsdb_idl_retry_at(struct ovsdb_idl *idl, const char *where) +{ + if (jsonrpc_session_get_n_remotes(idl->session) > 1) { + ovsdb_idl_force_reconnect(idl); + ovsdb_idl_transition_at(idl, IDL_S_RETRY, where); + } else { + ovsdb_idl_transition_at(idl, IDL_S_ERROR, where); + } +} + +static void +ovsdb_idl_transition_at(struct ovsdb_idl *idl, enum ovsdb_idl_state new_state, + const char *where) +{ + VLOG_DBG("%s: %s -> %s 
at %s", + jsonrpc_session_get_name(idl->session), + ovsdb_idl_state_to_string(idl->state), + ovsdb_idl_state_to_string(new_state), + where); + idl->state = new_state; +} + static void ovsdb_idl_clear(struct ovsdb_idl *idl) { @@ -479,6 +623,165 @@ ovsdb_idl_send_request(struct ovsdb_idl *idl, struct jsonrpc_msg *request) jsonrpc_session_send(idl->session, request); } +static void +ovsdb_idl_restart_fsm(struct ovsdb_idl *idl) +{ + ovsdb_idl_send_schema_request(idl, &idl->server); + ovsdb_idl_transition(idl, IDL_S_SERVER_SCHEMA_REQUESTED); + idl->db.monitoring = OVSDB_IDL_NOT_MONITORING; + idl->server.monitoring = OVSDB_IDL_NOT_MONITORING; +} + +static void +ovsdb_idl_process_response(struct ovsdb_idl *idl, struct jsonrpc_msg *msg) +{ + bool ok = msg->type == JSONRPC_REPLY; + if (!ok + && idl->state != IDL_S_SERVER_SCHEMA_REQUESTED + && idl->state != IDL_S_SERVER_MONITOR_COND_REQUESTED + && idl->state != IDL_S_DATA_MONITOR_COND_REQUESTED) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + char *s = jsonrpc_msg_to_string(msg); + VLOG_INFO_RL(&rl, "%s: received unexpected %s response in " + "%s state: %s", jsonrpc_session_get_name(idl->session), + jsonrpc_msg_type_to_string(msg->type), + ovsdb_idl_state_to_string(idl->state), + s); + free(s); + ovsdb_idl_retry(idl); + return; + } + + switch (idl->state) { + case IDL_S_SERVER_SCHEMA_REQUESTED: + if (ok) { + json_destroy(idl->server.schema); + idl->server.schema = json_clone(msg->result); + ovsdb_idl_send_monitor_request(idl, &idl->server, true); + ovsdb_idl_transition(idl, IDL_S_SERVER_MONITOR_COND_REQUESTED); + } else { + ovsdb_idl_send_schema_request(idl, &idl->db); + ovsdb_idl_transition(idl, IDL_S_DATA_SCHEMA_REQUESTED); + } + break; + + case IDL_S_SERVER_MONITOR_COND_REQUESTED: + if (ok) { + idl->server.monitoring = OVSDB_IDL_MONITORING_COND; + ovsdb_idl_db_parse_monitor_reply(&idl->server, msg->result, true); + if (ovsdb_idl_check_server_db(idl)) { + ovsdb_idl_send_db_change_aware(idl); + } + } 
else { + ovsdb_idl_send_schema_request(idl, &idl->db); + ovsdb_idl_transition(idl, IDL_S_DATA_SCHEMA_REQUESTED); + } + break; + + case IDL_S_DATA_SCHEMA_REQUESTED: + json_destroy(idl->db.schema); + idl->db.schema = json_clone(msg->result); + ovsdb_idl_send_monitor_request(idl, &idl->db, true); + ovsdb_idl_transition(idl, IDL_S_DATA_MONITOR_COND_REQUESTED); + break; + + case IDL_S_DATA_MONITOR_COND_REQUESTED: + if (!ok) { + /* "monitor_cond" not supported. Try "monitor". */ + ovsdb_idl_send_monitor_request(idl, &idl->db, false); + ovsdb_idl_transition(idl, IDL_S_DATA_MONITOR_REQUESTED); + } else { + idl->db.monitoring = OVSDB_IDL_MONITORING_COND; + ovsdb_idl_transition(idl, IDL_S_MONITORING); + ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, true); + } + break; + + case IDL_S_DATA_MONITOR_REQUESTED: + idl->db.monitoring = OVSDB_IDL_MONITORING; + ovsdb_idl_transition(idl, IDL_S_MONITORING); + ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, false); + idl->db.change_seqno++; + ovsdb_idl_clear(idl); + ovsdb_idl_db_parse_update(&idl->db, msg->result, false); + break; + + case IDL_S_MONITORING: + /* We don't normally have a request outstanding in this state. If we + * do, it's a "monitor_cond_change", which means that the conditional + * monitor clauses were updated. + * + * If further condition changes were pending, send them now. */ + ovsdb_idl_send_cond_change(idl); + idl->db.cond_seqno++; + break; + + case IDL_S_ERROR: + case IDL_S_RETRY: + /* Nothing to do in this state. */ + break; + + default: + OVS_NOT_REACHED(); + } +} + +static void +ovsdb_idl_process_msg(struct ovsdb_idl *idl, struct jsonrpc_msg *msg) +{ + bool is_response = (msg->type == JSONRPC_REPLY || + msg->type == JSONRPC_ERROR); + + /* Process a reply to an outstanding request. 
*/ + if (is_response + && idl->request_id && json_equal(idl->request_id, msg->id)) { + json_destroy(idl->request_id); + idl->request_id = NULL; + ovsdb_idl_process_response(idl, msg); + return; + } + + /* Process database contents updates. */ + if (ovsdb_idl_db_parse_update_rpc(&idl->db, msg)) { + return; + } + if (idl->server.monitoring + && ovsdb_idl_db_parse_update_rpc(&idl->server, msg)) { + ovsdb_idl_check_server_db(idl); + return; + } + + if (ovsdb_idl_handle_monitor_canceled(idl, &idl->db, msg) + || (idl->server.monitoring + && ovsdb_idl_handle_monitor_canceled(idl, &idl->server, msg))) { + return; + } + + /* Process "lock" replies and related notifications. */ + if (ovsdb_idl_db_process_lock_replies(&idl->db, msg)) { + return; + } + + /* Process response to a database transaction we submitted. */ + if (is_response && ovsdb_idl_db_txn_process_reply(&idl->db, msg)) { + return; + } + + /* Unknown message. Log at a low level because this can happen if + * ovsdb_idl_txn_destroy() is called to destroy a transaction + * before we receive the reply. + * + * (We could sort those out from other kinds of unknown messages by + * using distinctive IDs for transactions, if it seems valuable to + * do so, and then it would be possible to use different log + * levels. XXX?) */ + char *s = jsonrpc_msg_to_string(msg); + VLOG_DBG("%s: received unexpected %s message: %s", + jsonrpc_session_get_name(idl->session), + jsonrpc_msg_type_to_string(msg->type), s); + free(s); +} + /* Processes a batch of messages from the database server on 'idl'. This may * cause the IDL's contents to change. The client may check for that with * ovsdb_idl_get_seqno(). 
*/ @@ -499,12 +802,9 @@ ovsdb_idl_run(struct ovsdb_idl *idl) seqno = jsonrpc_session_get_seqno(idl->session); if (idl->state_seqno != seqno) { idl->state_seqno = seqno; - json_destroy(idl->request_id); - idl->request_id = NULL; ovsdb_idl_txn_abort_all(idl); + ovsdb_idl_restart_fsm(idl); - ovsdb_idl_send_schema_request(idl, &idl->db); - idl->state = IDL_S_SCHEMA_REQUESTED; if (idl->db.lock_name) { jsonrpc_session_send( idl->session, ovsdb_idl_db_compose_lock_request(&idl->db)); @@ -515,98 +815,7 @@ ovsdb_idl_run(struct ovsdb_idl *idl) if (!msg) { break; } - - if (ovsdb_idl_db_parse_update_rpc(&idl->db, msg)) { - /* ovsdb_idl_db_parse_update_rpc() did all the processing. */ - } else if (msg->type == JSONRPC_REPLY - && idl->request_id - && json_equal(idl->request_id, msg->id)) { - json_destroy(idl->request_id); - idl->request_id = NULL; - - switch (idl->state) { - case IDL_S_SCHEMA_REQUESTED: - /* Reply to our "get_schema" request. */ - idl->db.schema = json_clone(msg->result); - ovsdb_idl_send_monitor_request(idl, &idl->db, true); - idl->state = IDL_S_MONITOR_COND_REQUESTED; - break; - - case IDL_S_MONITOR_REQUESTED: - case IDL_S_MONITOR_COND_REQUESTED: - /* Reply to our "monitor" or "monitor_cond" request. */ - if (idl->state == IDL_S_MONITOR_REQUESTED) { - idl->state = IDL_S_MONITORING; - ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, - false); - } else { /* IDL_S_MONITOR_COND_REQUESTED. */ - idl->state = IDL_S_MONITORING_COND; - ovsdb_idl_db_parse_monitor_reply(&idl->db, msg->result, - true); - } - - /* Schema is not useful after monitor request is accepted - * by the server. */ - json_destroy(idl->db.schema); - idl->db.schema = NULL; - break; - - case IDL_S_MONITORING_COND: - /* Conditional monitor clauses were updated. Send out - * the next condition changes, in any, immediately. 
*/ - ovsdb_idl_send_cond_change(idl); - idl->db.cond_seqno++; - break; - - case IDL_S_MONITORING: - case IDL_S_NO_SCHEMA: - default: - OVS_NOT_REACHED(); - } - } else if (ovsdb_idl_db_process_lock_replies(&idl->db, msg)) { - /* ovsdb_idl_db_process_lock_replies() did all the processing. */ - } else if (msg->type == JSONRPC_ERROR - && idl->state == IDL_S_MONITOR_COND_REQUESTED - && idl->request_id - && json_equal(idl->request_id, msg->id)) { - if (msg->error && msg->error->type == JSON_STRING - && !strcmp(json_string(msg->error), "unknown method")) { - /* Fall back to using "monitor" method. */ - json_destroy(idl->request_id); - idl->request_id = NULL; - ovsdb_idl_send_monitor_request(idl, &idl->db, false); - idl->state = IDL_S_MONITOR_REQUESTED; - } - } else if (msg->type == JSONRPC_ERROR - && idl->state == IDL_S_MONITORING_COND - && idl->request_id - && json_equal(idl->request_id, msg->id)) { - json_destroy(idl->request_id); - idl->request_id = NULL; - VLOG_ERR("%s: conditional monitor update failed", - jsonrpc_session_get_name(idl->session)); - idl->state = IDL_S_NO_SCHEMA; - } else if (msg->type == JSONRPC_ERROR - && idl->state == IDL_S_SCHEMA_REQUESTED - && idl->request_id - && json_equal(idl->request_id, msg->id)) { - json_destroy(idl->request_id); - idl->request_id = NULL; - VLOG_ERR("%s: requested schema not found", - jsonrpc_session_get_name(idl->session)); - idl->state = IDL_S_NO_SCHEMA; - } else if ((msg->type == JSONRPC_ERROR - || msg->type == JSONRPC_REPLY) - && ovsdb_idl_db_txn_process_reply(&idl->db, msg)) { - /* ovsdb_idl_txn_process_reply() did everything needful. */ - } else { - /* This can happen if ovsdb_idl_txn_destroy() is called to destroy - * a transaction before we receive the reply, so keep the log level - * low. 
*/ - VLOG_DBG("%s: received unexpected %s message", - jsonrpc_session_get_name(idl->session), - jsonrpc_msg_type_to_string(msg->type)); - } + ovsdb_idl_process_msg(idl, msg); jsonrpc_msg_destroy(msg); } ovsdb_idl_row_destroy_postprocess(&idl->db); @@ -710,7 +919,7 @@ bool ovsdb_idl_is_alive(const struct ovsdb_idl *idl) { return jsonrpc_session_is_alive(idl->session) && - idl->state != IDL_S_NO_SCHEMA; + idl->state != IDL_S_ERROR; } /* Returns the last error reported on a connection by 'idl'. The return value @@ -727,7 +936,7 @@ ovsdb_idl_get_last_error(const struct ovsdb_idl *idl) if (err) { return err; - } else if (idl->state == IDL_S_NO_SCHEMA) { + } else if (idl->state == IDL_S_ERROR) { return ENOENT; } else { return 0; @@ -1290,7 +1499,7 @@ ovsdb_idl_send_cond_change(struct ovsdb_idl *idl) * * XXX per-db request_id */ if (!jsonrpc_session_is_connected(idl->session) - || idl->state != IDL_S_MONITORING_COND + || idl->db.monitoring != OVSDB_IDL_MONITORING_COND || idl->request_id) { return; } @@ -1528,6 +1737,71 @@ ovsdb_idl_send_schema_request(struct ovsdb_idl *idl, db->class_->database)), NULL)); } + +static void +ovsdb_idl_send_db_change_aware(struct ovsdb_idl *idl) +{ + struct jsonrpc_msg *msg = jsonrpc_create_request( + "set_db_change_aware", json_array_create_1(json_boolean_create(true)), + NULL); + jsonrpc_session_send(idl->session, msg); +} + +static bool +ovsdb_idl_check_server_db(struct ovsdb_idl *idl) +{ + const struct serverrec_database *database; + SERVERREC_DATABASE_FOR_EACH (database, idl) { + if (uuid_is_zero(&idl->cid) + ? 
!strcmp(database->name, idl->db.class_->database) + : database->n_cid && uuid_equals(database->cid, &idl->cid)) { + break; + } + } + + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + const char *server_name = jsonrpc_session_get_name(idl->session); + bool ok = false; + if (!database) { + VLOG_INFO_RL(&rl, "%s: server does not have %s database", + server_name, idl->db.class_->database); + } else if (!strcmp(database->model, "clustered") + && jsonrpc_session_get_n_remotes(idl->session) > 1) { + uint64_t index = database->n_index ? *database->index : 0; + + if (!database->schema) { + VLOG_INFO("%s: clustered database server has not yet joined " + "cluster; trying another server", server_name); + } else if (!database->connected) { + VLOG_INFO("%s: clustered database server is disconnected " + "from cluster; trying another server", server_name); + } else if (idl->leader_only && !database->leader) { + VLOG_INFO("%s: clustered database server is not cluster " + "leader; trying another server", server_name); + } else if (index < idl->min_index) { + VLOG_WARN("%s: clustered database server has stale data; " + "trying another server", server_name); + } else { + idl->min_index = MAX(idl->min_index, index); + ok = true; + } + } else { + ok = true; + } + if (!ok) { + ovsdb_idl_retry(idl); + return false; + } + + if (idl->state == IDL_S_SERVER_MONITOR_COND_REQUESTED) { + json_destroy(idl->db.schema); + idl->db.schema = json_from_string(database->schema); + ovsdb_idl_send_monitor_request(idl, &idl->db, true); + ovsdb_idl_transition(idl, IDL_S_DATA_MONITOR_COND_REQUESTED); + } + return true; +} + static void log_error(struct ovsdb_error *error) { @@ -1719,6 +1993,40 @@ ovsdb_idl_db_parse_update_rpc(struct ovsdb_idl_db *db, return false; } +static bool +ovsdb_idl_handle_monitor_canceled(struct ovsdb_idl *idl, + struct ovsdb_idl_db *db, + const struct jsonrpc_msg *msg) +{ + if (msg->type != JSONRPC_NOTIFY + || strcmp(msg->method, "monitor_canceled") + || 
msg->params->type != JSON_ARRAY + || msg->params->u.array.n != 1 + || !json_equal(msg->params->u.array.elems[0], db->monitor_id)) { + return false; + } + + db->monitoring = OVSDB_IDL_NOT_MONITORING; + + /* Cancel the other monitor and restart the FSM from the top. + * + * Maybe a more sophisticated response would be better in some cases, but + * it doesn't seem worth optimizing yet. (Although this is already more + * sophisticated than just dropping the connection and reconnecting.) */ + struct ovsdb_idl_db *other_db = db == &idl->db ? &idl->server : &idl->db; + if (other_db->monitoring) { + jsonrpc_session_send( + idl->session, + jsonrpc_create_request( + "monitor_cancel", + json_array_create_1(json_clone(other_db->monitor_id)), NULL)); + other_db->monitoring = OVSDB_IDL_NOT_MONITORING; + } + ovsdb_idl_restart_fsm(idl); + + return true; +} + static struct ovsdb_error * ovsdb_idl_db_parse_update__(struct ovsdb_idl_db *db, const struct json *table_updates, @@ -2851,7 +3159,14 @@ static struct ovsdb_idl_table * ovsdb_idl_table_from_class(const struct ovsdb_idl *idl, const struct ovsdb_idl_table_class *table_class) { - return ovsdb_idl_db_table_from_class(&idl->db, table_class); + struct ovsdb_idl_table *table; + + table = ovsdb_idl_db_table_from_class(&idl->db, table_class); + if (!table) { + table = ovsdb_idl_db_table_from_class(&idl->server, table_class); + } + + return table; } /* Called by ovsdb-idlc generated code. 
*/ @@ -3867,7 +4182,7 @@ static void ovsdb_idl_txn_set_error_json(struct ovsdb_idl_txn *txn, const struct json *json) { - if (txn->error == NULL) { + if (json && txn->error == NULL) { txn->error = json_to_string(json, JSSF_SORT); } } @@ -4293,10 +4608,21 @@ ovsdb_idl_db_txn_process_reply(struct ovsdb_idl_db *db, } if (msg->type == JSONRPC_ERROR) { - status = TXN_ERROR; + if (msg->error + && msg->error->type == JSON_STRING + && !strcmp(json_string(msg->error), "canceled")) { + /* ovsdb-server uses this error message to indicate that the + * transaction was canceled because the database in question was + * removed, converted, etc. */ + status = TXN_TRY_AGAIN; + } else { + status = TXN_ERROR; + ovsdb_idl_txn_set_error_json(txn, msg->error); + } } else if (msg->result->type != JSON_ARRAY) { VLOG_WARN_RL(&syntax_rl, "reply to \"transact\" is not JSON array"); status = TXN_ERROR; + ovsdb_idl_txn_set_error_json(txn, msg->result); } else { struct json_array *ops = &msg->result->u.array; int hard_errors = 0; @@ -4468,8 +4794,7 @@ static void ovsdb_idl_db_update_has_lock(struct ovsdb_idl_db *db, bool new_has_lock) { if (new_has_lock && !db->has_lock) { - if (db->idl->state == IDL_S_MONITORING || - db->idl->state == IDL_S_MONITORING_COND) { + if (db->idl->state == IDL_S_MONITORING) { db->change_seqno++; } else { /* We're setting up a session, so don't signal that the database diff --git a/lib/ovsdb-idl.h b/lib/ovsdb-idl.h index 975f9402b3b4..2f5655227ac1 100644 --- a/lib/ovsdb-idl.h +++ b/lib/ovsdb-idl.h @@ -63,6 +63,8 @@ struct ovsdb_idl *ovsdb_idl_create(const char *remote, void ovsdb_idl_set_remote(struct ovsdb_idl *, const char *, bool); void ovsdb_idl_destroy(struct ovsdb_idl *); +void ovsdb_idl_set_leader_only(struct ovsdb_idl *, bool leader_only); + void ovsdb_idl_run(struct ovsdb_idl *); void ovsdb_idl_wait(struct ovsdb_idl *); diff --git a/lib/ovsdb-server-idl.ann b/lib/ovsdb-server-idl.ann new file mode 100644 index 000000000000..ffb945b9134c --- /dev/null +++ 
b/lib/ovsdb-server-idl.ann @@ -0,0 +1,9 @@ +# -*- python -*- + +# This code, when invoked by "ovsdb-idlc annotate" (by the build +# process), annotates vswitch.ovsschema with additional data that give +# the ovsdb-idl engine information about the types involved, so that +# it can generate more programmer-friendly data structures. + +s["idlPrefix"] = "serverrec_" +s["idlHeader"] = "\"lib/ovsdb-server-idl.h\"" diff --git a/lib/ovsdb-session.c b/lib/ovsdb-session.c new file mode 100644 index 000000000000..a8cb90f225e1 --- /dev/null +++ b/lib/ovsdb-session.c @@ -0,0 +1,76 @@ +/* Copyright (c) 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "ovsdb-session.h" +#include +#include +#include +#include "svec.h" +#include "util.h" +#include "uuid.h" + +static const char * +next_remote(const char *s) +{ + for (const char *delimiter = strchr(s, ','); delimiter; + delimiter = strchr(delimiter + 1, ',')) { + const char *p = delimiter + 1; + p += strspn(p, " \t"); + size_t n_letters = strspn(p, "abcdefghijklmnopqrstuvwxyz"); + if (n_letters && p[n_letters] == ':') { + return delimiter; + } + } + return NULL; +} + +/* Parses string 's' into comma-delimited substrings and adds each of them into + * 'remotes'. If one of the substrings is of the form "cid:", fills + * '*cid' with the UUID (and omits it from 'remotes'), otherwise initializes + * '*cid' to UUID_ZERO. 
*/ +void +ovsdb_session_parse_remote(const char *s, + struct svec *remotes, struct uuid *cid) +{ + *cid = UUID_ZERO; + for (;;) { + /* Skip white space. */ + s += strspn(s, " \t"); + if (*s == '\0') { + break; + } + + /* Find the start of the next remote */ + const char *delimiter = next_remote(s); + if (!delimiter) { + svec_add(remotes, s); + break; + } + svec_add_nocopy(remotes, xmemdup0(s, delimiter - s)); + s = delimiter + 1; + } + + size_t i; + for (i = 0; i < remotes->n; i++) { + const char *name = remotes->names[i]; + struct uuid uuid; + if (!strncmp(name, "cid:", 4) && uuid_from_string(&uuid, name + 4)) { + *cid = uuid; + svec_del(remotes, name); + break; + } + } +} diff --git a/lib/ovsdb-session.h b/lib/ovsdb-session.h new file mode 100644 index 000000000000..88835cd3dd85 --- /dev/null +++ b/lib/ovsdb-session.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OVSDB_SESSION_H +#define OVSDB_SESSION_H 1 + +struct svec; +struct uuid; + +void ovsdb_session_parse_remote(const char *s, + struct svec *remotes, struct uuid *cid); + +#endif /* ovsdb-session.h */ diff --git a/lib/uuid.h b/lib/uuid.h index 69a71cc60952..fa49354f6680 100644 --- a/lib/uuid.h +++ b/lib/uuid.h @@ -61,6 +61,18 @@ uuid_equals(const struct uuid *a, const struct uuid *b) && a->parts[3] == b->parts[3]); } +/* Returns the first 'n' hex digits of 'uuid', for 0 < 'n' <= 8. 
+ * + * This is useful for displaying a few leading digits of the uuid, e.g. to + * display 4 digits: + * printf("%04x", uuid_prefix(uuid, 4)); + */ +static inline unsigned int +uuid_prefix(const struct uuid *uuid, int digits) +{ + return (uuid->parts[0] >> (32 - 4 * digits)); +} + void uuid_init(void); void uuid_generate(struct uuid *); struct uuid uuid_random(void); diff --git a/ovn/controller/ovn-controller.c b/ovn/controller/ovn-controller.c index c286ccbcaf8d..103261853952 100644 --- a/ovn/controller/ovn-controller.c +++ b/ovn/controller/ovn-controller.c @@ -615,6 +615,7 @@ main(int argc, char *argv[]) char *ovnsb_remote = get_ovnsb_remote(ovs_idl_loop.idl); struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER( ovsdb_idl_create(ovnsb_remote, &sbrec_idl_class, true, true)); + ovsdb_idl_set_leader_only(ovnsb_idl_loop.idl, false); create_ovnsb_indexes(ovnsb_idl_loop.idl); lport_init(ovnsb_idl_loop.idl); diff --git a/ovn/utilities/ovn-ctl b/ovn/utilities/ovn-ctl index 0e56bf8c501b..2b3c9215e5fa 100755 --- a/ovn/utilities/ovn-ctl +++ b/ovn/utilities/ovn-ctl @@ -95,10 +95,37 @@ promote_ovnsb() { start_nb_ovsdb() { # Check and eventually start ovsdb-server for Northbound DB - if ! pidfile_is_running $DB_NB_PID; then + if pidfile_is_running $DB_NB_PID; then + return + fi + + mode="standalone" + + if test ! -z "$DB_NB_CLUSTER_LOCAL_ADDR"; then + mode="cluster" + elif test ! 
-z "$DB_NB_SYNC_FROM_ADDR"; then + mode="active_passive" + echo "$DB_NB_SYNC_FROM_PROTO:$DB_NB_SYNC_FROM_ADDR:\ +$DB_NB_SYNC_FROM_PORT" > $ovnnb_active_conf_file + fi + + if test X$mode != "Xcluster"; then upgrade_db "$DB_NB_FILE" "$DB_NB_SCHEMA" 1>/dev/null 2>/dev/null + else + if test -z "$DB_NB_CLUSTER_REMOTE_ADDR"; then + create_cluster "$DB_NB_FILE" "$DB_NB_SCHEMA" \ +"$DB_NB_CLUSTER_LOCAL_ADDR" + else + join_cluster "$DB_NB_FILE" "OVN_Northbound" \ +"$DB_NB_CLUSTER_LOCAL_ADDR" "$DB_NB_CLUSTER_REMOTE_ADDR" + fi + fi - set ovsdb-server + set ovsdb-server + set "$@" --detach --monitor + set "$@" $OVN_NB_LOG --log-file=$OVN_NB_LOGFILE + set "$@" --remote=punix:$DB_NB_SOCK --pidfile=$DB_NB_PID + set "$@" --unixctl=ovnnb_db.ctl if test X"$DB_NB_DETACH" != Xno; then set "$@" --detach --monitor @@ -108,37 +135,65 @@ start_nb_ovsdb() { set "$@" $OVN_NB_LOG --log-file=$OVN_NB_LOGFILE set "$@" --remote=punix:$DB_NB_SOCK --pidfile=$DB_NB_PID + # TODO (numans): Remove this 'if' once we have the fix to + # start ovsdb-server with the below options for the cluster db. + if test X$mode != "Xcluster"; then set "$@" --remote=db:OVN_Northbound,NB_Global,connections - set "$@" --unixctl=ovnnb_db.ctl set "$@" --private-key=db:OVN_Northbound,SSL,private_key set "$@" --certificate=db:OVN_Northbound,SSL,certificate set "$@" --ca-cert=db:OVN_Northbound,SSL,ca_cert set "$@" --ssl-protocols=db:OVN_Northbound,SSL,ssl_protocols set "$@" --ssl-ciphers=db:OVN_Northbound,SSL,ssl_ciphers + fi - if test X"$DB_NB_CREATE_INSECURE_REMOTE" = Xyes; then - set "$@" --remote=ptcp:$DB_NB_PORT:$DB_NB_ADDR - fi + if test X"$DB_NB_CREATE_INSECURE_REMOTE" = Xyes; then + set "$@" --remote=ptcp:$DB_NB_PORT:$DB_NB_ADDR + fi - if test ! 
-z "$DB_NB_SYNC_FROM_ADDR"; then - echo "$DB_NB_SYNC_FROM_PROTO:$DB_NB_SYNC_FROM_ADDR:$DB_NB_SYNC_FROM_PORT" > $ovnnb_active_conf_file - fi + if test -e $ovnnb_active_conf_file; then + set "$@" --sync-from=`cat $ovnnb_active_conf_file` + fi - if test -e $ovnnb_active_conf_file; then - set "$@" --sync-from=`cat $ovnnb_active_conf_file` - fi + $@ $DB_NB_FILE - $@ $DB_NB_FILE + if test -z "$DB_NB_CLUSTER_REMOTE_ADDR"; then ovn-nbctl init fi } start_sb_ovsdb() { - # Check and eventually start ovsdb-server for Southbound DB - if ! pidfile_is_running $DB_SB_PID; then + # Check and eventually start ovsdb-server for Northbound DB + if pidfile_is_running $DB_SB_PID; then + return + fi + + mode="standalone" + + if test ! -z "$DB_SB_CLUSTER_LOCAL_ADDR"; then + mode="cluster" + elif test ! -z "$DB_SB_SYNC_FROM_ADDR"; then + mode="active_passive" + echo "$DB_SB_SYNC_FROM_PROTO:$DB_SB_SYNC_FROM_ADDR:\ +$DB_SB_SYNC_FROM_PORT" > $ovnsb_active_conf_file + fi + + if test X$mode != "Xcluster"; then upgrade_db "$DB_SB_FILE" "$DB_SB_SCHEMA" 1>/dev/null 2>/dev/null + else + if test -z "$DB_SB_CLUSTER_REMOTE_ADDR"; then + create_cluster "$DB_SB_FILE" "$DB_SB_SCHEMA" \ +"$DB_SB_CLUSTER_LOCAL_ADDR" + else + join_cluster "$DB_SB_FILE" "OVN_Southbound" \ +"$DB_SB_CLUSTER_LOCAL_ADDR" "$DB_SB_CLUSTER_REMOTE_ADDR" + fi + fi - set ovsdb-server + set ovsdb-server + set "$@" --detach --monitor + set "$@" $OVN_SB_LOG --log-file=$OVN_SB_LOGFILE + set "$@" --remote=punix:$DB_SB_SOCK --pidfile=$DB_SB_PID + set "$@" --unixctl=ovnsb_db.ctl if test X"$DB_SB_DETACH" != Xno; then set "$@" --detach --monitor @@ -148,27 +203,28 @@ start_sb_ovsdb() { set "$@" $OVN_SB_LOG --log-file=$OVN_SB_LOGFILE set "$@" --remote=punix:$DB_SB_SOCK --pidfile=$DB_SB_PID + # TODO (numans): Remove this 'if' once we have the fix to + # start ovsdb-server with the below options for the cluster db. 
+ if test X$mode != "Xcluster"; then set "$@" --remote=db:OVN_Southbound,SB_Global,connections - set "$@" --unixctl=ovnsb_db.ctl set "$@" --private-key=db:OVN_Southbound,SSL,private_key set "$@" --certificate=db:OVN_Southbound,SSL,certificate set "$@" --ca-cert=db:OVN_Southbound,SSL,ca_cert set "$@" --ssl-protocols=db:OVN_Southbound,SSL,ssl_protocols set "$@" --ssl-ciphers=db:OVN_Southbound,SSL,ssl_ciphers + fi - if test X"$DB_SB_CREATE_INSECURE_REMOTE" = Xyes; then - set "$@" --remote=ptcp:$DB_SB_PORT:$DB_SB_ADDR - fi + if test X"$DB_SB_CREATE_INSECURE_REMOTE" = Xyes; then + set "$@" --remote=ptcp:$DB_SB_PORT:$DB_SB_ADDR + fi - if test ! -z "$DB_SB_SYNC_FROM_ADDR"; then - echo "$DB_SB_SYNC_FROM_PROTO:$DB_SB_SYNC_FROM_ADDR:$DB_SB_SYNC_FROM_PORT" > $ovnsb_active_conf_file - fi + if test -e $ovnsb_active_conf_file; then + set "$@" --sync-from=`cat $ovnsb_active_conf_file` + fi - if test -e $ovnsb_active_conf_file; then - set "$@" --sync-from=`cat $ovnsb_active_conf_file` - fi + $@ $DB_SB_FILE - $@ $DB_SB_FILE + if test -z "$DB_SB_CLUSTER_REMOTE_ADDR"; then ovn-sbctl init fi } @@ -406,6 +462,11 @@ set_defaults () { DB_NB_DETACH="yes" DB_SB_DETACH="yes" + + DB_NB_CLUSTER_LOCAL_ADDR="" + DB_NB_CLUSTER_REMOTE_ADDR="" + DB_SB_CLUSTER_LOCAL_ADDR="" + DB_SB_CLUSTER_REMOTE_ADDR="" } set_option () { @@ -494,6 +555,14 @@ File location options: --db-sb-sync-from-port=ADDR OVN Southbound active db tcp port (default: $DB_SB_SYNC_FROM_PORT) --db-sb-sync-from-proto=PROTO OVN Southbound active db transport (default: $DB_SB_SYNC_FROM_PROTO) --db-sb-create-insecure-remote=yes|no Create ptcp OVN Southbound remote (default: $DB_SB_CREATE_INSECURE_REMOTE) + --db-nb-cluster-local-addr=ADDR OVN_Northbound cluster local address \ + (default: $DB_NB_CLUSTER_LOCAL_ADDR) + --db-nb-cluster-remote-addr=ADDR OVN_Northbound cluster remote address \ + (default: $DB_NB_CLUSTER_REMOTE_ADDR) + --db-sb-cluster-local-addr=ADDR OVN_Southbound cluster local address \ + (default: $DB_SB_CLUSTER_LOCAL_ADDR) 
+ --db-sb-cluster-remote-addr=ADDR OVN_Southbound cluster remote address \ + (default: $DB_SB_CLUSTER_REMOTE_ADDR) Default directories with "configure" option and environment variable override: logs: /usr/local/var/log/openvswitch (--with-logdir, OVS_LOGDIR) diff --git a/ovn/utilities/ovn-nbctl.8.xml b/ovn/utilities/ovn-nbctl.8.xml index 3688d35b37c3..a8fdb7125dc7 100644 --- a/ovn/utilities/ovn-nbctl.8.xml +++ b/ovn/utilities/ovn-nbctl.8.xml @@ -919,6 +919,21 @@ default is unlikely to be useful outside of single-machine OVN test environments. + +
--leader-only
+
--no-leader-only
+
+ By default, or with --leader-only, when the database server + is a clustered database, ovn-nbctl will avoid servers other + than the cluster leader. This ensures that any data that + ovn-nbctl reads and reports is up-to-date. With + --no-leader-only, ovn-nbctl will use any server + in the cluster, which means that for read-only transactions it can report + and act on stale data (transactions that modify the database are always + serialized even with --no-leader-only). Refer to + Understanding Cluster Consistency in ovsdb(7) + for more information. +

Logging options

diff --git a/ovn/utilities/ovn-nbctl.c b/ovn/utilities/ovn-nbctl.c index c920ad878ab6..fccbc7cdd9fe 100644 --- a/ovn/utilities/ovn-nbctl.c +++ b/ovn/utilities/ovn-nbctl.c @@ -76,6 +76,9 @@ static struct ovsdb_idl *the_idl; static struct ovsdb_idl_txn *the_idl_txn; OVS_NO_RETURN static void nbctl_exit(int status); +/* --leader-only, --no-leader-only: Only accept the leader in a cluster. */ +static int leader_only = true; + static void nbctl_cmd_init(void); OVS_NO_RETURN static void usage(void); static void parse_options(int argc, char *argv[], struct shash *local_options); @@ -120,6 +123,7 @@ main(int argc, char *argv[]) /* Initialize IDL. */ idl = the_idl = ovsdb_idl_create(db, &nbrec_idl_class, true, false); + ovsdb_idl_set_leader_only(idl, leader_only); run_prerequisites(commands, n_commands, idl); /* Execute the commands. @@ -182,6 +186,8 @@ parse_options(int argc, char *argv[], struct shash *local_options) {"help", no_argument, NULL, 'h'}, {"commands", no_argument, NULL, OPT_COMMANDS}, {"options", no_argument, NULL, OPT_OPTIONS}, + {"leader-only", no_argument, &leader_only, true}, + {"no-leader-only", no_argument, &leader_only, false}, {"version", no_argument, NULL, 'V'}, VLOG_LONG_OPTIONS, STREAM_SSL_LONG_OPTIONS, @@ -300,6 +306,9 @@ parse_options(int argc, char *argv[], struct shash *local_options) default: abort(); + + case 0: + break; } } free(short_options); diff --git a/ovn/utilities/ovn-sbctl.8.in b/ovn/utilities/ovn-sbctl.8.in index cd43cf3beb6f..6e1c69350918 100644 --- a/ovn/utilities/ovn-sbctl.8.in +++ b/ovn/utilities/ovn-sbctl.8.in @@ -61,6 +61,19 @@ environments. \fIserver\fR may be an OVSDB active or passive connection method, e.g. \fBssl:192.168.10.5:6640\fR, as described in \fBovsdb\fR(7). . +.IP "\fB\-\-leader\-only\fR" +.IQ "\fB\-\-no\-leader\-only\fR" +By default, or with \fB\-\-leader\-only\fR, when the database server +is a clustered database, \fBovn\-sbctl\fR will avoid servers other +than the cluster leader. 
This ensures that any data that +\fBovn\-sbctl\fR reads and reports is up-to-date. With +\fB\-\-no\-leader\-only\fR, \fBovn\-sbctl\fR will use any server in +the cluster, which means that for read-only transactions it can report +and act on stale data (transactions that modify the database are +always serialized even with \fB\-\-no\-leader\-only\fR). Refer to +\fBUnderstanding Cluster Consistency\fR in \fBovsdb\fR(7) for more +information. +. .IP "\fB\-\-no\-syslog\fR" By default, \fBovn\-sbctl\fR logs its arguments and the details of any changes that it makes to the system log. This option disables this diff --git a/ovn/utilities/ovn-sbctl.c b/ovn/utilities/ovn-sbctl.c index f16cefedd897..9f09b83e8454 100644 --- a/ovn/utilities/ovn-sbctl.c +++ b/ovn/utilities/ovn-sbctl.c @@ -80,6 +80,9 @@ static struct ovsdb_idl *the_idl; static struct ovsdb_idl_txn *the_idl_txn; OVS_NO_RETURN static void sbctl_exit(int status); +/* --leader-only, --no-leader-only: Only accept the leader in a cluster. */ +static int leader_only = true; + static void sbctl_cmd_init(void); OVS_NO_RETURN static void usage(void); static void parse_options(int argc, char *argv[], struct shash *local_options); @@ -121,6 +124,7 @@ main(int argc, char *argv[]) /* Initialize IDL. */ idl = the_idl = ovsdb_idl_create(db, &sbrec_idl_class, false, true); + ovsdb_idl_set_leader_only(idl, leader_only); run_prerequisites(commands, n_commands, idl); /* Execute the commands. 
@@ -178,6 +182,8 @@ parse_options(int argc, char *argv[], struct shash *local_options) {"help", no_argument, NULL, 'h'}, {"commands", no_argument, NULL, OPT_COMMANDS}, {"options", no_argument, NULL, OPT_OPTIONS}, + {"leader-only", no_argument, &leader_only, true}, + {"no-leader-only", no_argument, &leader_only, false}, {"version", no_argument, NULL, 'V'}, VLOG_LONG_OPTIONS, STREAM_SSL_LONG_OPTIONS, @@ -273,6 +279,9 @@ parse_options(int argc, char *argv[], struct shash *local_options) default: abort(); + + case 0: + break; } } free(short_options); diff --git a/ovsdb/TODO.rst b/ovsdb/TODO.rst new file mode 100644 index 000000000000..90a4e77a1672 --- /dev/null +++ b/ovsdb/TODO.rst @@ -0,0 +1,61 @@ +.. + Licensed under the Apache License, Version 2.0 (the "License"); you may + not use this file except in compliance with the License. You may obtain + a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + License for the specific language governing permissions and limitations + under the License. + + Convention for heading levels in Open vSwitch documentation: + + ======= Heading 0 (reserved for the title in a document) + ------- Heading 1 + ~~~~~~~ Heading 2 + +++++++ Heading 3 + ''''''' Heading 4 + + Avoid deeper levels because they do not render well. + +=========================== +OVSDB Clustering To-do List +=========================== + +* Unit test snapshotting. + +* Locks. + +* Investigate 100% CPU for long-running triggers + +* Tons of unit tests. + +* Increase exponential backoff cap. Introduce randomization. + +* Include index with monitor update? + +* Back off when transaction fails to commit? Definitely back off until + the eid changes for prereq failures + +* Testing with replication. 
+ +* Handling bad transactions in read_db(). (Kill the database?) + +* Documentation: + + * ACID (and CAP?) explanation. + + * Upgrading OVN to a clustered database + + * Installing OVN with a clustered database + + * Overall diagram explaining the cluster and ovsdb protocol pieces + +* Future work: + + * File format with diff support. + + * Future work: DNS or directory support diff --git a/ovsdb/_server.ovsschema b/ovsdb/_server.ovsschema index 8997bae5fa36..a867e5cbf256 100644 --- a/ovsdb/_server.ovsschema +++ b/ovsdb/_server.ovsschema @@ -1,9 +1,21 @@ {"name": "_Server", - "version": "1.0.0", - "cksum": "3931859656 185", + "version": "1.1.0", + "cksum": "3236486585 698", "tables": { "Database": { "columns": { "name": {"type": "string"}, - "schema": {"type": "string"}}, + "model": { + "type": {"key": {"type": "string", + "enum": ["set", ["standalone", "clustered"]]}}}, + "connected": {"type": "boolean"}, + "leader": {"type": "boolean"}, + "schema": { + "type": {"key": {"type": "string"}, "min": 0, "max": 1}}, + "cid": { + "type": {"key": {"type": "uuid"}, "min": 0, "max": 1}}, + "sid": { + "type": {"key": {"type": "uuid"}, "min": 0, "max": 1}}, + "index": { + "type": {"key": {"type": "integer"}, "min": 0, "max": 1}}}, "isRoot": true}}} diff --git a/ovsdb/_server.xml b/ovsdb/_server.xml index 8ef782fb97b2..e4536671ccbe 100644 --- a/ovsdb/_server.xml +++ b/ovsdb/_server.xml @@ -37,13 +37,13 @@

When a database is removed from the server, in addition to - Database table updates, the server sends cancel - messages, as described in RFC 7047 section 4.1.4, in reply to outstanding - transactions for the removed database. The server also cancels any - outstanding monitoring initiated by monitor or - monitor_cond requested on the removed database, sending the - monitor_canceled RPC described in - ovsdb-server(5). Only clients that disable disconnection + Database table updates, the server sends + canceled messages, as described in RFC 7047 section 4.1.4, + in reply to outstanding transactions for the removed database. The + server also cancels any outstanding monitoring initiated by + monitor or monitor_cond requested on the + removed database, sending the monitor_canceled RPC described + in ovsdb-server(5). Only clients that disable disconnection with set_db_change_aware receive these messages.

@@ -58,8 +58,63 @@ The database's name, as specified in its schema. + + The storage model: standalone for a standalone or + active-backup database, clustered for a clustered database. + + - The database schema, as a JSON string. + The database schema, as a JSON string. Until a clustered database + finishes joining its cluster, this is empty. + + +

+ These columns are most interesting and in some cases only relevant for + clustered databases, that is, those where the model + column is clustered. +

+ + + True if the database is connected to its storage. A standalone or + active-backup database is always connected. A clustered database is + connected if the server is in contact with a majority of its cluster. + An unconnected database cannot be modified and its data might be + unavailable or stale. + + + + True if the database is the leader in its cluster. For a standalone or + active-backup database, this is always true. + + + + The cluster ID for this database, which is the same for all of the + servers that host this particular clustered database. For a standalone + or active-backup database, this is empty. + + + + The server ID for this database, different for each server that hosts a + particular clustered database. A server that hosts more than one + clustered database will have a different sid in each one. + For a standalone or active-backup database, this is empty. + + + +

+ For a clustered database, the index of the log entry currently + exposed to clients. For a given server, this increases + monotonically. When a client switches from one server to another in + a cluster, it can ensure that it never sees an older snapshot of data + by avoiding servers that have an index less than the + largest value they have already observed. +

+ +

+ For a standalone or active-backup database, this is empty. +

+
+
diff --git a/ovsdb/automake.mk b/ovsdb/automake.mk index c90e2e5b77f9..4d8909034afa 100644 --- a/ovsdb/automake.mk +++ b/ovsdb/automake.mk @@ -24,6 +24,12 @@ ovsdb_libovsdb_la_SOURCES = \ ovsdb/monitor.h \ ovsdb/query.c \ ovsdb/query.h \ + ovsdb/raft.c \ + ovsdb/raft.h \ + ovsdb/raft-private.c \ + ovsdb/raft-private.h \ + ovsdb/raft-rpc.c \ + ovsdb/raft-rpc.h \ ovsdb/rbac.c \ ovsdb/rbac.h \ ovsdb/replication.c \ @@ -32,6 +38,8 @@ ovsdb_libovsdb_la_SOURCES = \ ovsdb/row.h \ ovsdb/server.c \ ovsdb/server.h \ + ovsdb/storage.c \ + ovsdb/storage.h \ ovsdb/table.c \ ovsdb/table.h \ ovsdb/trigger.c \ @@ -135,3 +143,5 @@ ovsdb/ovsdb-server.5: \ $(srcdir)/ovsdb/_server.ovsschema \ $(srcdir)/ovsdb/_server.xml > $@.tmp && \ mv $@.tmp $@ + +EXTRA_DIST += ovsdb/TODO.rst diff --git a/ovsdb/execution.c b/ovsdb/execution.c index 6a62ba2ef3c2..38303c7dbed1 100644 --- a/ovsdb/execution.c +++ b/ovsdb/execution.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ #include +#include "ovsdb.h" + #include #include "column.h" @@ -25,7 +27,6 @@ #include "ovsdb-data.h" #include "ovsdb-error.h" #include "ovsdb-parser.h" -#include "ovsdb.h" #include "query.h" #include "rbac.h" #include "row.h" @@ -97,11 +98,20 @@ lookup_executor(const char *name, bool *read_only) return NULL; } -struct json * -ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session, - const struct json *params, bool read_only, - const char *role, const char *id, - long long int elapsed_msec, long long int *timeout_msec) +/* On success, returns a transaction and stores the results to return to the + * client in '*resultsp'. + * + * On failure, returns NULL. If '*resultsp' is nonnull, then it is the results + * to return to the client. 
If '*resultsp' is null, then the execution failed + * due to an unsatisfied "wait" operation and '*timeout_msec' is the time at + * which the transaction will time out. (If 'timeout_msec' is null, this case + * never occurs--instead, an unsatisfied "wait" unconditionally fails.) */ +struct ovsdb_txn * +ovsdb_execute_compose(struct ovsdb *db, const struct ovsdb_session *session, + const struct json *params, bool read_only, + const char *role, const char *id, + long long int elapsed_msec, long long int *timeout_msec, + bool *durable, struct json **resultsp) { struct ovsdb_execution x; struct ovsdb_error *error; @@ -109,6 +119,7 @@ ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session, size_t n_operations; size_t i; + *durable = false; if (params->type != JSON_ARRAY || !params->u.array.n || params->u.array.elems[0]->type != JSON_STRING @@ -120,7 +131,8 @@ ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session, "as first parameter"); } - return ovsdb_error_to_json_free(error); + *resultsp = ovsdb_error_to_json_free(error); + return NULL; } x.db = db; @@ -188,43 +200,56 @@ ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session, } if (error) { json_destroy(result); - result = ovsdb_error_to_json(error); - } - if (error && !strcmp(ovsdb_error_get_tag(error), "not supported") - && timeout_msec) { - ovsdb_txn_abort(x.txn); - *timeout_msec = x.timeout_msec; - - json_destroy(result); - json_destroy(results); - results = NULL; - goto exit; - } - - /* Add result to array. 
*/ - json_array_add(results, result); - if (error) { - break; - } - } - - if (!error) { - error = ovsdb_txn_commit(x.txn, x.durable); - if (error) { json_array_add(results, ovsdb_error_to_json(error)); + if (!strcmp(ovsdb_error_get_tag(error), "not supported") + && timeout_msec) { + *timeout_msec = x.timeout_msec; + json_destroy(results); + results = NULL; + goto exit; + } + break; } - } else { - ovsdb_txn_abort(x.txn); + json_array_add(results, result); } - while (json_array(results)->n < n_operations) { json_array_add(results, json_null_create()); } exit: - ovsdb_error_destroy(error); + if (error) { + ovsdb_txn_abort(x.txn); + x.txn = NULL; + + ovsdb_error_destroy(error); + } + *resultsp = results; + *durable = x.durable; ovsdb_symbol_table_destroy(x.symtab); + return x.txn; +} + +struct json * +ovsdb_execute(struct ovsdb *db, const struct ovsdb_session *session, + const struct json *params, bool read_only, + const char *role, const char *id, + long long int elapsed_msec, long long int *timeout_msec) +{ + bool durable; + struct json *results; + struct ovsdb_txn *txn = ovsdb_execute_compose( + db, session, params, read_only, role, id, elapsed_msec, timeout_msec, + &durable, &results); + if (!txn) { + return results; + } + + struct ovsdb_error *error = ovsdb_txn_propose_commit_block(txn, durable); + if (error) { + json_array_add(results, ovsdb_error_to_json(error)); + ovsdb_error_destroy(error); + } return results; } diff --git a/ovsdb/file.c b/ovsdb/file.c index dadb988d3088..d01e54fbe6ae 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -30,6 +30,7 @@ #include "ovsdb-error.h" #include "row.h" #include "socket-util.h" +#include "storage.h" #include "table.h" #include "timeval.h" #include "transaction.h" @@ -39,13 +40,6 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_file); -/* Minimum number of milliseconds between database compactions. */ -#define COMPACT_MIN_MSEC (10 * 60 * 1000) /* 10 minutes. 
*/ - -/* Minimum number of milliseconds between trying to compact the database if - * compacting fails. */ -#define COMPACT_RETRY_MSEC (60 * 1000) /* 1 minute. */ - /* A transaction being converted to JSON for writing to a file. */ struct ovsdb_file_txn { struct json *json; /* JSON for the whole transaction. */ @@ -58,215 +52,6 @@ static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *, const struct ovsdb_row *old, const struct ovsdb_row *new, const unsigned long int *changed); -static struct ovsdb_error *ovsdb_file_txn_commit(struct json *, - const char *comment, - bool durable, - struct ovsdb_log *); - -static struct ovsdb_error *ovsdb_file_open__(const char *file_name, - const struct ovsdb_schema *, - bool read_only, struct ovsdb **, - struct ovsdb_file **); -static struct ovsdb_error *ovsdb_file_txn_from_json( - struct ovsdb *, const struct json *, bool converting, struct ovsdb_txn **); -static struct ovsdb_error *ovsdb_file_create(struct ovsdb *, - struct ovsdb_log *, - const char *file_name, - unsigned int n_transactions, - struct ovsdb_file **filep); - -/* Opens database 'file_name' and stores a pointer to the new database in - * '*dbp'. If 'read_only' is false, then the database will be locked and - * changes to the database will be written to disk. If 'read_only' is true, - * the database will not be locked and changes to the database will persist - * only as long as the "struct ovsdb". - * - * If 'filep' is nonnull and 'read_only' is false, then on success sets - * '*filep' to an ovsdb_file that represents the open file. This ovsdb_file - * persists until '*dbp' is destroyed. - * - * On success, returns NULL. On failure, returns an ovsdb_error (which the - * caller must destroy) and sets '*dbp' and '*filep' to NULL. 
*/ -struct ovsdb_error * -ovsdb_file_open(const char *file_name, bool read_only, - struct ovsdb **dbp, struct ovsdb_file **filep) -{ - return ovsdb_file_open__(file_name, NULL, read_only, dbp, filep); -} - -/* Opens database 'file_name' with an alternate schema. The specified 'schema' - * is used to interpret the data in 'file_name', ignoring the schema actually - * stored in the file. Data in the file for tables or columns that do not - * exist in 'schema' are ignored, but the ovsdb file format must otherwise be - * observed, including column constraints. - * - * This function can be useful for upgrading or downgrading databases to - * "almost-compatible" formats. - * - * The database will not be locked. Changes to the database will persist only - * as long as the "struct ovsdb". - * - * On success, stores a pointer to the new database in '*dbp' and returns a - * null pointer. On failure, returns an ovsdb_error (which the caller must - * destroy) and sets '*dbp' to NULL. */ -struct ovsdb_error * -ovsdb_file_open_as_schema(const char *file_name, - const struct ovsdb_schema *schema, - struct ovsdb **dbp) -{ - return ovsdb_file_open__(file_name, schema, true, dbp, NULL); -} - -static struct ovsdb_error * -ovsdb_file_open_log(const char *file_name, enum ovsdb_log_open_mode open_mode, - struct ovsdb_log **logp, struct ovsdb_schema **schemap) -{ - struct ovsdb_schema *schema = NULL; - struct ovsdb_log *log = NULL; - struct ovsdb_error *error; - struct json *json = NULL; - - ovs_assert(logp || schemap); - - error = ovsdb_log_open(file_name, OVSDB_MAGIC, open_mode, -1, &log); - if (error) { - goto error; - } - - error = ovsdb_log_read(log, &json); - if (error) { - goto error; - } else if (!json) { - error = ovsdb_io_error(EOF, "%s: database file contains no schema", - file_name); - goto error; - } - - if (schemap) { - error = ovsdb_schema_from_json(json, &schema); - if (error) { - error = ovsdb_wrap_error(error, - "failed to parse \"%s\" as ovsdb schema", - file_name); - 
goto error; - } - } - json_destroy(json); - - if (logp) { - *logp = log; - } else { - ovsdb_log_close(log); - } - if (schemap) { - *schemap = schema; - } - return NULL; - -error: - ovsdb_log_close(log); - json_destroy(json); - if (logp) { - *logp = NULL; - } - if (schemap) { - *schemap = NULL; - } - return error; -} - -static struct ovsdb_error * -ovsdb_file_open__(const char *file_name, - const struct ovsdb_schema *alternate_schema, - bool read_only, struct ovsdb **dbp, - struct ovsdb_file **filep) -{ - enum ovsdb_log_open_mode open_mode; - struct ovsdb_schema *schema = NULL; - struct ovsdb_error *error; - struct ovsdb_log *log; - struct json *json; - struct ovsdb *db = NULL; - - /* In read-only mode there is no ovsdb_file so 'filep' must be null. */ - ovs_assert(!(read_only && filep)); - - open_mode = read_only ? OVSDB_LOG_READ_ONLY : OVSDB_LOG_READ_WRITE; - error = ovsdb_file_open_log(file_name, open_mode, &log, - alternate_schema ? NULL : &schema); - if (error) { - goto error; - } - - db = ovsdb_create(schema ? schema : ovsdb_schema_clone(alternate_schema)); - - /* When a log gets big, we compact it into a new log that initially has - * only a single transaction that represents the entire state of the - * database. Thus, we consider the first transaction in the database to be - * the snapshot. We measure its size to later influence the minimum log - * size before compacting again. - * - * The schema precedes the snapshot in the log; we could compensate for its - * size, but it's just not that important. 
*/ - unsigned int n_transactions = 0; - while ((error = ovsdb_log_read(log, &json)) == NULL && json) { - struct ovsdb_txn *txn; - - error = ovsdb_file_txn_from_json(db, json, alternate_schema != NULL, - &txn); - json_destroy(json); - if (error) { - ovsdb_log_unread(log); - break; - } - - n_transactions++; - error = ovsdb_txn_commit(txn, false); - if (error) { - ovsdb_log_unread(log); - break; - } - - if (n_transactions == 1) { - ovsdb_log_mark_base(log); - } - } - if (error) { - /* Log error but otherwise ignore it. Probably the database just got - * truncated due to power failure etc. and we should use its current - * contents. */ - char *msg = ovsdb_error_to_string_free(error); - VLOG_ERR("%s", msg); - free(msg); - } - - if (!read_only) { - struct ovsdb_file *file; - - error = ovsdb_file_create(db, log, file_name, n_transactions, &file); - if (error) { - goto error; - } - if (filep) { - *filep = file; - } - db->file = file; - } else { - ovsdb_log_close(log); - } - - *dbp = db; - return NULL; - -error: - *dbp = NULL; - if (filep) { - *filep = NULL; - } - ovsdb_destroy(db); - ovsdb_log_close(log); - return error; -} static struct ovsdb_error * ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, @@ -377,7 +162,7 @@ ovsdb_file_txn_table_from_json(struct ovsdb_txn *txn, * If 'converting' is true, then unknown table and column names are ignored * (which can ease upgrading and downgrading schemas); otherwise, they are * treated as errors. 
*/ -static struct ovsdb_error * +struct ovsdb_error * ovsdb_file_txn_from_json(struct ovsdb *db, const struct json *json, bool converting, struct ovsdb_txn **txnp) { @@ -425,123 +210,92 @@ error: return error; } -static struct ovsdb_error * -ovsdb_file_save_copy__(const char *file_name, int locking, - const char *comment, const struct ovsdb *db, - struct ovsdb_log **logp) +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_convert_table(struct ovsdb_txn *txn, + const struct ovsdb_table *src_table, + struct ovsdb_table *dst_table) { - const struct shash_node *node; - struct ovsdb_file_txn ftxn; - struct ovsdb_error *error; - struct ovsdb_log *log; - struct json *json; - - error = ovsdb_log_open(file_name, OVSDB_MAGIC, - OVSDB_LOG_CREATE_EXCL, locking, &log); - if (error) { - return error; - } + const struct ovsdb_row *src_row; + HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) { + struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); + *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); - /* Write schema. */ - json = ovsdb_schema_to_json(db->schema); - error = ovsdb_log_write(log, json); - json_destroy(json); - if (error) { - goto exit; - } + struct shash_node *node; + SHASH_FOR_EACH (node, &src_table->schema->columns) { + const struct ovsdb_column *src_column = node->data; + if (src_column->index == OVSDB_COL_UUID || + src_column->index == OVSDB_COL_VERSION) { + continue; + } - /* Write data. 
*/ - ovsdb_file_txn_init(&ftxn); - SHASH_FOR_EACH (node, &db->tables) { - const struct ovsdb_table *table = node->data; - const struct ovsdb_row *row; + const struct ovsdb_column *dst_column + = shash_find_data(&dst_table->schema->columns, + src_column->name); + if (!dst_column) { + continue; + } - HMAP_FOR_EACH (row, hmap_node, &table->rows) { - ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL); + struct ovsdb_error *error = ovsdb_datum_convert( + &dst_row->fields[dst_column->index], &dst_column->type, + &src_row->fields[src_column->index], &src_column->type); + if (error) { + ovsdb_row_destroy(dst_row); + return error; + } } - } - error = ovsdb_file_txn_commit(ftxn.json, comment, true, log); -exit: - if (logp) { - if (!error) { - *logp = log; - log = NULL; - } else { - *logp = NULL; - } - } - ovsdb_log_close(log); - if (error) { - remove(file_name); + ovsdb_txn_row_insert(txn, dst_row); } - return error; + return NULL; } -/* Saves a snapshot of 'db''s current contents as 'file_name'. If 'comment' is - * nonnull, then it is added along with the data contents and can be viewed - * with "ovsdb-tool show-log". - * - * 'locking' is passed along to ovsdb_log_open() untouched. */ -struct ovsdb_error * -ovsdb_file_save_copy(const char *file_name, int locking, - const char *comment, const struct ovsdb *db) +/* Copies the data in 'src', converts it into the schema specified in + * 'new_schema', and puts it into a newly created, unbacked database, and + * stores a pointer to the new database in '*dstp'. Returns null if + * successful, otherwise an error; on error, stores NULL in '*dstp'. 
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema, + struct ovsdb **dstp) { - return ovsdb_file_save_copy__(file_name, locking, comment, db, NULL); -} + struct ovsdb *dst = ovsdb_create(ovsdb_schema_clone(new_schema), + ovsdb_storage_create_unbacked()); + struct ovsdb_txn *txn = ovsdb_txn_create(dst); + struct ovsdb_error *error = NULL; -/* Opens database 'file_name', reads its schema, and closes it. On success, - * stores the schema into '*schemap' and returns NULL; the caller then owns the - * schema. On failure, returns an ovsdb_error (which the caller must destroy) - * and sets '*dbp' to NULL. */ -struct ovsdb_error * -ovsdb_file_read_schema(const char *file_name, struct ovsdb_schema **schemap) -{ - ovs_assert(schemap != NULL); - return ovsdb_file_open_log(file_name, OVSDB_LOG_READ_ONLY, NULL, schemap); -} - -struct ovsdb_file { - struct ovsdb *db; - struct ovsdb_log *log; - char *file_name; - long long int last_compact; - long long int next_compact; - unsigned int n_transactions; -}; + struct shash_node *node; + SHASH_FOR_EACH (node, &src->tables) { + const char *table_name = node->name; + struct ovsdb_table *src_table = node->data; + struct ovsdb_table *dst_table = shash_find_data(&dst->tables, + table_name); + if (!dst_table) { + continue; + } -static struct ovsdb_error * -ovsdb_file_create(struct ovsdb *db, struct ovsdb_log *log, - const char *file_name, - unsigned int n_transactions, struct ovsdb_file **filep) -{ - struct ovsdb_file *file; - char *deref_name; - char *abs_name; - - /* Use the absolute name of the file because ovsdb-server opens its - * database before daemonize() chdirs to "/". 
*/ - deref_name = follow_symlinks(file_name); - abs_name = abs_file_name(NULL, deref_name); - free(deref_name); - if (!abs_name) { - *filep = NULL; - return ovsdb_io_error(0, "could not determine current " - "working directory"); + error = ovsdb_convert_table(txn, src_table, dst_table); + if (error) { + goto error; + } } - file = xmalloc(sizeof *file); - file->db = db; - file->log = log; - file->file_name = abs_name; - file->last_compact = time_msec(); - file->next_compact = file->last_compact + COMPACT_MIN_MSEC; - file->n_transactions = n_transactions; + error = ovsdb_txn_replay_commit(txn); + if (error) { + txn = NULL; /* ovsdb_txn_replay_commit() already aborted. */ + goto error; + } - *filep = file; + *dstp = dst; return NULL; -} +error: + ovsdb_destroy(dst); + if (txn) { + ovsdb_txn_abort(txn); + } + *dstp = NULL; + return error; +} + static bool ovsdb_file_change_cb(const struct ovsdb_row *old, const struct ovsdb_row *new, @@ -554,22 +308,29 @@ ovsdb_file_change_cb(const struct ovsdb_row *old, } struct json * -ovsdb_file_txn_annotate(struct json *json, const char *comment) +ovsdb_to_txn_json(const struct ovsdb *db, const char *comment) { - if (!json) { - json = json_object_create(); - } - if (comment) { - json_object_put_string(json, "_comment", comment); + struct ovsdb_file_txn ftxn; + + ovsdb_file_txn_init(&ftxn); + + struct shash_node *node; + SHASH_FOR_EACH (node, &db->tables) { + const struct ovsdb_table *table = node->data; + const struct ovsdb_row *row; + + HMAP_FOR_EACH (row, hmap_node, &table->rows) { + ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL); + } } - json_object_put(json, "_date", json_integer_create(time_wall_msec())); - return json; + + return ovsdb_file_txn_annotate(ftxn.json, comment); } /* Returns 'txn' transformed into the JSON format that is used in OVSDB files. * (But the caller must use ovsdb_file_txn_annotate() to add the _comment the * _date members.) 
If 'txn' doesn't actually change anything, returns NULL */ -static struct json * +struct json * ovsdb_file_txn_to_json(const struct ovsdb_txn *txn) { struct ovsdb_file_txn ftxn; @@ -579,192 +340,17 @@ ovsdb_file_txn_to_json(const struct ovsdb_txn *txn) return ftxn.json; } -struct ovsdb_error * -ovsdb_file_commit(struct ovsdb_file *file, - const struct ovsdb_txn *txn, bool durable) -{ - struct json *txn_json = ovsdb_file_txn_to_json(txn); - if (!txn_json) { - /* Nothing to commit. */ - return NULL; - } - - struct ovsdb_error *error = ovsdb_file_txn_commit( - txn_json, ovsdb_txn_get_comment(txn), durable, file->log); - if (error) { - return error; - } - file->n_transactions++; - - /* If it has been at least COMPACT_MIN_MSEC ms since the last time we - * compacted (or at least COMPACT_RETRY_MSEC ms since the last time we - * tried), and if there are at least 100 transactions in the database, and - * if the database is at least 10 MB, and the database is at least 4x the - * size of the previous snapshot, then compact the database. */ - if (time_msec() >= file->next_compact - && file->n_transactions >= 100 - && ovsdb_log_grew_lots(file->log)) { - error = ovsdb_file_compact(file); - if (error) { - char *s = ovsdb_error_to_string_free(error); - VLOG_WARN("%s: compacting database failed (%s), retrying in " - "%d seconds", - file->file_name, s, COMPACT_RETRY_MSEC / 1000); - free(s); - - file->next_compact = time_msec() + COMPACT_RETRY_MSEC; - } - } - - return NULL; -} - -/* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if - * successful, otherwise an ovsdb_error that the caller must destroy. */ -static struct ovsdb_error * OVS_WARN_UNUSED_RESULT -ovsdb_rename(const char *old, const char *new) -{ -#ifdef _WIN32 - int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING - | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED) - ? 0 : EACCES); -#else - int error = rename(old, new) ? errno : 0; -#endif - - return (error - ? 
ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"", - old, new) - : NULL); -} - -struct ovsdb_error * -ovsdb_file_compact(struct ovsdb_file *file) +struct json * +ovsdb_file_txn_annotate(struct json *json, const char *comment) { - struct ovsdb_log *new_log = NULL; - struct lockfile *tmp_lock = NULL; - struct ovsdb_error *error; - char *tmp_name = NULL; - char *comment = NULL; - int retval; - - comment = xasprintf("compacting database online " - "(%.3f seconds old, %u transactions)", - (time_wall_msec() - file->last_compact) / 1000.0, - file->n_transactions); - VLOG_INFO("%s: %s", file->file_name, comment); - - /* Commit the old version, so that we can be assured that we'll eventually - * have either the old or the new version. */ - error = ovsdb_log_commit_block(file->log); - if (error) { - goto exit; - } - - /* Lock temporary file. */ - tmp_name = xasprintf("%s.tmp", file->file_name); - retval = lockfile_lock(tmp_name, &tmp_lock); - if (retval) { - error = ovsdb_io_error(retval, "could not get lock on %s", tmp_name); - goto exit; - } - - /* Remove temporary file. (It might not exist.) */ - if (unlink(tmp_name) < 0 && errno != ENOENT) { - error = ovsdb_io_error(errno, "failed to remove %s", tmp_name); - goto exit; - } - - /* Save a copy. */ - error = ovsdb_file_save_copy__(tmp_name, false, comment, file->db, - &new_log); - if (error) { - goto exit; - } - ovsdb_log_mark_base(new_log); - - /* Replace original file by the temporary file. - * - * We support two strategies: - * - * - The preferred strategy is to rename the temporary file over the - * original one in-place, then close the original one. This works on - * Unix-like systems. It does not work on Windows, which does not - * allow open files to be renamed. The approach has the advantage - * that, at any point, we can drop back to something that already - * works. - * - * - Alternatively, we can close both files, rename, then open the new - * file (which now has the original name). 
This works on all - * systems, but if reopening the file fails then we're stuck and have - * to abort (XXX although it would be better to retry). - * - * We make the strategy a variable instead of an #ifdef to make it easier - * to test both strategies on Unix-like systems, and to make the code - * easier to read. */ -#ifdef _WIN32 - bool rename_open_files = false; -#else - bool rename_open_files = true; -#endif - if (!rename_open_files) { - ovsdb_log_close(file->log); - ovsdb_log_close(new_log); - file->log = NULL; - new_log = NULL; - } - error = ovsdb_rename(tmp_name, file->file_name); - if (error) { - goto exit; - } - if (rename_open_files) { - fsync_parent_dir(file->file_name); - ovsdb_log_close(file->log); - file->log = new_log; - } else { - /* Re-open the log. This skips past the schema log record. */ - error = ovsdb_file_open_log(file->file_name, OVSDB_LOG_READ_WRITE, - &file->log, NULL); - if (error) { - ovs_fatal(0, "could not reopen database"); - } - - /* Skip past the data log reecord. */ - struct json *json; - error = ovsdb_log_read(file->log, &json); - if (error) { - ovs_fatal(0, "error reading database"); - } - json_destroy(json); - ovsdb_log_mark_base(file->log); + if (!json) { + json = json_object_create(); } - - /* Success! 
*/ - file->last_compact = time_msec(); - file->next_compact = file->last_compact + COMPACT_MIN_MSEC; - file->n_transactions = 1; - -exit: - if (error) { - ovsdb_log_close(new_log); - if (tmp_lock) { - unlink(tmp_name); - } + if (comment) { + json_object_put_string(json, "_comment", comment); } - - lockfile_unlock(tmp_lock); - free(tmp_name); - free(comment); - - return error; -} - -void -ovsdb_file_destroy(struct ovsdb_file *file) -{ - ovsdb_log_close(file->log); - free(file->file_name); - free(file); + json_object_put(json, "_date", json_integer_create(time_wall_msec())); + return json; } static void @@ -830,138 +416,59 @@ ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, json_object_put(ftxn->table_json, uuid, row); } } - -static struct ovsdb_error * -ovsdb_file_txn_commit(struct json *json, const char *comment, - bool durable, struct ovsdb_log *log) -{ - struct ovsdb_error *error; - - json = ovsdb_file_txn_annotate(json, comment); - error = ovsdb_log_write(log, json); - json_destroy(json); - if (error) { - return ovsdb_wrap_error(error, "writing transaction failed"); - } - - if (durable) { - error = ovsdb_log_commit_block(log); - if (error) { - return ovsdb_wrap_error(error, "committing transaction failed"); - } - } - - return NULL; -} -static struct ovsdb_error * OVS_WARN_UNUSED_RESULT -ovsdb_convert_table(struct ovsdb_txn *txn, - const struct ovsdb_table *src_table, - struct ovsdb_table *dst_table) +static struct ovsdb * +ovsdb_file_read__(const char *filename, bool rw, + struct ovsdb_schema *new_schema) { - const struct ovsdb_row *src_row; - HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) { - struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); - *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); - - struct shash_node *node; - SHASH_FOR_EACH (node, &src_table->schema->columns) { - const struct ovsdb_column *src_column = node->data; - if (src_column->index == OVSDB_COL_UUID || - src_column->index == OVSDB_COL_VERSION) { - continue; - } 
- - const struct ovsdb_column *dst_column - = shash_find_data(&dst_table->schema->columns, - src_column->name); - if (!dst_column) { - continue; - } - - struct ovsdb_error *error = ovsdb_datum_convert( - &dst_row->fields[dst_column->index], &dst_column->type, - &src_row->fields[src_column->index], &src_column->type); - if (error) { - ovsdb_row_destroy(dst_row); - return error; - } + struct ovsdb_storage *storage = ovsdb_storage_open_standalone(filename, + rw); + struct ovsdb_schema *schema = ovsdb_storage_read_schema(storage); + if (new_schema) { + ovsdb_schema_destroy(schema); + schema = new_schema; + } + struct ovsdb *ovsdb = ovsdb_create(schema, storage); + for (;;) { + /* Read a transaction. Bail if end-of-file. */ + struct json *txn_json; + struct ovsdb_schema *schema; + struct ovsdb_error *error = ovsdb_storage_read(storage, &schema, + &txn_json, NULL); + if (error) { + ovs_fatal(0, "%s", ovsdb_error_to_string_free(error)); } - - ovsdb_txn_row_insert(txn, dst_row); - } - return NULL; -} - -struct ovsdb_error * OVS_WARN_UNUSED_RESULT -ovsdb_file_convert(const struct ovsdb_file *file, - const struct ovsdb_schema *new_schema) -{ - struct ovsdb *new_db = ovsdb_create(ovsdb_schema_clone(new_schema)); - struct ovsdb_txn *txn = ovsdb_txn_create(new_db); - struct ovsdb_error *error = NULL; - - struct shash_node *node; - SHASH_FOR_EACH (node, &file->db->tables) { - const char *table_name = node->name; - const struct ovsdb_table *src_table = node->data; - struct ovsdb_table *dst_table = shash_find_data(&new_db->tables, - table_name); - if (!dst_table) { - continue; + ovs_assert(!schema); + if (!txn_json) { + break; } - error = ovsdb_convert_table(txn, src_table, dst_table); + /* Apply transaction to database. 
*/ + struct ovsdb_txn *txn; + error = ovsdb_file_txn_from_json(ovsdb, txn_json, new_schema != NULL, + &txn); if (error) { - goto error; + ovs_fatal(0, "%s", ovsdb_error_to_string_free(error)); } - } - - error = ovsdb_txn_start_commit(txn); - if (error) { - goto error; - } - - struct ovsdb_log *new; - error = ovsdb_log_replace_start(file->log, &new); - if (error) { - goto error; - } - - /* Write schema. */ - struct json *schema_json = ovsdb_schema_to_json(new_schema); - error = ovsdb_log_write(new, schema_json); - json_destroy(schema_json); - if (error) { - goto error; - } - - /* Write data. */ - struct json *txn_json = ovsdb_file_txn_to_json(txn); - if (txn_json) { - error = ovsdb_log_write(new, txn_json); json_destroy(txn_json); + + error = ovsdb_txn_replay_commit(txn); if (error) { - goto error; + ovsdb_storage_unread(storage); + break; } } + return ovsdb; +} - error = ovsdb_log_replace_commit(file->log, new); - if (error) { - goto error; - } - - error = ovsdb_txn_finish_commit(txn, true); - ovs_assert(!error); /* Can't happen. 
*/ - - ovsdb_replace(file->db, new_db); - - return NULL; +struct ovsdb * +ovsdb_file_read(const char *filename, bool rw) +{ + return ovsdb_file_read__(filename, rw, NULL); +} -error: - ovsdb_destroy(new_db); - if (txn) { - ovsdb_txn_abort(txn); - } - return error; +struct ovsdb * +ovsdb_file_read_as_schema(const char *filename, struct ovsdb_schema *schema) +{ + return ovsdb_file_read__(filename, false, schema); } diff --git a/ovsdb/file.h b/ovsdb/file.h index bc9b32cf6c33..40833a4d4f09 100644 --- a/ovsdb/file.h +++ b/ovsdb/file.h @@ -18,41 +18,27 @@ #include #include "compiler.h" -#include "log.h" struct ovsdb; -struct ovsdb_file; struct ovsdb_schema; struct ovsdb_txn; -struct ovsdb_error *ovsdb_file_open(const char *file_name, bool read_only, - struct ovsdb **, struct ovsdb_file **) - OVS_WARN_UNUSED_RESULT; - -struct ovsdb_error *ovsdb_file_open_as_schema(const char *file_name, - const struct ovsdb_schema *, - struct ovsdb **) - OVS_WARN_UNUSED_RESULT; - -struct ovsdb_error *ovsdb_file_save_copy(const char *file_name, int locking, - const char *comment, - const struct ovsdb *) - OVS_WARN_UNUSED_RESULT; - -struct ovsdb_error *ovsdb_file_compact(struct ovsdb_file *); - -struct ovsdb_error *ovsdb_file_read_schema(const char *file_name, - struct ovsdb_schema **) +struct json *ovsdb_to_txn_json(const struct ovsdb *, const char *comment); +struct json *ovsdb_file_txn_to_json(const struct ovsdb_txn *); +struct json *ovsdb_file_txn_annotate(struct json *, const char *comment); +struct ovsdb_error *ovsdb_file_txn_from_json(struct ovsdb *, + const struct json *, + bool converting, + struct ovsdb_txn **) OVS_WARN_UNUSED_RESULT; -struct ovsdb_error *ovsdb_file_commit(struct ovsdb_file *, - const struct ovsdb_txn *, bool durable); -void ovsdb_file_destroy(struct ovsdb_file *); - -struct json *ovsdb_file_txn_annotate(struct json *, const char *comment); +struct ovsdb *ovsdb_file_read(const char *filename, bool rw); +struct ovsdb *ovsdb_file_read_as_schema(const char *filename, 
+ struct ovsdb_schema *); -struct ovsdb_error *ovsdb_file_convert(const struct ovsdb_file *, - const struct ovsdb_schema *) +struct ovsdb_error *ovsdb_convert(const struct ovsdb *src, + const struct ovsdb_schema *new_schema, + struct ovsdb **dstp) OVS_WARN_UNUSED_RESULT; #endif /* ovsdb/file.h */ diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index df268cd4eedc..0df08da341be 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -34,6 +34,7 @@ #include "row.h" #include "server.h" #include "simap.h" +#include "storage.h" #include "stream.h" #include "table.h" #include "timeval.h" @@ -65,7 +66,8 @@ static void ovsdb_jsonrpc_session_get_memory_usage_all( const struct ovsdb_jsonrpc_remote *, struct simap *usage); static void ovsdb_jsonrpc_session_close_all(struct ovsdb_jsonrpc_remote *); static void ovsdb_jsonrpc_session_reconnect_all(struct ovsdb_jsonrpc_remote *, - bool force); + bool force, + const char *comment); static void ovsdb_jsonrpc_session_set_all_options( struct ovsdb_jsonrpc_remote *, const struct ovsdb_jsonrpc_options *); static bool ovsdb_jsonrpc_active_session_get_status( @@ -164,14 +166,18 @@ ovsdb_jsonrpc_server_create(bool read_only) bool ovsdb_jsonrpc_server_add_db(struct ovsdb_jsonrpc_server *svr, struct ovsdb *db) { - ovsdb_jsonrpc_server_reconnect(svr, false); + ovsdb_jsonrpc_server_reconnect( + svr, false, xasprintf("adding %s database", db->name)); return ovsdb_server_add_db(&svr->up, db); } -/* Removes 'db' from the set of databases served out by 'svr'. */ +/* Removes 'db' from the set of databases served out by 'svr'. + * + * 'comment' should be a human-readable reason for removing the database. This + * function frees it. 
*/ void ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *svr, - struct ovsdb *db) + struct ovsdb *db, char *comment) { struct shash_node *node; SHASH_FOR_EACH (node, &svr->remotes) { @@ -180,7 +186,7 @@ ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *svr, ovsdb_jsonrpc_session_preremove_db(remote, db); } - ovsdb_jsonrpc_server_reconnect(svr, false); + ovsdb_jsonrpc_server_reconnect(svr, false, comment); ovsdb_server_remove_db(&svr->up, db); } @@ -332,20 +338,24 @@ ovsdb_jsonrpc_server_free_remote_status( } /* Makes all of the JSON-RPC sessions managed by 'svr' to disconnect. (They - * will then generally reconnect.). + * will then generally reconnect.). Uses 'comment' as a human-readable comment + * for logging. Frees 'comment'. * * If 'force' is true, disconnects all sessions. Otherwise, disconnects only * sessions that aren't database change aware. */ void -ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr, bool force) +ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *svr, bool force, + char *comment) { struct shash_node *node; SHASH_FOR_EACH (node, &svr->remotes) { struct ovsdb_jsonrpc_remote *remote = node->data; - ovsdb_jsonrpc_session_reconnect_all(remote, force); + ovsdb_jsonrpc_session_reconnect_all(remote, force, comment); } + + free(comment); } bool @@ -360,7 +370,10 @@ ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *svr, { if (svr->read_only != read_only) { svr->read_only = read_only; - ovsdb_jsonrpc_server_reconnect(svr, false); + ovsdb_jsonrpc_server_reconnect(svr, false, + xstrdup(read_only + ? "making server read-only" + : "making server read/write")); } } @@ -636,19 +649,24 @@ ovsdb_jsonrpc_session_close_all(struct ovsdb_jsonrpc_remote *remote) } /* Makes all of the JSON-RPC sessions managed by 'remote' to disconnect. (They - * will then generally reconnect.). + * will then generally reconnect.).
'comment' should be a human-readable + * explanation of the reason for disconnection, for use in log messages. * * If 'force' is true, disconnects all sessions. Otherwise, disconnects only * sessions that aren't database change aware. */ static void ovsdb_jsonrpc_session_reconnect_all(struct ovsdb_jsonrpc_remote *remote, - bool force) + bool force, const char *comment) { struct ovsdb_jsonrpc_session *s, *next; LIST_FOR_EACH_SAFE (s, next, node, &remote->sessions) { if (force || !s->db_change_aware) { jsonrpc_session_force_reconnect(s->js); + if (jsonrpc_session_is_connected(s->js)) { + VLOG_INFO("%s: disconnecting (%s)", + jsonrpc_session_get_name(s->js), comment); + } if (!jsonrpc_session_is_alive(s->js)) { ovsdb_jsonrpc_session_close(s); } @@ -770,6 +788,15 @@ ovsdb_jsonrpc_lookup_db(const struct ovsdb_jsonrpc_session *s, goto error; } + if (!db->schema) { + error = ovsdb_error("database not available", + "%s request specifies database %s which is not " + "yet available because it has not completed " + "joining its cluster", + request->method, db_name); + goto error; + } + *replyp = NULL; return db; @@ -1099,7 +1126,10 @@ ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, } if (disconnect_all) { - ovsdb_jsonrpc_server_reconnect(s->remote->server, false); + ovsdb_jsonrpc_server_reconnect(s->remote->server, false, + xasprintf("committed %s database " + "schema conversion", + db->name)); } } @@ -1126,14 +1156,15 @@ ovsdb_jsonrpc_trigger_complete(struct ovsdb_jsonrpc_trigger *t) s = CONTAINER_OF(t->trigger.session, struct ovsdb_jsonrpc_session, up); if (jsonrpc_session_is_connected(s->js)) { - struct jsonrpc_msg *reply; - - reply = ovsdb_trigger_steal_reply(&t->trigger); - if (!reply) { - reply = jsonrpc_create_error(json_string_create("canceled"), - t->id); + bool complete = ovsdb_trigger_is_complete(&t->trigger); + if (s->db_change_aware && !complete) { + ovsdb_trigger_cancel(&t->trigger, "closing JSON-RPC session"); + complete = true; +
} + if (complete) { + struct jsonrpc_msg *reply = ovsdb_trigger_steal_reply(&t->trigger); + ovsdb_jsonrpc_session_send(s, reply); } - ovsdb_jsonrpc_session_send(s, reply); } json_destroy(t->id); diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index 0fc16f21b2d9..76ae1b5d607a 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -28,7 +28,7 @@ struct ovsdb_jsonrpc_server *ovsdb_jsonrpc_server_create(bool read_only); bool ovsdb_jsonrpc_server_add_db(struct ovsdb_jsonrpc_server *, struct ovsdb *); void ovsdb_jsonrpc_server_remove_db(struct ovsdb_jsonrpc_server *, - struct ovsdb *); + struct ovsdb *, char *comment); void ovsdb_jsonrpc_server_destroy(struct ovsdb_jsonrpc_server *); /* Options for a remote. */ @@ -64,7 +64,8 @@ bool ovsdb_jsonrpc_server_get_remote_status( void ovsdb_jsonrpc_server_free_remote_status( struct ovsdb_jsonrpc_remote_status *); -void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool force); +void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool force, + char *comment); void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *); void ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *); diff --git a/ovsdb/log.c b/ovsdb/log.c index cc4bc2c6243e..aacd55bb16f3 100644 --- a/ovsdb/log.c +++ b/ovsdb/log.c @@ -645,6 +645,14 @@ ovsdb_log_write(struct ovsdb_log *file, const struct json *json) return NULL; } +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_log_write_and_free(struct ovsdb_log *log, struct json *json) +{ + struct ovsdb_error *error = ovsdb_log_write(log, json); + json_destroy(json); + return error; +} + /* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail. * Returns NULL if successful, otherwise the error that occurred. 
*/ struct ovsdb_error * diff --git a/ovsdb/log.h b/ovsdb/log.h index bd0396f27ea8..90714ea13190 100644 --- a/ovsdb/log.h +++ b/ovsdb/log.h @@ -71,6 +71,8 @@ void ovsdb_log_compose_record(const struct json *, const char *magic, struct ovsdb_error *ovsdb_log_write(struct ovsdb_log *, const struct json *) OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *ovsdb_log_write_and_free(struct ovsdb_log *, struct json *) + OVS_WARN_UNUSED_RESULT; uint64_t ovsdb_log_commit_start(struct ovsdb_log *); uint64_t ovsdb_log_commit_progress(struct ovsdb_log *); diff --git a/ovsdb/ovsdb-client.1.in b/ovsdb/ovsdb-client.1.in index 727e9c6e0651..d0763b13facd 100644 --- a/ovsdb/ovsdb-client.1.in +++ b/ovsdb/ovsdb-client.1.in @@ -48,6 +48,8 @@ ovsdb\-client \- command-line interface to \fBovsdb-server\fR(1) .br \fBovsdb\-client\fR [\fIoptions\fR] \fBmonitor\-cond\fR [\fIserver\fR] [\fIdatabase\fR] \fIconditions \fItable\fR [\fIcolumn\fR[\fB,\fIcolumn\fR]...]... +.br +\fBovsdb\-client \fR[\fIoptions\fR] \fBwait\fR \fR[\fIserver\fR] \fIdatabase\fR \fIstate\fR .IP "Testing Commands:" \fBovsdb\-client\fR [\fIoptions\fR] \fBlock\fR [\fIserver\fR] \fIlock\fR .br @@ -57,6 +59,8 @@ ovsdb\-client \- command-line interface to \fBovsdb-server\fR(1) .br .IP "Other Commands:" \fBovsdb\-client help\fR +.IP "Cluster Options:" +[\fB\-\-no\-leader\-only\fR] .IP "Output formatting options:" [\fB\-\-format=\fIformat\fR] [\fB\-\-data=\fIformat\fR] @@ -80,6 +84,14 @@ be an OVSDB active or passive connection method, as described in and the default \fIdatabase\fR is \fBOpen_vSwitch\fR. .PP +\fBovsdb\-client\fR supports the +\fImethod1\fB,\fImethod2\fB,\fR...\fB,\fImethodN\fR syntax described +in \fBovsdb\fR(7) for connecting to a cluster. When this syntax is +used, \fBovsdb\-client\fR tries the cluster members in random order +until it finds the cluster leader. Specify the +\fB\-\-no\-leader\-only\fR option to instead accept any server that is +connected to the cluster. 
+.PP For an introduction to OVSDB and its implementation in Open vSwitch, see \fBovsdb\fR(7). .PP @@ -197,12 +209,12 @@ is specified, only those columns are retrieved. .IP "\fBbackup\fR [\fIserver\fR] [\fIdatabase\fR] \fB> \fIsnapshot\fR" Connects to \fIserver\fR, retrieves a snapshot of the schema and data in \fIdatabase\fR, and prints it on stdout in the format used for -OVSDB database files. This is an appropriate +OVSDB standalone and active-backup databases. This is an appropriate way to back up any remote database. The database snapshot that it outputs is suitable to be served up directly by \fBovsdb\-server\fR or used as the input to \fBovsdb\-client restore\fR. .IP -Another way to back up a is to +Another way to back up a standalone or active-backup database is to copy its database file, e.g. with \fBcp\fR. This is safe even if the database is in use. .IP @@ -220,8 +232,8 @@ transaction. .IP UUIDs for rows in the restored database will differ from those in \fIsnapshot\fR, because the OVSDB protocol does not allow clients to -specify row UUIDs. Another way to restore a database, -which does also restore row UUIDs, is to stop +specify row UUIDs. Another way to restore a standalone or active-backup +database, which does also restore row UUIDs, is to stop the server or servers, replace the database file by the snapshot, then restart the database. Either way, ephemeral columns are not restored, since by design they do not survive across restarts of @@ -279,6 +291,31 @@ prints the initial database contents. The \fBmonitor\fR command uses RFC 7047 "monitor" method to open a monitor session with the server. . +.IP "\fBwait\fR \fR[\fIserver\fR] \fIdatabase state\fR" +Waits for \fIdatabase\fR on \fIserver\fR to enter a desired \fIstate\fR, +which may be one of: +.RS +.IP "\fBadded\fR" +Waits until a database with the given name has been added to +\fIserver\fR. +.IP "\fBconnected\fR" +Waits until a database with the given name has been added to +\fIserver\fR. 
Then, if \fIdatabase\fR is clustered, additionally +waits until it has joined and connected to its cluster. +.IP "\fBremoved\fR" +Waits until \fIdatabase\fR has been removed from the database server. +This can also be used to wait for a database to complete leaving its +cluster, because \fBovsdb\-server\fR removes a database at that point. +.RE +.IP +\fIdatabase\fR is mandatory for this command because it is often used +to check for databases that have not yet been added to the server, so +that the \fBovsdb\-client\fR semantics of acting on a default database +do not work. +.IP +This command acts on a particular database server, not on a cluster, +so \fIserver\fR must name a single server, not a comma-delimited list +of servers. .SS "Testing commands" These commands are mostly of interest for testing the correctness of the OVSDB server. diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index b00f04147d39..de23cc14bbb0 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -40,13 +40,16 @@ #include "ovsdb.h" #include "ovsdb-data.h" #include "ovsdb-error.h" +#include "ovsdb-session.h" #include "openvswitch/poll-loop.h" #include "row.h" #include "sort.h" #include "svec.h" +#include "storage.h" #include "stream.h" #include "stream-ssl.h" #include "table.h" +#include "transaction.h" #include "monitor.h" #include "condition.h" #include "timeval.h" @@ -89,23 +92,155 @@ static int db_change_aware = -1; /* --force: Ignore schema differences for "restore" command? */ static bool force; +/* --leader-only, --no-leader-only: Only accept the leader in a cluster. */ +static bool leader_only = true; + /* Format for table output. 
*/ static struct table_style table_style = TABLE_STYLE_DEFAULT; static const struct ovsdb_client_command *get_all_commands(void); +static struct json *parse_json(const char *); + OVS_NO_RETURN static void usage(void); static void parse_options(int argc, char *argv[]); static struct jsonrpc *open_jsonrpc(const char *server); static void fetch_dbs(struct jsonrpc *, struct svec *dbs); +static bool should_stay_connected(const char *server, const char *database, + const struct uuid *cid, + const struct jsonrpc_msg *reply); +struct jsonrpc_msg *create_database_info_request(const char *database); + +static char * +default_remote(void) +{ + return xasprintf("unix:%s/db.sock", ovs_rundir()); +} + +static int +open_rpc(int min_args, enum args_needed need, + int argc, char *argv[], struct jsonrpc **rpcp, char **databasep) +{ + struct svec remotes = SVEC_EMPTY_INITIALIZER; + struct uuid cid = UUID_ZERO; + + /* First figure out the remote(s). If the first command-line argument has + * the form of a remote, use it, otherwise use the default. */ + int argidx = 0; + if (argc > min_args && (isalpha((unsigned char) argv[0][0]) + && strchr(argv[0], ':'))) { + ovsdb_session_parse_remote(argv[argidx++], &remotes, &cid); + } else { + svec_add_nocopy(&remotes, default_remote()); + } + + /* Handle the case where there's one remote. In this case, if we need a + * database name, we try to figure out a default if none was specified + * explicitly. 
*/ + char *database = *databasep; + if (remotes.n == 1) { + struct jsonrpc *rpc = open_jsonrpc(remotes.names[0]); + svec_destroy(&remotes); + + if (need == NEED_DATABASE && !database) { + struct svec dbs; + + svec_init(&dbs); + fetch_dbs(rpc, &dbs); + if (argc - argidx > min_args + && svec_contains(&dbs, argv[argidx])) { + database = xstrdup(argv[argidx++]); + } else if (svec_contains(&dbs, "Open_vSwitch")) { + database = xstrdup("Open_vSwitch"); + } else { + size_t n = 0; + const char *best = NULL; + for (size_t i = 0; i < dbs.n; i++) { + if (dbs.names[i][0] != '_') { + best = dbs.names[i]; + n++; + } + } + if (n != 1) { + jsonrpc_close(rpc); + ovs_fatal(0, "could not find a default database, " + "please specify a database name"); + } + database = xstrdup(best); + } + svec_destroy(&dbs); + } + *rpcp = rpc; + *databasep = database; + + return argidx; + } + + /* If there's more than one remote, and we need a database name, then it + * must be specified explicitly. It's too likely to cause surprising + * behavior if we try to pick a default across several servers. */ + if (!database && need == NEED_DATABASE) { + if (argc - argidx > min_args) { + database = xstrdup(argv[argidx++]); + } else { + ovs_fatal(0, "database name is required with multiple remotes"); + } + } + + /* We have multiple remotes. Connect to them in a random order and choose + * the first one that is up and hosts the database we want (if any) in an + * acceptable state. 
*/ + struct jsonrpc_session *js = jsonrpc_session_open_multiple( + &remotes, false); + svec_destroy(&remotes); + + unsigned int seqno = 0; + struct json *id = NULL; + for (;;) { + jsonrpc_session_run(js); + if (!jsonrpc_session_is_alive(js)) { + ovs_fatal(0, "no servers were available"); + } + + if (seqno != jsonrpc_session_get_seqno(js) + && jsonrpc_session_is_connected(js)) { + if (!database) { + break; + } + + seqno = jsonrpc_session_get_seqno(js); + struct jsonrpc_msg *txn = create_database_info_request(database); + json_destroy(id); + id = json_clone(txn->id); + jsonrpc_session_send(js, txn); + } + + struct jsonrpc_msg *reply = jsonrpc_session_recv(js); + if (reply && id && reply->id && json_equal(id, reply->id)) { + if (reply->type == JSONRPC_REPLY + && should_stay_connected(jsonrpc_session_get_name(js), + database, &cid, reply)) { + break; + } + jsonrpc_session_force_reconnect(js); + } + jsonrpc_msg_destroy(reply); + + jsonrpc_session_recv_wait(js); + jsonrpc_session_wait(js); + poll_block(); + } + json_destroy(id); + + *rpcp = jsonrpc_session_steal(js); + *databasep = database; + return argidx; +} int main(int argc, char *argv[]) { const struct ovsdb_client_command *command; - char *database; - struct jsonrpc *rpc; - ovs_cmdl_proctitle_init(argc, argv); set_program_name(argv[0]); service_start(&argc, &argv); @@ -127,50 +262,13 @@ main(int argc, char *argv[]) } optind++; + char *database = NULL; + struct jsonrpc *rpc = NULL; if (command->need != NEED_NONE) { - if (argc - optind > command->min_args - && (isalpha((unsigned char) argv[optind][0]) - && strchr(argv[optind], ':'))) { - rpc = open_jsonrpc(argv[optind++]); - } else { - char *sock = xasprintf("unix:%s/db.sock", ovs_rundir()); - rpc = open_jsonrpc(sock); - free(sock); - } - } else { - rpc = NULL; + optind += open_rpc(command->min_args, command->need, + argc - optind, argv + optind, &rpc, &database); } - if (command->need == NEED_DATABASE) { - struct svec dbs; - - svec_init(&dbs); - fetch_dbs(rpc, 
&dbs); - if (argc - optind > command->min_args - && svec_contains(&dbs, argv[optind])) { - database = xstrdup(argv[optind++]); - } else if (svec_contains(&dbs, "Open_vSwitch")) { - database = xstrdup("Open_vSwitch"); - } else { - size_t n = 0; - const char *best = NULL; - for (size_t i = 0; i < dbs.n; i++) { - if (dbs.names[i][0] != '_') { - best = dbs.names[i]; - n++; - } - } - if (n != 1) { - jsonrpc_close(rpc); - ovs_fatal(0, "no default database for `%s' command, please " - "specify a database name", command->name); - } - database = xstrdup(best); - } - svec_destroy(&dbs); - } else { - database = NULL; - } if (argc - optind < command->min_args || argc - optind > command->max_args) { @@ -201,6 +299,8 @@ parse_options(int argc, char *argv[]) OPT_BOOTSTRAP_CA_CERT = UCHAR_MAX + 1, OPT_TIMESTAMP, OPT_FORCE, + OPT_LEADER_ONLY, + OPT_NO_LEADER_ONLY, VLOG_OPTION_ENUMS, DAEMON_OPTION_ENUMS, TABLE_OPTION_ENUMS, @@ -214,6 +314,8 @@ parse_options(int argc, char *argv[]) {"db-change-aware", no_argument, &db_change_aware, 1}, {"no-db-change-aware", no_argument, &db_change_aware, 0}, {"timeout", required_argument, NULL, 't'}, + {"leader-only", no_argument, NULL, OPT_LEADER_ONLY}, + {"no-leader-only", no_argument, NULL, OPT_NO_LEADER_ONLY}, VLOG_LONG_OPTIONS, DAEMON_LONG_OPTIONS, #ifdef HAVE_OPENSSL @@ -271,6 +373,14 @@ parse_options(int argc, char *argv[]) } break; + case OPT_LEADER_ONLY: + leader_only = true; + break; + + case OPT_NO_LEADER_ONLY: + leader_only = false; + break; + case '?': exit(EXIT_FAILURE); @@ -324,6 +434,9 @@ usage(void) " convert database on SERVER named in SCHEMA to SCHEMA.\n" "\n monitor [SERVER] [DATABASE] ALL\n" " monitor all changes to all columns in all tables\n" + "\n wait [SERVER] DATABASE STATE\n" + " wait until DATABASE reaches STATE " + "(\"added\" or \"connected\" or \"removed\")\n" " in DATBASE on SERVER.\n" "\n dump [SERVER] [DATABASE]\n" " dump contents of DATABASE on SERVER to stdout\n" @@ -471,6 +584,141 @@ fetch_dbs(struct jsonrpc 
*rpc, struct svec *dbs) jsonrpc_msg_destroy(reply); svec_sort(dbs); } + +static const char * +parse_string_column(const struct json *row, const char *column_name) +{ + const struct json *column = shash_find_data(json_object(row), column_name); + return column && column->type == JSON_STRING ? json_string(column) : ""; +} + +static int +parse_boolean_column(const struct json *row, const char *column_name) +{ + const struct json *column = shash_find_data(json_object(row), column_name); + return (!column ? -1 + : column->type == JSON_TRUE ? true + : column->type == JSON_FALSE ? false + : -1); +} + +static struct uuid +parse_uuid_column(const struct json *row, const char *column_name) +{ + const struct json *column = shash_find_data(json_object(row), column_name); + if (!column) { + return UUID_ZERO; + } + + struct ovsdb_type type = { OVSDB_BASE_UUID_INIT, OVSDB_BASE_VOID_INIT, + 0, 1 }; + struct ovsdb_datum datum; + struct ovsdb_error *error = ovsdb_datum_from_json(&datum, &type, column, + NULL); + if (error) { + ovsdb_error_destroy(error); + return UUID_ZERO; + } + struct uuid uuid = datum.n > 0 ? 
datum.keys[0].uuid : UUID_ZERO; + ovsdb_datum_destroy(&datum, &type); + return uuid; +} + +struct jsonrpc_msg * +create_database_info_request(const char *database) +{ + struct json *op = json_object_create(); + json_object_put_string(op, "op", "select"); + json_object_put_string(op, "table", "Database"); + struct json *condition = json_array_create_3( + json_string_create("name"), + json_string_create("=="), + json_string_create(database)); + json_object_put(op, "where", json_array_create_1(condition)); + struct json *txn = json_array_create_2( + json_string_create("_Server"), op); + return jsonrpc_create_request("transact", txn, NULL); +} + +static const struct json * +parse_database_info_reply(const struct jsonrpc_msg *reply, const char *server, + const char *database, const struct uuid *cid) +{ + const struct json *result = reply->result; + if (result->type != JSON_ARRAY + || result->u.array.n != 1 + || result->u.array.elems[0]->type != JSON_OBJECT) { + VLOG_WARN("%s: unexpected reply to _Server request for %s", + server, database); + return NULL; + } + + const struct json *op_result = result->u.array.elems[0]; + const struct json *rows = shash_find_data(json_object(op_result), "rows"); + if (!rows || rows->type != JSON_ARRAY) { + VLOG_WARN("%s: missing \"rows\" member in _Server reply for %s", + server, database); + return NULL; + } + + for (size_t i = 0; i < rows->u.array.n; i++) { + const struct json *row = rows->u.array.elems[i]; + if (row->type != JSON_OBJECT) { + VLOG_WARN("%s: bad row in _Server reply for %s", + server, database); + continue; + } + + if (strcmp(parse_string_column(row, "name"), database)) { + continue; + } + + if (cid && !uuid_is_zero(cid)) { + struct uuid cid2 = parse_uuid_column(row, "cid"); + if (!uuid_equals(cid, &cid2)) { + continue; + } + } + + return row; + } + + /* No such database. */ + return NULL; +} + +/* Parses 'reply', a JSON-RPC reply to our request asking for the status of + * 'database' on 'server'. 
Determines whether this server is acceptable for + * the transaction we want to make and returns true if so or false to disconnect + * and try a different server. */ +static bool +should_stay_connected(const char *server, const char *database, + const struct uuid *cid, const struct jsonrpc_msg *reply) +{ + const struct json *row = parse_database_info_reply(reply, server, + database, cid); + if (!row) { + /* No such database. */ + return false; + } + + if (strcmp(parse_string_column(row, "model"), "clustered")) { + /* Always accept standalone databases. */ + return true; + } + + if (!parse_boolean_column(row, "connected")) { + /* Reject disconnected servers. */ + return false; + } + + if (leader_only && !parse_boolean_column(row, "leader")) { + /* Reject if not leader. */ + return false; + } + + return true; +} static void do_list_dbs(struct jsonrpc *rpc, const char *database OVS_UNUSED, @@ -599,9 +847,19 @@ send_db_change_aware(struct jsonrpc *rpc) } static struct json * -do_transact__(struct jsonrpc *rpc, struct json *transaction) +do_transact__(int argc, char *argv[], struct json *transaction) { struct jsonrpc_msg *request, *reply; + if (transaction->type != JSON_ARRAY + || !transaction->u.array.n + || transaction->u.array.elems[0]->type != JSON_STRING) { + ovs_fatal(0, "not a valid OVSDB query"); + } + const char *db_name = json_string(transaction->u.array.elems[0]); + + struct jsonrpc *rpc; + char *database = CONST_CAST(char *, db_name); + open_rpc(1, NEED_DATABASE, argc, argv, &rpc, &database); if (db_change_aware == 1) { send_db_change_aware(rpc); @@ -619,17 +877,17 @@ do_transact__(struct jsonrpc *rpc, struct json *transaction) } static void -do_transact(struct jsonrpc *rpc, const char *database OVS_UNUSED, - int argc OVS_UNUSED, char *argv[]) +do_transact(struct jsonrpc *rpc OVS_UNUSED, const char *database OVS_UNUSED, + int argc, char *argv[]) { - print_and_free_json(do_transact__(rpc, parse_json(argv[0]))); + print_and_free_json(do_transact__(argc, argv, 
parse_json(argv[argc - 1]))); } static void -do_query(struct jsonrpc *rpc, const char *database OVS_UNUSED, - int argc OVS_UNUSED, char *argv[]) +do_query(struct jsonrpc *rpc OVS_UNUSED, const char *database OVS_UNUSED, + int argc, char *argv[]) { - struct json *transaction = parse_json(argv[0]); + struct json *transaction = parse_json(argv[argc - 1]); if (transaction->type != JSON_ARRAY) { ovs_fatal(0, "not a valid OVSDB query"); @@ -642,7 +900,7 @@ do_query(struct jsonrpc *rpc, const char *database OVS_UNUSED, size_t abort_idx = transaction->u.array.n - 2; /* Run query. */ - struct json *result = do_transact__(rpc, transaction); + struct json *result = do_transact__(argc, argv, transaction); /* If the "abort" operation ended the transaction, remove its result. */ if (result->type == JSON_ARRAY @@ -1267,12 +1525,33 @@ do_monitor_cond(struct jsonrpc *rpc, const char *database, ovsdb_schema_destroy(schema); } +static bool +is_database_clustered(struct jsonrpc *rpc, const char *database) +{ + struct jsonrpc_msg *reply; + check_txn(jsonrpc_transact_block(rpc, + create_database_info_request(database), + &reply), &reply); + + const struct json *row = parse_database_info_reply( + reply, jsonrpc_get_name(rpc), database, NULL); + return !strcmp(parse_string_column(row, "model"), "clustered"); +} + static void -do_convert(struct jsonrpc *rpc, const char *database OVS_UNUSED, - int argc OVS_UNUSED, char *argv[]) +do_convert(struct jsonrpc *rpc, const char *database_ OVS_UNUSED, + int argc, char *argv[]) { + const char *schema_file_name = argv[argc - 1]; struct ovsdb_schema *new_schema; - check_ovsdb_error(ovsdb_schema_from_file(argv[0], &new_schema)); + check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &new_schema)); + + char *database = new_schema->name; + open_rpc(1, NEED_DATABASE, argc, argv, &rpc, &database); + + if (is_database_clustered(rpc, database)) { + ovsdb_schema_persist_ephemeral_columns(new_schema, schema_file_name); + } struct jsonrpc_msg *request, 
*reply; request = jsonrpc_create_request( @@ -1284,12 +1563,19 @@ do_convert(struct jsonrpc *rpc, const char *database OVS_UNUSED, } static void -do_needs_conversion(struct jsonrpc *rpc, const char *database OVS_UNUSED, +do_needs_conversion(struct jsonrpc *rpc, const char *database_ OVS_UNUSED, int argc OVS_UNUSED, char *argv[]) { struct ovsdb_schema *schema1; check_ovsdb_error(ovsdb_schema_from_file(argv[0], &schema1)); + char *database = schema1->name; + open_rpc(1, NEED_DATABASE, argc, argv, &rpc, &database); + + if (is_database_clustered(rpc, database)) { + ovsdb_schema_persist_ephemeral_columns(schema1, argv[0]); + } + struct ovsdb_schema *schema2 = fetch_schema(rpc, schema1->name); puts(ovsdb_schema_equal(schema1, schema2) ? "no" : "yes"); ovsdb_schema_destroy(schema1); @@ -1673,6 +1959,25 @@ do_backup(struct jsonrpc *rpc, const char *database, } static void +check_transaction_reply(struct jsonrpc_msg *reply) +{ + if (reply->result->type != JSON_ARRAY) { + ovs_fatal(0, "result is not array"); + } + for (size_t i = 0; i < json_array(reply->result)->n; i++) { + struct json *json = json_array(reply->result)->elems[i]; + if (json->type != JSON_OBJECT) { + ovs_fatal(0, "result array element is not object"); + } + struct shash *object = json_object(json); + if (shash_find(object, "error")) { + ovs_fatal(0, "server returned error reply: %s", + json_to_string(json, JSSF_SORT)); + } + } +} + +static void do_restore(struct jsonrpc *rpc, const char *database, int argc OVS_UNUSED, char *argv[] OVS_UNUSED) { @@ -1681,21 +1986,21 @@ do_restore(struct jsonrpc *rpc, const char *database, "please redirect stdin from a file"); } - struct ovsdb *backup; - check_ovsdb_error(ovsdb_file_open("/dev/stdin", true, &backup, NULL)); + struct ovsdb *backup = ovsdb_file_read("/dev/stdin", false); + ovsdb_storage_close(backup->storage); + backup->storage = NULL; - const struct ovsdb_schema *schema = backup->schema; - struct ovsdb_schema *schema2 = fetch_schema(rpc, database); - if 
(!ovsdb_schema_equal(schema, schema2)) { + struct ovsdb_schema *online_schema = fetch_schema(rpc, database); + if (!ovsdb_schema_equal(backup->schema, online_schema)) { struct ds s = DS_EMPTY_INITIALIZER; - if (strcmp(schema->version, schema2->version)) { + if (strcmp(backup->schema->version, online_schema->version)) { ds_put_format(&s, "backup schema has version \"%s\" but " "database schema has version \"%s\"", - schema->version, schema2->version); + backup->schema->version, online_schema->version); } else { ds_put_format(&s, "backup schema and database schema are " "both version %s but still differ", - schema->version); + backup->schema->version); } if (!force) { ovs_fatal(0, "%s (use --force to override differences, or " @@ -1707,7 +2012,7 @@ do_restore(struct jsonrpc *rpc, const char *database, } struct json *txn = json_array_create_empty(); - json_array_add(txn, json_string_create(schema->name)); + json_array_add(txn, json_string_create(backup->schema->name)); struct shash_node *node; SHASH_FOR_EACH (node, &backup->tables) { const char *table_name = node->name; @@ -1749,20 +2054,7 @@ do_restore(struct jsonrpc *rpc, const char *database, struct jsonrpc_msg *rq = jsonrpc_create_request("transact", txn, NULL); struct jsonrpc_msg *reply; check_txn(jsonrpc_transact_block(rpc, rq, &reply), &reply); - if (reply->result->type != JSON_ARRAY) { - ovs_fatal(0, "result is not array"); - } - for (size_t i = 0; i < json_array(reply->result)->n; i++) { - struct json *json = json_array(reply->result)->elems[i]; - if (json->type != JSON_OBJECT) { - ovs_fatal(0, "result array element is not object"); - } - struct shash *object = json_object(json); - if (shash_find(object, "error")) { - ovs_fatal(0, "server returned error reply: %s", - json_to_string(json, JSSF_SORT)); - } - } + check_transaction_reply(reply); jsonrpc_msg_destroy(reply); } @@ -1959,13 +2251,134 @@ do_lock_unlock(struct jsonrpc *rpc, const char *database OVS_UNUSED, do_lock(rpc, "unlock", argv[0]); } -/* All 
command handlers (except for "help") are expected to take an optional - * server socket name (e.g. "unix:...") as their first argument. The socket - * name argument must be included in max_args (but left out of min_args). The - * command name and socket name are not included in the arguments passed to the - * handler: the argv[0] passed to the handler is the first argument after the - * optional server socket name. The connection to the server is available as - * global variable 'rpc'. */ +enum ovsdb_client_wait_type { + WAIT_CONNECTED, + WAIT_ADDED, + WAIT_REMOVED +}; + +static struct jsonrpc_msg * +compose_wait_transaction(enum ovsdb_client_wait_type type, + const char *database) +{ + struct json *txn = json_array_create_empty(); + json_array_add(txn, json_string_create("_Server")); + + struct json *op = json_object_create(); + json_array_add(txn, op); + json_object_put_string(op, "op", "wait"); + json_object_put_string(op, "table", "Database"); + json_object_put(op, "where", + json_array_create_1( + json_array_create_3( + json_string_create("name"), + json_string_create("=="), + json_string_create(database)))); + + if (type == WAIT_CONNECTED) { + /* Wait until connected == true. */ + json_object_put(op, "columns", + json_array_create_1(json_string_create("connected"))); + json_object_put_string(op, "until", "=="); + + struct json *row = json_object_create(); + json_object_put(row, "connected", json_boolean_create(true)); + json_object_put(op, "rows", json_array_create_1(row)); + } else { + ovs_assert(type == WAIT_ADDED || type == WAIT_REMOVED); + + /* Wait until such a row exists, or not, respectively. */ + json_object_put(op, "columns", json_array_create_empty()); + json_object_put_string(op, "until", "=="); + json_object_put(op, "rows", + (type == WAIT_ADDED + ? 
json_array_create_1(json_object_create()) + : json_array_create_empty())); + } + return jsonrpc_create_request("transact", txn, NULL); +} + +static void +do_wait(struct jsonrpc *rpc_unused OVS_UNUSED, + const char *database_unused OVS_UNUSED, + int argc, char *argv[]) +{ + vlog_set_levels(NULL, VLF_CONSOLE, VLL_WARN); + vlog_set_levels_from_string_assert("reconnect:err"); + vlog_set_levels_from_string_assert("jsonrpc:err"); + + const char *database = argv[argc - 2]; + const char *state = argv[argc - 1]; + + enum ovsdb_client_wait_type type; + if (!strcmp(state, "connected")) { + type = WAIT_CONNECTED; + } else if (!strcmp(state, "added")) { + type = WAIT_ADDED; + } else if (!strcmp(state, "removed")) { + type = WAIT_REMOVED; + } else { + ovs_fatal(0, "%s: unknown state", state); + } + + char *remote = argc > 2 ? xstrdup(argv[0]) : default_remote(); + struct jsonrpc_session *js = jsonrpc_session_open(remote, true); + free(remote); + + unsigned int seqno = 0; + struct json *sdca_id = NULL; + struct json *txn_id = NULL; + for (;;) { + jsonrpc_session_run(js); + + if (seqno != jsonrpc_session_get_seqno(js) + && jsonrpc_session_is_connected(js)) { + seqno = jsonrpc_session_get_seqno(js); + + /* Send set_db_change_aware request. */ + struct jsonrpc_msg *rq = jsonrpc_create_request( + "set_db_change_aware", + json_array_create_1(json_boolean_create(true)), + NULL); + json_destroy(sdca_id); + sdca_id = json_clone(rq->id); + jsonrpc_session_send(js, rq); + + /* Send transaction. 
*/ + rq = compose_wait_transaction(type, database); + json_destroy(txn_id); + txn_id = json_clone(rq->id); + jsonrpc_session_send(js, rq); + } + + struct jsonrpc_msg *reply = jsonrpc_session_recv(js); + if (reply && reply->id){ + if (sdca_id && json_equal(sdca_id, reply->id)) { + if (reply->type == JSONRPC_ERROR) { + ovs_fatal(0, "%s: set_db_change_aware failed (%s)", + jsonrpc_session_get_name(js), + json_to_string(reply->error, 0)); + } + } else if (txn_id && json_equal(txn_id, reply->id)) { + check_transaction_reply(reply); + exit(0); + } + } + jsonrpc_msg_destroy(reply); + + jsonrpc_session_recv_wait(js); + jsonrpc_session_wait(js); + poll_block(); + } +} + +/* Command handlers may take an optional server socket name (e.g. "unix:...") + * and an optional database name (e.g. Open_vSwitch) as their initial + * arguments. The NEED_* element indicates what a particular command needs. + * These optional arguments should not be included in min_args or max_args, and + * they are not included in the argc and argv arguments passed to the handler: + * the argv[0] passed to the handler is the first argument after the optional + * server socket name. 
*/ static const struct ovsdb_client_command all_commands[] = { { "list-dbs", NEED_RPC, 0, 0, do_list_dbs }, { "get-schema", NEED_DATABASE, 0, 0, do_get_schema }, @@ -1973,12 +2386,13 @@ static const struct ovsdb_client_command all_commands[] = { { "get-schema-cksum", NEED_DATABASE, 0, 0, do_get_schema_cksum }, { "list-tables", NEED_DATABASE, 0, 0, do_list_tables }, { "list-columns", NEED_DATABASE, 0, 1, do_list_columns }, - { "transact", NEED_RPC, 1, 1, do_transact }, - { "query", NEED_RPC, 1, 1, do_query }, + { "transact", NEED_NONE, 1, 2, do_transact }, + { "query", NEED_NONE, 1, 2, do_query }, { "monitor", NEED_DATABASE, 1, INT_MAX, do_monitor }, { "monitor-cond", NEED_DATABASE, 2, 3, do_monitor_cond }, - { "convert", NEED_RPC, 1, 1, do_convert }, - { "needs-conversion", NEED_RPC, 1, 1, do_needs_conversion }, + { "wait", NEED_NONE, 2, 3, do_wait }, + { "convert", NEED_NONE, 1, 2, do_convert }, + { "needs-conversion", NEED_NONE, 1, 2, do_needs_conversion }, { "dump", NEED_DATABASE, 0, INT_MAX, do_dump }, { "backup", NEED_DATABASE, 0, 0, do_backup }, { "restore", NEED_DATABASE, 0, 0, do_restore }, diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in index dfca40d4ef79..51a2c2facc9e 100644 --- a/ovsdb/ovsdb-server.1.in +++ b/ovsdb/ovsdb-server.1.in @@ -40,12 +40,21 @@ see \fBovsdb\fR(7). Each OVSDB file may be specified on the command line as \fIdatabase\fR. If none is specified, the default is \fB@DBDIR@/conf.db\fR. The database files must already have been created and initialized using, for -example, \fBovsdb\-tool create\fR. +example, \fBovsdb\-tool\fR's \fBcreate\fR, \fBcreate\-cluster\fR, or +\fBjoin\-cluster\fR command. .PP -This OVSDB implementation supports standalone and active-backup -databases, as well as database replication. +This OVSDB implementation supports standalone, active-backup, and +clustered database service models, as well as database replication. See the Service Models section of \fBovsdb\fR(7) for more information. 
.PP +For clustered databases, when the \fB\-\-detach\fR option is used, +\fBovsdb\-server\fR detaches without waiting for the server to +successfully join a cluster (if the database file is freshly created +with \fBovsdb\-tool join\-cluster\fR) or connect to a cluster that it +has already joined. Use \fBovsdb\-client wait\fR (see +\fBovsdb\-client\fR(1)) to wait until the server has successfully +joined and connected to a cluster. +.PP In addition to user-specified databases, \fBovsdb\-server\fR version 2.9 and later also always hosts a built-in database named \fB_Server\fR. Please see \fBovsdb\-server\fR(5) for documentation on @@ -111,7 +120,10 @@ This option is not supported on Windows platform. .SS "Daemon Options" .ds DD \ \fBovsdb\-server\fR detaches only after it starts listening on all \ -configured remotes. +configured remotes. At this point, all standalone and active-backup \ +databases are ready for use. Clustered databases only become ready \ +for use after they finish joining their clusters (which could have \ +already happened in previous runs of \fBovsdb\-server\fR). .so lib/daemon.man .SS "Service Options" .so lib/service.man @@ -119,7 +131,9 @@ configured remotes. .so lib/vlog.man .SS "Active-Backup Options" These options support the \fBovsdb\-server\fR active-backup service -model and database replication. By +model and database replication. These options apply only to databases +in the format used for standalone and active-backup databases, which +is the database format created by \fBovsdb\-tool create\fR. By default, when it serves a database in this format, \fBovsdb\-server\fR runs as a standalone server. These options can configure it for active-backup use: @@ -293,6 +307,60 @@ When the connection is in \fIreplicating\fR state, further output shows the list of databases currently replicating, and the tables that are excluded. . +.SS "Cluster Commands" +These commands support the \fBovsdb\-server\fR clustered service model. 
+They apply only to databases in the format used for clustered databases, +which is the database format created by \fBovsdb\-tool create\-cluster\fR +and \fBovsdb\-tool join\-cluster\fR. +. +.IP "\fBcluster/cid \fIdb\fR" +Prints the cluster ID for \fIdb\fR, which is a UUID that identifies +the cluster. If \fIdb\fR is a database newly created by +\fBovsdb\-tool join\-cluster\fR, that has not yet successfully joined +its cluster, and \fB\-\-cid\fR was not specified on the +\fBjoin\-cluster\fR command line, then this command will report an +error because the cluster ID is not yet known. +. +.IP "\fBcluster/sid \fIdb\fR" +Prints the server ID for \fIdb\fR, which is a UUID that identifies +this server within the cluster. +. +.IP "\fBcluster/status \fIdb\fR" +Prints this server's status within the cluster and the status of its +connections to other servers in the cluster. +. +.IP "\fBcluster/leave \fR[\fB\-\-force\fR] \fIdb\fR" +.IP +Without \fB\-\-force\fR, this command starts the server gracefully +removing itself from its cluster. At least one server must remain, +and the cluster must be healthy, that is, over half its servers are +up. +.IP +With \fB\-\-force\fR, this command forces the server to leave its +cluster and form a new single-node cluster that contains only itself. +The data in the new cluster may be inconsistent with the former +cluster: transactions not yet replicated to the server will be lost, +and transactions not yet applied to the cluster may be committed. +Afterward, any servers in its former cluster will regard the server as +having failed. +.IP +When the server successfully leaves the cluster, it stops serving +\fIdb\fR, as if \fBovsdb\-server/remove\-db \fIdb\fR had been +executed. +.IP +Use \fBovsdb\-client wait\fR (see \fBovsdb\-client\fR(1)) to wait +until the server has left the cluster. +. 
+.IP "\fBcluster/kick \fIdb server\fR" +Start graceful removal of \fIserver\fR from \fIdb\fR's cluster, like +\fBcluster/leave\fR (without \fB\-\-force\fR) except that it can +remove any server, not just this one. +.IP +\fIserver\fR may be a server ID, as printed by \fBcluster/sid\fR, or +the server's local network address as passed to \fBovsdb-tool\fR's +\fBcreate\-cluster\fR or \fBjoin\-cluster\fR command. Use +\fBcluster/status\fR to see a list of cluster members. +. .so lib/vlog-unixctl.man .so lib/memory-unixctl.man .so lib/coverage-unixctl.man diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index f7bf1e270120..68584f396d10 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -49,6 +49,7 @@ #include "stream-ssl.h" #include "stream.h" #include "sset.h" +#include "storage.h" #include "table.h" #include "timeval.h" #include "transaction.h" @@ -63,7 +64,6 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_server); struct db { char *filename; - struct ovsdb_file *file; struct ovsdb *db; struct uuid row_uuid; }; @@ -107,9 +107,13 @@ static unixctl_cb_func ovsdb_server_add_database; static unixctl_cb_func ovsdb_server_remove_database; static unixctl_cb_func ovsdb_server_list_databases; -static char *open_db(struct server_config *config, const char *filename); +static void read_db(struct server_config *, struct db *); +static struct ovsdb_error *open_db(struct server_config *, + const char *filename) + OVS_WARN_UNUSED_RESULT; static void add_server_db(struct server_config *); -static void close_db(struct db *db); +static void remove_db(struct server_config *, struct shash_node *db, char *); +static void close_db(struct server_config *, struct db *, char *); static void parse_options(int argc, char *argvp[], struct sset *db_filenames, struct sset *remotes, @@ -153,7 +157,18 @@ ovsdb_replication_init(const char *sync_from, const char *exclude, } static void -main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, +log_and_free_error(struct ovsdb_error 
*error) +{ + if (error) { + char *s = ovsdb_error_to_string_free(error); + VLOG_INFO("%s", s); + free(s); + } +} + +static void +main_loop(struct server_config *config, + struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, struct unixctl_server *unixctl, struct sset *remotes, struct process *run_process, bool *exiting, bool *is_backup) { @@ -201,10 +216,25 @@ main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, } } - SHASH_FOR_EACH(node, all_dbs) { + struct shash_node *next; + SHASH_FOR_EACH_SAFE (node, next, all_dbs) { struct db *db = node->data; if (ovsdb_trigger_run(db->db, time_msec())) { - ovsdb_jsonrpc_server_reconnect(jsonrpc, false); + ovsdb_jsonrpc_server_reconnect( + jsonrpc, false, + xasprintf("committed %s database schema conversion", + db->db->name)); + } + ovsdb_storage_run(db->db->storage); + read_db(config, db); + if (ovsdb_storage_is_dead(db->db->storage)) { + VLOG_INFO("%s: removing database because storage disconnected " + "permanently", node->name); + remove_db(config, node, + xasprintf("removing database %s because storage " + "disconnected permanently", node->name)); + } else if (ovsdb_storage_should_snapshot(db->db->storage)) { + log_and_free_error(ovsdb_snapshot(db->db)); } } if (run_process) { @@ -232,6 +262,8 @@ main_loop(struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, SHASH_FOR_EACH(node, all_dbs) { struct db *db = node->data; ovsdb_trigger_wait(db->db, time_msec()); + ovsdb_storage_wait(db->db->storage); + ovsdb_storage_read_wait(db->db->storage); } if (run_process) { process_wait(run_process); @@ -267,7 +299,6 @@ main(int argc, char *argv[]) struct server_config server_config; struct shash all_dbs; struct shash_node *node, *next; - char *error; ovs_cmdl_proctitle_init(argc, argv); set_program_name(argv[0]); @@ -319,14 +350,15 @@ main(int argc, char *argv[]) perf_counters_init(); SSET_FOR_EACH (db_filename, &db_filenames) { - error = open_db(&server_config, db_filename); + struct ovsdb_error *error 
= open_db(&server_config, db_filename); if (error) { - ovs_fatal(0, "%s", error); + char *s = ovsdb_error_to_string_free(error); + ovs_fatal(0, "%s", s); } } add_server_db(&server_config); - error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes); + char *error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes); if (!error) { error = reconfigure_ssl(&all_dbs); } @@ -420,15 +452,17 @@ main(int argc, char *argv[]) ovsdb_replication_init(sync_from, sync_exclude, &all_dbs, server_uuid); } - main_loop(jsonrpc, &all_dbs, unixctl, &remotes, run_process, &exiting, - &is_backup); + main_loop(&server_config, jsonrpc, &all_dbs, unixctl, &remotes, + run_process, &exiting, &is_backup); - ovsdb_jsonrpc_server_destroy(jsonrpc); SHASH_FOR_EACH_SAFE(node, next, &all_dbs) { struct db *db = node->data; - close_db(db); + close_db(&server_config, db, + xasprintf("removing %s database due to server termination", + db->db->name)); shash_delete(&all_dbs, node); } + ovsdb_jsonrpc_server_destroy(jsonrpc); shash_destroy(&all_dbs); sset_destroy(&remotes); sset_destroy(&db_filenames); @@ -480,55 +514,171 @@ is_already_open(struct server_config *config OVS_UNUSED, } static void -close_db(struct db *db) +close_db(struct server_config *config, struct db *db, char *comment) +{ + if (db) { + ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db, comment); + ovsdb_destroy(db->db); + free(db->filename); + free(db); + } else { + free(comment); + } +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +parse_txn(struct server_config *config, struct db *db, + struct ovsdb_schema *schema, const struct json *txn_json, + const struct uuid *txnid) { - ovsdb_destroy(db->db); - free(db->filename); - free(db); + if (schema) { + /* We're replacing the schema (and the data). Destroy the database + * (first grabbing its storage), then replace it with the new schema. + * The transaction must also include the replacement data. 
*/ + ovs_assert(txn_json); + ovs_assert(ovsdb_storage_is_clustered(db->db->storage)); + + struct ovsdb_error *error = ovsdb_schema_check_for_ephemeral_columns( + schema); + if (error) { + return error; + } + + ovsdb_jsonrpc_server_remove_db( + config->jsonrpc, db->db, + (db->db->schema + ? xasprintf("database %s schema changed", db->db->name) + : xasprintf("database %s connected to storage", db->db->name))); + struct ovsdb_storage *storage = db->db->storage; + db->db->storage = NULL; + ovsdb_destroy(db->db); + + db->db = ovsdb_create(schema, storage); + ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); + + /* Force update to schema in _Server database. */ + db->row_uuid = UUID_ZERO; + } + + if (txn_json) { + if (!db->db->schema) { + return ovsdb_error(NULL, "%s: data without schema", db->filename); + } + + struct ovsdb_txn *txn; + struct ovsdb_error *error; + + error = ovsdb_file_txn_from_json(db->db, txn_json, false, &txn); + if (!error) { + log_and_free_error(ovsdb_txn_replay_commit(txn)); + } + if (!error && !uuid_is_zero(txnid)) { + db->db->prereq = *txnid; + } + if (error) { + ovsdb_storage_unread(db->db->storage); + return error; + } + } + + return NULL; +} + +static void +read_db(struct server_config *config, struct db *db) +{ + struct ovsdb_error *error; + for (;;) { + struct ovsdb_schema *schema; + struct json *txn_json; + struct uuid txnid; + error = ovsdb_storage_read(db->db->storage, &schema, &txn_json, + &txnid); + if (error) { + break; + } else if (!schema && !txn_json) { + /* End of file. */ + return; + } else { + error = parse_txn(config, db, schema, txn_json, &txnid); + json_destroy(txn_json); + if (error) { + break; + } + } + } + + /* Log error but otherwise ignore it. Probably the database just + * got truncated due to power failure etc. and we should use its + * current contents. 
*/ + char *msg = ovsdb_error_to_string_free(error); + VLOG_ERR("%s", msg); + free(msg); } static void -add_db(struct server_config *config, const char *name, struct db *db) +add_db(struct server_config *config, struct db *db) { db->row_uuid = UUID_ZERO; - shash_add_assert(config->all_dbs, name, db); - bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc, - db->db); - ovs_assert(ok); + shash_add_assert(config->all_dbs, db->db->name, db); } -static char * +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT open_db(struct server_config *config, const char *filename) { - struct ovsdb_error *db_error; struct db *db; - char *error; /* If we know that the file is already open, return a good error message. * Otherwise, if the file is open, we'll fail later on with a harder to * interpret file locking error. */ if (is_already_open(config, filename)) { - return xasprintf("%s: already open", filename); + return ovsdb_error(NULL, "%s: already open", filename); + } + + struct ovsdb_storage *storage; + struct ovsdb_error *error; + error = ovsdb_storage_open(filename, true, &storage); + if (error) { + return error; } db = xzalloc(sizeof *db); db->filename = xstrdup(filename); - db_error = ovsdb_file_open(db->filename, false, &db->db, &db->file); - if (db_error) { - error = ovsdb_error_to_string_free(db_error); - } else if (db->db->schema->name[0] == '_') { - error = xasprintf("%s: names beginning with \"_\" are reserved", - db->db->schema->name); - } else if (!ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db)) { - error = xasprintf("%s: duplicate database name", db->db->schema->name); + struct ovsdb_schema *schema; + if (ovsdb_storage_is_clustered(storage)) { + schema = NULL; } else { - shash_add_assert(config->all_dbs, db->db->schema->name, db); - return NULL; + struct json *txn_json; + error = ovsdb_storage_read(storage, &schema, &txn_json, NULL); + if (error) { + ovsdb_storage_close(storage); + return error; + } + ovs_assert(schema && !txn_json); + } + db->db = 
ovsdb_create(schema, storage); + ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); + + read_db(config, db); + + error = (db->db->name[0] == '_' + ? ovsdb_error(NULL, "%s: names beginning with \"_\" are reserved", + db->db->name) + : shash_find(config->all_dbs, db->db->name) + ? ovsdb_error(NULL, "%s: duplicate database name", db->db->name) + : NULL); + if (error) { + char *error_s = ovsdb_error_to_string(error); + close_db(config, db, + xasprintf("cannot complete opening %s database (%s)", + db->db->name, error_s)); + free(error_s); + return error; } - close_db(db); - return error; + add_db(config, db); + return NULL; } /* Add the internal _Server database to the server configuration. */ @@ -548,8 +698,10 @@ add_server_db(struct server_config *config) struct db *db = xzalloc(sizeof *db); db->filename = xstrdup(""); - db->db = ovsdb_create(schema); - add_db(config, db->db->schema->name, db); + db->db = ovsdb_create(schema, ovsdb_storage_create_unbacked()); + bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); + ovs_assert(ok); + add_db(config, db); } static char * OVS_WARN_UNUSED_RESULT @@ -560,11 +712,8 @@ parse_db_column__(const struct shash *all_dbs, const struct ovsdb_column **columnp) { const char *db_name, *table_name, *column_name; - const struct ovsdb_column *column; - const struct ovsdb_table *table; const char *tokens[3]; char *save_ptr = NULL; - const struct db *db; *dbp = NULL; *tablep = NULL; @@ -582,25 +731,22 @@ parse_db_column__(const struct shash *all_dbs, table_name = tokens[1]; column_name = tokens[2]; - db = shash_find_data(all_dbs, tokens[0]); - if (!db) { + *dbp = shash_find_data(all_dbs, tokens[0]); + if (!*dbp) { return xasprintf("\"%s\": no database named %s", name_, db_name); } - table = ovsdb_get_table(db->db, table_name); - if (!table) { + *tablep = ovsdb_get_table((*dbp)->db, table_name); + if (!*tablep) { return xasprintf("\"%s\": no table named %s", name_, table_name); } - column = 
ovsdb_table_schema_get_column(table->schema, column_name); - if (!column) { + *columnp = ovsdb_table_schema_get_column((*tablep)->schema, column_name); + if (!*columnp) { return xasprintf("\"%s\": table \"%s\" has no column \"%s\"", name_, table_name, column_name); } - *dbp = db; - *columnp = column; - *tablep = table; return NULL; } @@ -662,7 +808,13 @@ query_db_string(const struct shash *all_dbs, const char *name, retval = parse_db_string_column(all_dbs, name, &db, &table, &column); if (retval) { - ds_put_format(errors, "%s\n", retval); + if (db && !db->db->schema) { + /* 'db' is a clustered database but it hasn't connected to the + * cluster yet, so we can't get anything out of it, not even a + * schema. Not really an error. */ + } else { + ds_put_format(errors, "%s\n", retval); + } free(retval); return NULL; } @@ -768,7 +920,13 @@ query_db_remotes(const char *name, const struct shash *all_dbs, retval = parse_db_column(all_dbs, name, &db, &table, &column); if (retval) { - ds_put_format(errors, "%s\n", retval); + if (db && !db->db->schema) { + /* 'db' is a clustered database but it hasn't connected to the + * cluster yet, so we can't get anything out of it, not even a + * schema. Not really an error. 
*/ + } else { + ds_put_format(errors, "%s\n", retval); + } free(retval); return; } @@ -918,7 +1076,7 @@ update_remote_rows(const struct shash *all_dbs, const struct db *db_, static void commit_txn(struct ovsdb_txn *txn, const char *name) { - struct ovsdb_error *error = ovsdb_txn_commit(txn, false); + struct ovsdb_error *error = ovsdb_txn_propose_commit_block(txn, false); if (error) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); char *msg = ovsdb_error_to_string_free(error); @@ -935,15 +1093,16 @@ update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, struct shash_node *node; SHASH_FOR_EACH (node, all_dbs) { struct db *db = node->data; - struct ovsdb_txn *txn = ovsdb_txn_create(db->db); + if (!db->db || ovsdb_storage_is_clustered(db->db->storage)) { + continue; + } - /* Iterate over --remote arguments given on command line. */ + struct ovsdb_txn *txn = ovsdb_txn_create(db->db); const char *remote; SSET_FOR_EACH (remote, remotes) { update_remote_rows(all_dbs, db, remote, jsonrpc, txn); } - - commit_txn(txn, node->name); + commit_txn(txn, "remote status"); } } @@ -952,7 +1111,24 @@ update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, static void update_database_status(struct ovsdb_row *row, struct db *db) { - ovsdb_util_write_string_column(row, "name", db->db->schema->name); + ovsdb_util_write_string_column(row, "name", db->db->name); + ovsdb_util_write_string_column(row, "model", + ovsdb_storage_get_model(db->db->storage)); + ovsdb_util_write_bool_column(row, "connected", + ovsdb_storage_is_connected(db->db->storage)); + ovsdb_util_write_bool_column(row, "leader", + ovsdb_storage_is_leader(db->db->storage)); + ovsdb_util_write_uuid_column(row, "cid", + ovsdb_storage_get_cid(db->db->storage)); + ovsdb_util_write_uuid_column(row, "sid", + ovsdb_storage_get_sid(db->db->storage)); + + uint64_t index = ovsdb_storage_get_applied_index(db->db->storage); + if (index) { + ovsdb_util_write_integer_column(row, "index", index); + } else { 
+ ovsdb_util_clear_column(row, "index"); + } const struct uuid *row_uuid = ovsdb_row_get_uuid(row); if (!uuid_equals(row_uuid, &db->row_uuid)) { @@ -961,11 +1137,14 @@ update_database_status(struct ovsdb_row *row, struct db *db) /* The schema can only change if the row UUID changes, so only update * it in that case. Presumably, this is worth optimizing because * schemas are often kilobytes in size and nontrivial to serialize. */ - struct json *json_schema = ovsdb_schema_to_json(db->db->schema); - char *schema = json_to_string(json_schema, JSSF_SORT); + char *schema = NULL; + if (db->db->schema) { + struct json *json_schema = ovsdb_schema_to_json(db->db->schema); + schema = json_to_string(json_schema, JSSF_SORT); + json_destroy(json_schema); + } ovsdb_util_write_string_column(row, "schema", schema); free(schema); - json_destroy(json_schema); } } @@ -1231,7 +1410,8 @@ ovsdb_server_disable_monitor_cond(struct unixctl_conn *conn, struct ovsdb_jsonrpc_server *jsonrpc = jsonrpc_; ovsdb_jsonrpc_disable_monitor_cond(); - ovsdb_jsonrpc_server_reconnect(jsonrpc, true); + ovsdb_jsonrpc_server_reconnect( + jsonrpc, true, xstrdup("user ran ovsdb-server/disable-monitor")); unixctl_command_reply(conn, NULL); } @@ -1242,7 +1422,6 @@ ovsdb_server_compact(struct unixctl_conn *conn, int argc, const char *db_name = argc < 2 ? NULL : argv[1]; struct shash *all_dbs = dbs_; struct ds reply; - struct db *db; struct shash_node *node; int n = 0; @@ -1253,22 +1432,24 @@ ovsdb_server_compact(struct unixctl_conn *conn, int argc, ds_init(&reply); SHASH_FOR_EACH(node, all_dbs) { - db = node->data; + struct db *db = node->data; if (db_name ? 
!strcmp(node->name, db_name) : node->name[0] != '_') { - struct ovsdb_error *error; - - VLOG_INFO("compacting %s database by user request", node->name); + if (db->db) { + VLOG_INFO("compacting %s database by user request", + node->name); + + struct ovsdb_error *error = ovsdb_snapshot(db->db); + if (error) { + char *s = ovsdb_error_to_string(error); + ds_put_format(&reply, "%s\n", s); + free(s); + ovsdb_error_destroy(error); + } - error = ovsdb_file_compact(db->file); - if (error) { - char *s = ovsdb_error_to_string_free(error); - ds_put_format(&reply, "%s\n", s); - free(s); + n++; } - - n++; } } @@ -1289,7 +1470,8 @@ ovsdb_server_reconnect(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *jsonrpc_) { struct ovsdb_jsonrpc_server *jsonrpc = jsonrpc_; - ovsdb_jsonrpc_server_reconnect(jsonrpc, true); + ovsdb_jsonrpc_server_reconnect( + jsonrpc, true, xstrdup("user ran ovsdb-server/reconnect")); unixctl_command_reply(conn, NULL); } @@ -1370,9 +1552,8 @@ ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED, { struct server_config *config = config_; const char *filename = argv[1]; - char *error; - error = open_db(config, filename); + char *error = ovsdb_error_to_string_free(open_db(config, filename)); if (!error) { save_config(config); if (*config->is_backup) { @@ -1389,13 +1570,11 @@ ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED, } static void -remove_db(struct server_config *config, struct shash_node *node) +remove_db(struct server_config *config, struct shash_node *node, char *comment) { struct db *db = node->data; - ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db); - - close_db(db); + close_db(config, db, comment); shash_delete(config->all_dbs, node); save_config(config); @@ -1424,7 +1603,8 @@ ovsdb_server_remove_database(struct unixctl_conn *conn, int argc OVS_UNUSED, return; } - remove_db(config, node); + remove_db(config, node, xasprintf("removing %s database by user request", 
+ node->name)); unixctl_command_reply(conn, NULL); } @@ -1441,7 +1621,11 @@ ovsdb_server_list_databases(struct unixctl_conn *conn, int argc OVS_UNUSED, nodes = shash_sort(all_dbs); for (i = 0; i < shash_count(all_dbs); i++) { - ds_put_format(&s, "%s\n", nodes[i]->name); + const struct shash_node *node = nodes[i]; + struct db *db = node->data; + if (db->db) { + ds_put_format(&s, "%s\n", node->name); + } } free(nodes); diff --git a/ovsdb/ovsdb-tool.1.in b/ovsdb/ovsdb-tool.1.in index 7b89ffeec8bf..3efa6a3e7032 100644 --- a/ovsdb/ovsdb-tool.1.in +++ b/ovsdb/ovsdb-tool.1.in @@ -15,6 +15,9 @@ ovsdb\-tool \- Open vSwitch database management utility .IP "Database Creation Commands:" \fBovsdb\-tool \fR[\fIoptions\fR] \fBcreate \fR[\fIdb\fR [\fIschema\fR]] .br +\fBovsdb\-tool \fR[\fIoptions\fR] \fBcreate\-cluster \fIdb contents address\fR +.br +\fBovsdb\-tool [\fB\-\-cid=\fIuuid\fR] \fBjoin\-cluster\fI db name local remote\fR... .IP "Version Management Commands:" \fBovsdb\-tool \fR[\fIoptions\fR] \fBconvert \fR[\fIdb\fR [\fIschema \fR[\fItarget\fR]]] @@ -37,6 +40,8 @@ ovsdb\-tool \- Open vSwitch database management utility .br \fBovsdb\-tool \fR[\fIoptions\fR] [\fB\-m\fR | \fB\-\-more\fR]... \fBshow\-log \fR[\fIdb\fR] .br +\fBovsdb\-tool \fR[\fIoptions\fR] \fBcheck\-cluster \fIdb\fR... +.br \fBovsdb\-tool \fR[\fIoptions\fR] \fBdb\-name \fR[\fIdb\fR] .br \fBovsdb\-tool \fR[\fIoptions\fR] \fBschema\-name \fR[\fIschema\fR] @@ -54,23 +59,85 @@ For an introduction to OVSDB and its implementation in Open vSwitch, see \fBovsdb\fR(7). .PP This OVSDB implementation supports standalone and active-backup -database service models with a common on-disk format For a -specification of this format, see \fBovsdb\fR(5). For more +database service models with one on-disk format and a clustered +database service model with a different format. \fBovsdb\-tool\fR +supports both formats, but some commands are appropriate for only one +format, as documented for individual commands below. 
For a +specification of these formats, see \fBovsdb\fR(5). For more information on OVSDB service models, see the \fBService Models\fR section in \fBovsdb\fR(7). . .SS "Database Creation Commands" -This command creates a new OVSDB database file. -It will not overwrite an existing database file. To +These commands create a new OVSDB database file. +They will not overwrite an existing database file. To replace an existing database with a new one, first delete the old one. . -.IP "\fBcreate\fI db schema\fR" +.IP "\fBcreate\fI [db [schema]]\fR" Use this command to create the database for controlling \fBovs\-vswitchd\fR or another standalone or active-backup database. It creates database file \fIdb\fR with the given \fIschema\fR, which must be the name of a file that contains an OVSDB schema in JSON format, as specified in the OVSDB specification. The new database is -initially empty. +initially empty. (You can use \fBcp\fR to copy a database including +both its schema and data.) +. +.IP "\fBcreate\-cluster\fI db contents local" +Use this command to initialize the first server in a high-availability +cluster of 3 (or more) database servers, e.g. for an OVN northbound or +southbound database in an environment that cannot tolerate a single +point of failure. It creates clustered database file \fIdb\fR and +configures the server to listen on \fIlocal\fR, which must take the +form \fIprotocol\fB:\fIip\fB:\fIport\fR, where \fIprotocol\fR is +\fBtcp\fR or \fBssl\fR, \fIip\fR is the server's IP (either an IPv4 +address or an IPv6 address enclosed in square brackets), and +\fIport\fR is a TCP port number. Only one address is specified, for +the first server in the cluster, ordinarily the one for the server +running \fBcreate\-cluster\fR. The address is used for communication +within the cluster, not for communicating with OVSDB clients, and must +not use the same port used for the OVSDB protocol. 
+.IP +The new database is initialized with \fIcontents\fR, which must name a +file that contains either an OVSDB schema in JSON format or a +standalone OVSDB database. If it is a schema file, the new database +will initially be empty, with the given schema. If it is a database +file, the new database will have the same schema and contents. +. +.IP "[\fB\-\-cid=\fIuuid\fR] \fBjoin\-cluster\fI db name local remote\fR..." +Use this command to initialize each server after the first one in an +OVSDB high-availability cluster. It creates clustered database file +\fIdb\fR for a database named \fIname\fR, and +configures the server to listen on \fIlocal\fR and to initially +connect to \fIremote\fR, which must be a server that already belongs +to the cluster. \fIlocal\fR and \fIremote\fR use the same +\fIprotocol\fB:\fIip\fB:\fIport\fR syntax as \fBcreate\-cluster\fR. +.IP +The \fIname\fR must be the name of the schema or database passed to +\fBcreate\-cluster\fR. For example, the name of the OVN Southbound +database schema is \fBOVN_Southbound\fR. Use \fBovsdb\-tool\fR's +\fBschema\-name\fR or \fBdb\-name\fR command to find out the name of a +schema or database, respectively. +.IP +This command does not do any network access, which means that it +cannot actually join the new server to the cluster. Instead, the +\fIdb\fR file that it creates prepares the server to join the cluster +the first time that \fBovsdb\-server\fR serves it. As part of joining +the cluster, the new server retrieves the database schema and obtains +the list of all cluster members. Only after that does it become a +full member of the cluster. +.IP +Optionally, more than one \fIremote\fR may be specified; for example, +in a cluster that already contains multiple servers, one could specify +all the existing servers. This is beneficial if some of the existing +servers are down while the new server joins, but it is not otherwise +needed. 
+.IP +By default, the \fIdb\fR created by \fBjoin\-cluster\fR will join any +clustered database named \fIname\fR that is available at a +\fIremote\fR. In theory, if machines go up and down and IP addresses +change in the right way, it could join the wrong database cluster. To +avoid this possibility, specify \fB\-\-cid=\fIuuid\fR, where +\fIuuid\fR is the cluster ID of the cluster to join, as printed by +\fBovsdb\-tool db\-cid\fR. . .SS "Version Management Commands" .so ovsdb/ovsdb-schemas.man @@ -102,11 +169,19 @@ example, converting a database from a schema that has a given column or table to one that does not will delete all data in that column or table. Back up critical databases before converting them. .IP +This command is for standalone and active-backup databases only. For +clustered databases, use \fBovsdb\-client\fR's \fBconvert\fR command +to convert them online. +. .IP "\fBneeds\-conversion\fI db schema\fR" Reads the schema embedded in \fIdb\fR and the JSON schema from \fIschema\fR and compares them. If the schemas are the same, prints \fBno\fR on stdout; if they differ, prints \fByes\fR. .IP +This command is for standalone and active-backup databases only. For +clustered databases, use \fBovsdb\-client\fR's \fBneeds\-conversion\fR +command instead. +. .IP "\fBdb\-version\fI db\fR" .IQ "\fBschema\-version\fI schema\fR" Prints the version number in the schema embedded within the database @@ -115,6 +190,10 @@ If \fIschema\fR or \fIdb\fR was created before schema versioning was introduced, then it will not have a version number and this command will print a blank line. .IP +The \fBschema\-version\fR command is for standalone and active-backup +databases only. For clustered databases, use \fBovsdb\-client\fR's +\fBschema\-version\fR command instead. +. 
.IP "\fBdb\-cksum\fI db\fR" .IQ "\fBschema\-cksum\fI schema\fR" Prints the checksum in the schema embedded within the database @@ -123,6 +202,10 @@ If \fIschema\fR or \fIdb\fR was created before schema checksums were introduced, then it will not have a checksum and this command will print a blank line. .IP +The \fBschema\-cksum\fR command is for standalone and active-backup +databases only. For clustered databases, use \fBovsdb\-client\fR's +\fBschema\-cksum\fR command instead. +. .SS "Other Commands" . .IP "\fBcompact\fI db \fR[\fItarget\fR]" @@ -136,8 +219,10 @@ database that grows much larger than its minimum size. .IP This command does not work if \fIdb\fR is currently being served by \fBovsdb\-server\fR, or if it is otherwise locked for writing by -another process. Instead, send the \fBovsdb\-server/compact\fR -command to \fBovsdb\-server\fR, via \fBovs\-appctl\fR). +another process. This command also does not work with clustered +databases. Instead, in either case, send the +\fBovsdb\-server/compact\fR command to \fBovsdb\-server\fR, via +\fBovs\-appctl\fR). . .IP "[\fB\-\-rbac\-role=\fIrole\fR] \fBquery\fI db transaction\fR" Opens \fIdb\fR, executes \fItransaction\fR on it, and prints the @@ -153,6 +238,10 @@ may specify database modifications, but these will have no effect on .IP By default, the transaction is executed using the ``superuser'' RBAC role. Use \fB\-\-rbac\-role\fR to specify a different role. +.IP +This command does not work with clustered databases. Instead, use +\fBovsdb-client\fR's \fBquery\fR command to send the query to +\fBovsdb\-server\fR. . .IP "[\fR\-\-rbac\-role=\fIrole\fR] \fBtransact\fI db transaction\fR" Opens \fIdb\fR, executes \fItransaction\fR on it, prints the results, @@ -162,8 +251,9 @@ JSON array in the format of the \fBparams\fR array for the JSON-RPC .IP This command does not work if \fIdb\fR is currently being served by \fBovsdb\-server\fR, or if it is otherwise locked for writing by -another process. 
Instead, use \fBovsdb\-client\fR's \fBtransact\fR -command to send the query to \fBovsdb\-server\fR. +another process. This command also does not work with clustered +databases. Instead, in either case, use \fBovsdb\-client\fR's +\fBtransact\fR command to send the query to \fBovsdb\-server\fR. .IP By default, the transaction is executed using the ``superuser'' RBAC role. Use \fB\-\-rbac\-role\fR to specify a different role. @@ -179,12 +269,47 @@ one or more times to the command line. With one \fB\-m\fR, modified by each transaction. With two \fB\-m\fRs, \fBshow\-log\fR also prints the values of the columns modified by each change to a record. +.IP +This command works with standalone and active-backup databases and +with clustered databases, but the output formats are different. +. +.IP "\fBcheck\-cluster \fIdb\fR..." +Reads all of the records in the supplied database logs, which must be +logs collected from different servers (and ideally all the servers) in +a single cluster. Checks each log for self-consistency and the set of +logs together for cross-consistency. If \fBovsdb\-tool\fR detects +unusual but not necessarily incorrect content, it prints a warning or +warnings on stdout. If \fBovsdb\-tool\fR finds consistency errors, it +prints an error on stderr and exits with status 1. Errors typically +indicate bugs in \fBovsdb\-server\fR; please consider reporting them +to the Open vSwitch developers. . .IP "\fBdb\-name \fR[\fIdb\fR]" .IQ "\fBschema\-name \fR[\fIschema\fR]" Prints the name of the schema embedded within the database \fIdb\fR or in the JSON schema \fIschema\fR on stdout. . +.IP "\fBdb\-cid\fI db\fR" +Prints the cluster ID, which is a UUID that identifies the cluster, +for \fIdb\fR. 
If \fIdb\fR is a database newly created by +\fBovsdb\-tool join\-cluster\fR, that has not yet successfully joined +its cluster, and \fB\-\-cid\fR was not specified on the +\fBjoin\-cluster\fR command line, then this command will output an +error, and exit with status 2, because the cluster ID is not yet +known. This command works only with clustered databases. +.IP +The all-zeros UUID is not a valid cluster ID. +. +.IP "\fBdb\-sid\fI db\fR" +Prints the server ID, which is a UUID that identifies the server, for +\fIdb\fR. This command works only with clustered databases. It works +regardless of whether \fIdb\fR has joined the cluster. +. +.IP "\fBdb\-local\-address\fI db\fR" +Prints the address used for database clustering for \fIdb\fR, in the +same \fIprotocol\fB:\fIip\fB:\fIport\fR form used on +\fBcreate\-cluster\fR and \fBjoin\-cluster\fR. +. .SH OPTIONS .SS "Logging Options" .so lib/vlog.man diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index cec64152f079..bef40e81e62b 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -29,15 +29,22 @@ #include "openvswitch/dynamic-string.h" #include "fatal-signal.h" #include "file.h" +#include "hash.h" #include "lockfile.h" #include "log.h" +#include "openvswitch/hmap.h" #include "openvswitch/json.h" #include "ovsdb.h" #include "ovsdb-data.h" #include "ovsdb-error.h" +#include "ovsdb-parser.h" +#include "raft.h" +#include "raft-private.h" #include "socket-util.h" +#include "storage.h" #include "table.h" #include "timeval.h" +#include "transaction.h" #include "util.h" #include "openvswitch/vlog.h" @@ -47,6 +54,9 @@ static int show_log_verbosity; /* --role: RBAC role to use for "transact" and "query" commands. */ static const char *rbac_role; +/* --cid: Cluster ID for "join-cluster" command. 
*/ +static struct uuid cid; + static const struct ovs_cmdl_command *get_all_commands(void); OVS_NO_RETURN static void usage(void); @@ -62,6 +72,7 @@ main(int argc, char *argv[]) set_program_name(argv[0]); parse_options(argc, argv); fatal_ignore_sigpipe(); + fatal_signal_init(); ctx.argc = argc - optind; ctx.argv = argv + optind; ovs_cmdl_run_command(&ctx, get_all_commands()); @@ -72,11 +83,13 @@ static void parse_options(int argc, char *argv[]) { enum { - OPT_RBAC_ROLE = UCHAR_MAX + 1 + OPT_RBAC_ROLE = UCHAR_MAX + 1, + OPT_CID }; static const struct option long_options[] = { {"more", no_argument, NULL, 'm'}, {"rbac-role", required_argument, NULL, OPT_RBAC_ROLE}, + {"cid", required_argument, NULL, OPT_CID}, {"verbose", optional_argument, NULL, 'v'}, {"help", no_argument, NULL, 'h'}, {"option", no_argument, NULL, 'o'}, @@ -102,6 +115,12 @@ parse_options(int argc, char *argv[]) rbac_role = optarg; break; + case OPT_CID: + if (!uuid_from_string(&cid, optarg) || uuid_is_zero(&cid)) { + ovs_fatal(0, "%s: not a valid UUID", optarg); + } + break; + case 'h': usage(); @@ -133,11 +152,18 @@ usage(void) printf("%s: Open vSwitch database management utility\n" "usage: %s [OPTIONS] COMMAND [ARG...]\n" " create [DB [SCHEMA]] create DB with the given SCHEMA\n" + " create-cluster DB CONTENTS LOCAL\n" + " create clustered DB with given CONTENTS and LOCAL address\n" + " [--cid=UUID] join-cluster DB NAME LOCAL REMOTE...\n" + " join clustered DB with given NAME and LOCAL and REMOTE addrs\n" " compact [DB [DST]] compact DB in-place (or to DST)\n" " convert [DB [SCHEMA [DST]]] convert DB to SCHEMA (to DST)\n" " db-name [DB] report name of schema used by DB\n" " db-version [DB] report version of schema used by DB\n" " db-cksum [DB] report checksum of schema used by DB\n" + " db-cid DB report cluster ID of clustered DB\n" + " db-sid DB report server ID of clustered DB\n" + " db-local-address DB report local address of clustered DB\n" " schema-name [SCHEMA] report SCHEMA's name\n" " 
schema-version [SCHEMA] report SCHEMA's schema version\n" " schema-cksum [SCHEMA] report SCHEMA's checksum\n" @@ -203,6 +229,16 @@ check_ovsdb_error(struct ovsdb_error *error) ovs_fatal(0, "%s", ovsdb_error_to_string(error)); } } + +static struct ovsdb_schema * +read_schema(const char *filename) +{ + struct ovsdb_storage *storage = ovsdb_storage_open_standalone(filename, + false); + struct ovsdb_schema *schema = ovsdb_storage_read_schema(storage); + ovsdb_storage_close(storage); + return schema; +} static void do_create(struct ovs_cmdl_context *ctx) @@ -229,46 +265,136 @@ do_create(struct ovs_cmdl_context *ctx) } static void +do_create_cluster(struct ovs_cmdl_context *ctx) +{ + const char *db_file_name = ctx->argv[1]; + const char *src_file_name = ctx->argv[2]; + const char *local = ctx->argv[3]; + + struct ovsdb_schema *schema; + struct json *data; + + struct ovsdb_error *error = ovsdb_schema_from_file(src_file_name, &schema); + if (!error) { + /* It's just a schema file. */ + data = json_object_create(); + } else { + /* Not a schema file. Try reading it as a standalone database. */ + ovsdb_error_destroy(error); + + struct ovsdb *ovsdb = ovsdb_file_read(src_file_name, false); + char *comment = xasprintf("created from %s", src_file_name); + data = ovsdb_to_txn_json(ovsdb, comment); + free(comment); + schema = ovsdb_schema_clone(ovsdb->schema); + ovsdb_destroy(ovsdb); + } + + ovsdb_schema_persist_ephemeral_columns(schema, src_file_name); + + struct json *schema_json = ovsdb_schema_to_json(schema); + + /* Create database file. */ + struct json *snapshot = json_array_create_2(schema_json, data); + check_ovsdb_error(raft_create_cluster(db_file_name, schema->name, + local, snapshot)); + ovsdb_schema_destroy(schema); + json_destroy(snapshot); +} + +static void +do_join_cluster(struct ovs_cmdl_context *ctx) +{ + const char *db_file_name = ctx->argv[1]; + const char *name = ctx->argv[2]; + const char *local = ctx->argv[3]; + + /* Check for a plausible 'name'. 
*/ + if (!ovsdb_parser_is_id(name)) { + ovs_fatal(0, "%s: not a valid schema name (use \"schema-name\" " + "command to find the correct name)", name); + } + + /* Create database file. */ + struct sset remote_addrs = SSET_INITIALIZER(&remote_addrs); + for (size_t i = 4; i < ctx->argc; i++) { + sset_add(&remote_addrs, ctx->argv[i]); + } + check_ovsdb_error(raft_join_cluster(db_file_name, name, local, + &remote_addrs, + uuid_is_zero(&cid) ? NULL : &cid)); + sset_destroy(&remote_addrs); +} + +static struct ovsdb_error * +write_and_free_json(struct ovsdb_log *log, struct json *json) +{ + struct ovsdb_error *error = ovsdb_log_write(log, json); + json_destroy(json); + return error; +} + +static struct ovsdb_error * +write_db(const char *file_name, const char *comment, const struct ovsdb *db) +{ + struct ovsdb_log *log; + struct ovsdb_error *error = ovsdb_log_open(file_name, OVSDB_MAGIC, + OVSDB_LOG_CREATE, false, &log); + if (error) { + return error; + } + + error = write_and_free_json(log, ovsdb_schema_to_json(db->schema)); + if (!error) { + error = write_and_free_json(log, ovsdb_to_txn_json(db, comment)); + } + ovsdb_log_close(log); + + if (error) { + remove(file_name); + } + return error; +} + +static void compact_or_convert(const char *src_name_, const char *dst_name_, - const struct ovsdb_schema *new_schema, - const char *comment) + struct ovsdb_schema *new_schema, const char *comment) { - char *src_name, *dst_name; - struct lockfile *src_lock; - struct lockfile *dst_lock; bool in_place = dst_name_ == NULL; - struct ovsdb *db; - int retval; /* Dereference symlinks for source and destination names. In the in-place * case this ensures that, if the source name is a symlink, we replace its * target instead of replacing the symlink by a regular file. In the * non-in-place, this has the same effect for the destination name. */ - src_name = follow_symlinks(src_name_); - dst_name = (in_place - ? 
xasprintf("%s.tmp", src_name) - : follow_symlinks(dst_name_)); + char *src_name = follow_symlinks(src_name_); + char *dst_name = (in_place + ? xasprintf("%s.tmp", src_name) + : follow_symlinks(dst_name_)); /* Lock the source, if we will be replacing it. */ + struct lockfile *src_lock = NULL; if (in_place) { - retval = lockfile_lock(src_name, &src_lock); + int retval = lockfile_lock(src_name, &src_lock); if (retval) { ovs_fatal(retval, "%s: failed to lock lockfile", src_name); } } /* Get (temporary) destination and lock it. */ - retval = lockfile_lock(dst_name, &dst_lock); + struct lockfile *dst_lock = NULL; + int retval = lockfile_lock(dst_name, &dst_lock); if (retval) { ovs_fatal(retval, "%s: failed to lock lockfile", dst_name); } /* Save a copy. */ - check_ovsdb_error(new_schema - ? ovsdb_file_open_as_schema(src_name, new_schema, &db) - : ovsdb_file_open(src_name, true, &db, NULL)); - check_ovsdb_error(ovsdb_file_save_copy(dst_name, false, comment, db)); - ovsdb_destroy(db); + struct ovsdb *ovsdb = (new_schema + ? ovsdb_file_read_as_schema(src_name, new_schema) + : ovsdb_file_read(src_name, false)); + ovsdb_storage_close(ovsdb->storage); + ovsdb->storage = NULL; + check_ovsdb_error(write_db(dst_name, comment, ovsdb)); + ovsdb_destroy(ovsdb); /* Replace source. */ if (in_place) { @@ -309,7 +435,6 @@ do_convert(struct ovs_cmdl_context *ctx) check_ovsdb_error(ovsdb_schema_from_file(schema, &new_schema)); compact_or_convert(db, target, new_schema, "converted by ovsdb-tool "VERSION); - ovsdb_schema_destroy(new_schema); } static void @@ -317,9 +442,9 @@ do_needs_conversion(struct ovs_cmdl_context *ctx) { const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db(); const char *schema_file_name = ctx->argc >= 3 ? 
ctx->argv[2] : default_schema(); - struct ovsdb_schema *schema1, *schema2; + struct ovsdb_schema *schema1 = read_schema(db_file_name); + struct ovsdb_schema *schema2; - check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema1)); check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema2)); puts(ovsdb_schema_equal(schema1, schema2) ? "no" : "yes"); ovsdb_schema_destroy(schema1); @@ -330,20 +455,39 @@ static void do_db_name(struct ovs_cmdl_context *ctx) { const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db(); - struct ovsdb_schema *schema; - check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema)); - puts(schema->name); - ovsdb_schema_destroy(schema); + struct ovsdb_log *log; + check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC"|"RAFT_MAGIC, + OVSDB_LOG_READ_ONLY, -1, &log)); + if (!strcmp(ovsdb_log_get_magic(log), OVSDB_MAGIC)) { + struct json *schema_json; + check_ovsdb_error(ovsdb_log_read(log, &schema_json)); + + struct ovsdb_schema *schema; + check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema)); + + puts(schema->name); + + ovsdb_schema_destroy(schema); + json_destroy(schema_json); + } else if (!strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC)) { + struct raft_metadata md; + check_ovsdb_error(raft_read_metadata(log, &md)); + puts(md.name); + raft_metadata_destroy(&md); + } else { + OVS_NOT_REACHED(); + } + + ovsdb_log_close(log); } static void do_db_version(struct ovs_cmdl_context *ctx) { const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db(); - struct ovsdb_schema *schema; + struct ovsdb_schema *schema = read_schema(db_file_name); - check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema)); puts(schema->version); ovsdb_schema_destroy(schema); } @@ -352,75 +496,120 @@ static void do_db_cksum(struct ovs_cmdl_context *ctx) { const char *db_file_name = ctx->argc >= 2 ? 
ctx->argv[1] : default_db(); - struct ovsdb_schema *schema; - - check_ovsdb_error(ovsdb_file_read_schema(db_file_name, &schema)); + struct ovsdb_schema *schema = read_schema(db_file_name); puts(schema->cksum); ovsdb_schema_destroy(schema); } +static struct raft_metadata +read_cluster_metadata(const char *filename) +{ + struct ovsdb_log *log; + check_ovsdb_error(ovsdb_log_open(filename, OVSDB_MAGIC"|"RAFT_MAGIC, + OVSDB_LOG_READ_ONLY, -1, &log)); + if (strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC)) { + ovs_fatal(0, "%s: not a clustered database", filename); + } + + struct raft_metadata md; + check_ovsdb_error(raft_read_metadata(log, &md)); + + ovsdb_log_close(log); + + return md; +} + static void -do_schema_version(struct ovs_cmdl_context *ctx) +do_db_cid(struct ovs_cmdl_context *ctx) +{ + const char *db_file_name = ctx->argv[1]; + struct raft_metadata md = read_cluster_metadata(db_file_name); + if (uuid_is_zero(&md.cid)) { + fprintf(stderr, "%s: cluster ID not yet known\n", db_file_name); + exit(2); + } + printf(UUID_FMT"\n", UUID_ARGS(&md.cid)); + raft_metadata_destroy(&md); +} + +static void +do_db_sid(struct ovs_cmdl_context *ctx) +{ + const char *db_file_name = ctx->argv[1]; + struct raft_metadata md = read_cluster_metadata(db_file_name); + printf(UUID_FMT"\n", UUID_ARGS(&md.sid)); + raft_metadata_destroy(&md); +} + +static void +do_db_local_address(struct ovs_cmdl_context *ctx) +{ + const char *db_file_name = ctx->argv[1]; + struct raft_metadata md = read_cluster_metadata(db_file_name); + puts(md.local); + raft_metadata_destroy(&md); +} + +static void +do_schema_name(struct ovs_cmdl_context *ctx) { const char *schema_file_name = ctx->argc >= 2 ? 
ctx->argv[1] : default_schema(); struct ovsdb_schema *schema; check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema)); - puts(schema->version); + puts(schema->name); ovsdb_schema_destroy(schema); } static void -do_schema_cksum(struct ovs_cmdl_context *ctx) +do_schema_version(struct ovs_cmdl_context *ctx) { const char *schema_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_schema(); struct ovsdb_schema *schema; check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema)); - puts(schema->cksum); + puts(schema->version); ovsdb_schema_destroy(schema); } static void -do_schema_name(struct ovs_cmdl_context *ctx) +do_schema_cksum(struct ovs_cmdl_context *ctx) { const char *schema_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_schema(); struct ovsdb_schema *schema; check_ovsdb_error(ovsdb_schema_from_file(schema_file_name, &schema)); - puts(schema->name); + puts(schema->cksum); ovsdb_schema_destroy(schema); } static void -transact(bool read_only, int argc, char *argv[]) +transact(struct ovs_cmdl_context *ctx, bool rw) { - const char *db_file_name = argc >= 3 ? argv[1] : default_db(); - const char *transaction = argv[argc - 1]; - struct json *request, *result; - struct ovsdb *db; - - check_ovsdb_error(ovsdb_file_open(db_file_name, read_only, &db, NULL)); + const char *db_file_name = ctx->argc >= 3 ? 
ctx->argv[1] : default_db(); + const char *transaction = ctx->argv[ctx->argc - 1]; - request = parse_json(transaction); - result = ovsdb_execute(db, NULL, request, false, rbac_role, NULL, 0, NULL); + struct ovsdb *ovsdb = ovsdb_file_read(db_file_name, rw); + struct json *request = parse_json(transaction); + struct json *result = ovsdb_execute(ovsdb, NULL, request, false, + rbac_role, NULL, 0, NULL); json_destroy(request); print_and_free_json(result); - ovsdb_destroy(db); + ovsdb_destroy(ovsdb); } static void do_query(struct ovs_cmdl_context *ctx) { - transact(true, ctx->argc, ctx->argv); + transact(ctx, false); } static void do_transact(struct ovs_cmdl_context *ctx) { - transact(false, ctx->argc, ctx->argv); + transact(ctx, true); } static void @@ -429,6 +618,7 @@ print_db_changes(struct shash *tables, struct shash *names, { struct shash_node *n1; + int i = 0; SHASH_FOR_EACH (n1, tables) { const char *table = n1->name; struct ovsdb_table_schema *table_schema; @@ -439,7 +629,11 @@ print_db_changes(struct shash *tables, struct shash *names, continue; } - table_schema = shash_find_data(&schema->tables, table); + if (i++ == 0) { + putchar('\n'); + } + + table_schema = schema ? shash_find_data(&schema->tables, table) : NULL; SHASH_FOR_EACH (n2, json_object(rows)) { const char *row_uuid = n2->name; struct json *columns = n2->data; @@ -536,19 +730,47 @@ print_db_changes(struct shash *tables, struct shash *names, } static void -do_show_log(struct ovs_cmdl_context *ctx) +print_change_record(const struct json *json, const struct ovsdb_schema *schema, + struct shash *names) { - const char *db_file_name = ctx->argc >= 2 ? 
ctx->argv[1] : default_db(); - struct shash names; - struct ovsdb_log *log; - struct ovsdb_schema *schema; - unsigned int i; + if (!json || json->type != JSON_OBJECT) { + return; + } - check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC, - OVSDB_LOG_READ_ONLY, -1, &log)); - shash_init(&names); - schema = NULL; - for (i = 0; ; i++) { + struct json *date, *comment; + + date = shash_find_data(json_object(json), "_date"); + if (date && date->type == JSON_INTEGER) { + long long int t = json_integer(date); + char *s; + + if (t < INT32_MAX) { + /* Older versions of ovsdb wrote timestamps in seconds. */ + t *= 1000; + } + + s = xastrftime_msec(" %Y-%m-%d %H:%M:%S.###", t, true); + fputs(s, stdout); + free(s); + } + + comment = shash_find_data(json_object(json), "_comment"); + if (comment && comment->type == JSON_STRING) { + printf(" \"%s\"", json_string(comment)); + } + + if (show_log_verbosity > 0) { + print_db_changes(json_object(json), names, schema); + } +} + +static void +do_show_log_standalone(struct ovsdb_log *log) +{ + struct shash names = SHASH_INITIALIZER(&names); + struct ovsdb_schema *schema = NULL; + + for (unsigned int i = 0; ; i++) { struct json *json; check_ovsdb_error(ovsdb_log_read(log, &json)); @@ -561,44 +783,639 @@ do_show_log(struct ovs_cmdl_context *ctx) check_ovsdb_error(ovsdb_schema_from_json(json, &schema)); printf(" \"%s\" schema, version=\"%s\", cksum=\"%s\"\n", schema->name, schema->version, schema->cksum); - } else if (json->type == JSON_OBJECT) { - struct json *date, *comment; + } else { + print_change_record(json, schema, &names); + } + json_destroy(json); + putchar('\n'); + } - date = shash_find_data(json_object(json), "_date"); - if (date && date->type == JSON_INTEGER) { - long long int t = json_integer(date); - char *s; + ovsdb_schema_destroy(schema); + /* XXX free 'names'. */ +} - if (t < INT32_MAX) { - /* Older versions of ovsdb wrote timestamps in seconds. 
*/ - t *= 1000; - } +static void +print_servers(const char *name, const struct json *servers) +{ + if (!servers) { + return; + } - s = xastrftime_msec(" %Y-%m-%d %H:%M:%S.###", t, true); - fputs(s, stdout); - free(s); - } + printf(" %s: ", name); - comment = shash_find_data(json_object(json), "_comment"); - if (comment && comment->type == JSON_STRING) { - printf(" \"%s\"", json_string(comment)); - } + const struct shash_node **nodes = shash_sort(json_object(servers)); + size_t n = shash_count(json_object(servers)); + for (size_t i = 0; i < n; i++) { + if (i > 0) { + printf(", "); + } + + const struct shash_node *node = nodes[i]; + printf("%.4s(", node->name); + + const struct json *address = node->data; + char *s = json_to_string(address, JSSF_SORT); + fputs(s, stdout); + free(s); + + putchar(')'); + } + free(nodes); + putchar('\n'); +} + +static void +print_data(const char *prefix, const struct json *data, + struct ovsdb_schema **schemap, struct shash *names) +{ + if (!data) { + return; + } + + if (json_array(data)->n != 2) { + printf(" ***invalid data***\n"); + return; + } + + const struct json *schema_json = json_array(data)->elems[0]; + if (schema_json->type != JSON_NULL) { + struct ovsdb_schema *schema; + + check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema)); + printf(" %sschema: \"%s\", version=\"%s\", cksum=\"%s\"\n", + prefix, schema->name, schema->version, schema->cksum); + + ovsdb_schema_destroy(*schemap); + *schemap = schema; + } + + print_change_record(json_array(data)->elems[1], *schemap, names); +} + +static void +print_raft_header(const struct raft_header *h, + struct ovsdb_schema **schemap, struct shash *names) +{ + printf(" name: \"%s\"\n", h->name); + printf(" local address: \"%s\"\n", h->local_address); + printf(" server_id: "SID_FMT"\n", SID_ARGS(&h->sid)); + if (!uuid_is_zero(&h->cid)) { + printf(" cluster_id: "CID_FMT"\n", CID_ARGS(&h->cid)); + } + if (!sset_is_empty(&h->remote_addresses)) { + printf(" remote_addresses:"); + + 
const char *s; + SSET_FOR_EACH (s, &h->remote_addresses) { + printf(" %s", s); + } + putchar('\n'); + } + if (h->snap_index) { + printf(" prev_index: %"PRIu64"\n", h->snap_index); + printf(" prev_term: %"PRIu64"\n", h->snap.term); + print_servers("prev_servers", h->snap.servers); + if (!uuid_is_zero(&h->snap.eid)) { + printf(" prev_eid: %04x\n", uuid_prefix(&h->snap.eid, 4)); + } + print_data("prev_", h->snap.data, schemap, names); + } +} + +static void +print_raft_record(const struct raft_record *r, + struct ovsdb_schema **schemap, struct shash *names) +{ + if (r->comment) { + printf(" comment: \"%s\"\n", r->comment); + } + if (r->term) { + printf(" term: %"PRIu64"\n", r->term); + } + + switch (r->type) { + case RAFT_REC_ENTRY: + printf(" index: %"PRIu64"\n", r->entry.index); + print_servers("servers", r->entry.servers); + if (!uuid_is_zero(&r->entry.eid)) { + printf(" eid: %04x\n", uuid_prefix(&r->entry.eid, 4)); + } + print_data("", r->entry.data, schemap, names); + break; + + case RAFT_REC_TERM: + break; + + case RAFT_REC_VOTE: + printf(" vote: "SID_FMT"\n", SID_ARGS(&r->sid)); + break; + + case RAFT_REC_NOTE: + printf(" note: \"%s\"\n", r->note); + break; + + case RAFT_REC_COMMIT_INDEX: + printf(" commit_index: %"PRIu64"\n", r->commit_index); + break; + + case RAFT_REC_LEADER: + printf(" leader: "SID_FMT"\n", SID_ARGS(&r->sid)); + break; + + default: + OVS_NOT_REACHED(); + } +} + +static void +do_show_log_cluster(struct ovsdb_log *log) +{ + struct shash names = SHASH_INITIALIZER(&names); + struct ovsdb_schema *schema = NULL; + unsigned int i; + + shash_init(&names); + schema = NULL; + for (i = 0; ; i++) { + struct json *json; + check_ovsdb_error(ovsdb_log_read(log, &json)); + if (!json) { + break; + } - if (i > 0 && show_log_verbosity > 0) { - putchar('\n'); - print_db_changes(json_object(json), &names, schema); + printf("record %u:\n", i); + struct ovsdb_error *error; + if (i == 0) { + struct raft_header h; + error = raft_header_from_json(&h, json); + if 
(!error) { + print_raft_header(&h, &schema, &names); + raft_header_uninit(&h); + } + } else { + struct raft_record r; + error = raft_record_from_json(&r, json); + if (!error) { + print_raft_record(&r, &schema, &names); + raft_record_uninit(&r); } } - json_destroy(json); + if (error) { + char *s = ovsdb_error_to_string_free(error); + puts(s); + free(s); + } + putchar('\n'); } - ovsdb_log_close(log); ovsdb_schema_destroy(schema); /* XXX free 'names'. */ } static void +do_show_log(struct ovs_cmdl_context *ctx) +{ + const char *db_file_name = ctx->argc >= 2 ? ctx->argv[1] : default_db(); + struct ovsdb_log *log; + + check_ovsdb_error(ovsdb_log_open(db_file_name, OVSDB_MAGIC"|"RAFT_MAGIC, + OVSDB_LOG_READ_ONLY, -1, &log)); + if (!strcmp(ovsdb_log_get_magic(log), OVSDB_MAGIC)) { + do_show_log_standalone(log); + } else { + do_show_log_cluster(log); + } + ovsdb_log_close(log); +} + +struct server { + const char *filename; + const char *nickname; + + struct raft_header header; + + struct raft_record *records; + size_t n_records; + + struct raft_entry *snap; + struct raft_entry *entries; + uint64_t log_start, log_end; +}; + +struct leader { + /* In struct cluster's 'leaders', indexed by 'term'. */ + struct hmap_node hmap_node; + + /* This structure indicates that in 'term', 'server' reported that 'leader' + * was elected leader. When 'log_end' is nonzero, it additionally + * indicates 'leader''s log_end at the time it was elected. */ + uint64_t term; + struct server *server; + struct uuid leader; + uint64_t log_end; +}; + +struct commit { + /* In struct cluster's 'commits', indexed by 'term'. */ + struct hmap_node hmap_node; + + /* This structure indicates that in 'term', 'server' reported the commit + * index as 'index'. */ + uint64_t term; + struct server *server; + uint64_t index; /* Commit index. */ +}; + +struct cluster { + struct server *servers; + size_t n_servers; + + struct hmap leaders; /* Contains 'struct leader's. 
*/ + + struct hmap commits; /* Contains 'struct commit's. */ +}; + +static const char * +get_server_name(const struct cluster *c, const struct uuid *sid, + char buf[SID_LEN + 1], size_t bufsize) +{ + for (size_t i = 0; i < c->n_servers; i++) { + struct server *s = &c->servers[i]; + if (uuid_equals(&s->header.sid, sid)) { + return s->filename; + } + } + + snprintf(buf, bufsize, SID_FMT, SID_ARGS(sid)); + return buf; +} + +static struct leader * +find_leader(struct cluster *c, uint64_t term) +{ + struct leader *leader; + HMAP_FOR_EACH_WITH_HASH (leader, hmap_node, hash_uint64(term), + &c->leaders) { + if (term == leader->term) { + return leader; + } + } + return NULL; +} + +/* Records that 'server' reported that 'leader' was elected leader in 'term'. + * + * Checks the Election Safety Property: at most one leader may be elected in a + * single term (see Figure 3.2). */ +static void +record_leader(struct cluster *c, uint64_t term, struct server *server, + const struct uuid *leader) +{ + bool server_is_leader = uuid_equals(&server->header.sid, leader); + struct leader *p = find_leader(c, term); + if (p) { + if (!uuid_equals(&p->leader, leader)) { + char buf1[SID_LEN + 1]; + char buf2[SID_LEN + 1]; + ovs_fatal(0, "term %"PRIu64" has two different leaders: " + "%s says that the leader is %s and " + "%s says that the leader is %s", + term, + p->server->filename, + get_server_name(c, &p->leader, buf1, sizeof buf1), + server->filename, + get_server_name(c, leader, buf2, sizeof buf2)); + } + if (server_is_leader && server->log_end > p->log_end) { + p->log_end = server->log_end; + } + } else { + p = xmalloc(sizeof *p); + hmap_insert(&c->leaders, &p->hmap_node, hash_uint64(term)); + p->term = term; + p->server = server; + p->leader = *leader; + if (server_is_leader) { + p->log_end = server->log_end; + } else { + p->log_end = 0; + } + } +} + +static struct commit * +find_commit(struct cluster *c, uint64_t term) +{ + struct commit *commit; + HMAP_FOR_EACH_WITH_HASH 
(commit, hmap_node, hash_uint64(term), + &c->commits) { + if (term == commit->term) { + return commit; + } + } + return NULL; +} + +static void +record_commit(struct cluster *c, uint64_t term, struct server *server, + uint64_t commit_index) +{ + struct commit *commit = find_commit(c, term); + if (commit) { + if (commit_index > commit->index) { + commit->server = server; + commit->index = commit_index; + } + } else { + commit = xmalloc(sizeof *commit); + hmap_insert(&c->commits, &commit->hmap_node, hash_uint64(term)); + commit->term = term; + commit->server = server; + commit->index = commit_index; + } +} + +static void +do_check_cluster(struct ovs_cmdl_context *ctx) +{ + struct cluster c = { + .servers = xzalloc((ctx->argc - 1) * sizeof *c.servers), + .n_servers = 0, + .leaders = HMAP_INITIALIZER(&c.leaders), + .commits = HMAP_INITIALIZER(&c.commits), + }; + + uint64_t min_term = UINT64_MAX; + uint64_t max_term = 0; + + for (int i = 1; i < ctx->argc; i++) { + struct server *s = &c.servers[c.n_servers]; + s->filename = ctx->argv[i]; + + struct ovsdb_log *log; + check_ovsdb_error(ovsdb_log_open(s->filename, RAFT_MAGIC, + OVSDB_LOG_READ_ONLY, -1, &log)); + + struct json *json; + check_ovsdb_error(ovsdb_log_read(log, &json)); + check_ovsdb_error(raft_header_from_json(&s->header, json)); + json_destroy(json); + + if (s->header.joining) { + printf("%s has not joined the cluster, omitting\n", s->filename); + continue; + } + if (c.n_servers > 0) { + struct server *s0 = &c.servers[0]; + if (!uuid_equals(&s0->header.cid, &s->header.cid)) { + ovs_fatal(0, "%s has cluster ID "CID_FMT" but %s " + "has cluster ID "CID_FMT, + s0->filename, CID_ARGS(&s0->header.cid), + s->filename, CID_ARGS(&s->header.cid)); + } + if (strcmp(s0->header.name, s->header.name)) { + ovs_fatal(0, "%s is named \"%s\" but %s is named \"%s\"", + s0->filename, s0->header.name, + s->filename, s->header.name); + } + } + s->snap = &s->header.snap; + s->log_start = s->log_end = s->header.snap_index + 1; + + 
size_t allocated_records = 0; + size_t allocated_entries = 0; + + uint64_t term = 0; /* Current term. */ + struct uuid vote = UUID_ZERO; /* Server 's''s vote in 'term'. */ + struct uuid leader = UUID_ZERO; /* Cluster leader in 'term'. */ + uint64_t leader_rec_idx = 0; /* Index of last "leader" record. */ + + uint64_t commit_index = s->header.snap_index; + + for (unsigned long long int rec_idx = 1; ; rec_idx++) { + if (s->n_records >= allocated_records) { + s->records = x2nrealloc(s->records, &allocated_records, + sizeof *s->records); + } + check_ovsdb_error(ovsdb_log_read(log, &json)); + if (!json) { + break; + } + struct raft_record *r = &s->records[s->n_records++]; + check_ovsdb_error(raft_record_from_json(r, json)); + json_destroy(json); + + if (r->term > term) { + term = r->term; + vote = UUID_ZERO; + leader = UUID_ZERO; + leader_rec_idx = 0; + } + if (term < min_term) { + min_term = term; + } + if (term > max_term) { + max_term = term; + } + + + switch (r->type) { + case RAFT_REC_ENTRY: + if (r->entry.index < commit_index) { + ovs_fatal(0, "%s: record %llu attempts to truncate log " + "from %"PRIu64" to %"PRIu64" entries, but " + "commit index is already %"PRIu64, + s->filename, rec_idx, + s->log_end, r->entry.index, + commit_index); + } else if (r->entry.index > s->log_end) { + ovs_fatal(0, "%s: record %llu with index %"PRIu64" skips " + "past expected index %"PRIu64, s->filename, + rec_idx, r->entry.index, s->log_end); + } + + if (r->entry.index < s->log_end) { + bool is_leader = uuid_equals(&s->header.sid, &leader); + if (is_leader) { + /* Leader Append-Only property (see Figure 3.2). */ + ovs_fatal(0, "%s: record %llu truncates log from " + "%"PRIu64" to %"PRIu64" entries while " + "server is leader", s->filename, rec_idx, + s->log_end, r->entry.index); + } else { + /* This can happen, but it is unusual. 
*/ + printf("%s: record %llu truncates log from %"PRIu64 + " to %"PRIu64" entries\n", s->filename, rec_idx, + s->log_end, r->entry.index); + } + s->log_end = r->entry.index; + } + + uint64_t prev_term = (s->log_end > s->log_start + ? s->entries[s->log_end + - s->log_start - 1].term + : s->snap->term); + if (r->term < prev_term) { + ovs_fatal(0, "%s: record %llu with index %"PRIu64" term " + "%"PRIu64" precedes previous entry's term " + "%"PRIu64, s->filename, rec_idx, + r->entry.index, r->term, prev_term); + } + + uint64_t log_idx = s->log_end++ - s->log_start; + if (log_idx >= allocated_entries) { + s->entries = x2nrealloc(s->entries, &allocated_entries, + sizeof *s->entries); + } + struct raft_entry *e = &s->entries[log_idx]; + e->term = r->term; + e->data = r->entry.data; + e->eid = r->entry.eid; + e->servers = r->entry.servers; + break; + + case RAFT_REC_TERM: + break; + + case RAFT_REC_VOTE: + if (r->term < term) { + ovs_fatal(0, "%s: record %llu votes for term %"PRIu64" " + "but current term is %"PRIu64, s->filename, + rec_idx, r->term, term); + } else if (!uuid_is_zero(&vote) + && !uuid_equals(&vote, &r->sid)) { + char buf1[SID_LEN + 1]; + char buf2[SID_LEN + 1]; + ovs_fatal(0, "%s: record %llu votes for %s in term " + "%"PRIu64" but a previous record for the " + "same term voted for %s", s->filename, + rec_idx, + get_server_name(&c, &r->sid, buf1, sizeof buf1), + r->term, + get_server_name(&c, &vote, buf2, sizeof buf2)); + } else { + vote = r->sid; + } + break; + + case RAFT_REC_NOTE: + if (!strcmp(r->note, "left")) { + printf("%s: record %llu shows that the server left the " + "cluster\n", s->filename, rec_idx); + } + break; + + case RAFT_REC_COMMIT_INDEX: + if (r->commit_index < commit_index) { + ovs_fatal(0, "%s: record %llu regresses commit index " + "from %"PRIu64 " to %"PRIu64, s->filename, + rec_idx, commit_index, r->commit_index); + } else if (r->commit_index >= s->log_end) { + ovs_fatal(0, "%s: record %llu advances commit index to " + "%"PRIu64 " 
but last log index is %"PRIu64, + s->filename, rec_idx, r->commit_index, + s->log_end - 1); + } else { + commit_index = r->commit_index; + } + + record_commit(&c, term, s, r->commit_index); + break; + + case RAFT_REC_LEADER: + if (!uuid_equals(&r->sid, &leader)) { + if (uuid_is_zero(&leader)) { + leader = r->sid; + leader_rec_idx = rec_idx; + } else { + char buf1[SID_LEN + 1]; + char buf2[SID_LEN + 1]; + ovs_fatal(0, "%s: record %llu reports leader %s " + "for term %"PRIu64" but record %llu " + "previously reported the leader as %s " + "in that term", + s->filename, rec_idx, + get_server_name(&c, &r->sid, + buf1, sizeof buf1), + term, leader_rec_idx, + get_server_name(&c, &leader, + buf2, sizeof buf2)); + } + } + record_leader(&c, term, s, &r->sid); + break; + } + } + + c.n_servers++; + } + + /* Check the Leader Completeness property from Figure 3.2: If a log entry + * is committed in a given term, then that entry will be present in the + * logs of the leaders for all higher-numbered terms. */ + if (min_term == UINT64_MAX || max_term == 0) { + ovs_fatal(0, "all logs are empty"); + } + struct commit *commit = NULL; + for (uint64_t term = min_term; term <= max_term; term++) { + struct leader *leader = find_leader(&c, term); + if (leader && commit && commit->index >= leader->log_end) { + ovs_fatal(0, "leader %s for term %"PRIu64" has log entries only " + "up to index %"PRIu64", but index %"PRIu64" was " + "committed in a previous term (e.g. by %s)", + leader->server->filename, term, leader->log_end - 1, + commit->index, commit->server->filename); + } + + struct commit *next = find_commit(&c, term); + if (next && (!commit || next->index > commit->index)) { + commit = next; + } + } + + /* Section 3.5: Check the Log Matching Property in Figure 3.2: + * + * - If two entries in different logs have the same index and term, then + * they store the same command. 
+ * + * - If two entries in different logs have the same index and term, then + * the logs are identical in all preceding entries. + */ + for (size_t i = 0; i < c.n_servers; i++) { + for (size_t j = 0; j < c.n_servers; j++) { + struct server *a = &c.servers[i]; + struct server *b = &c.servers[j]; + + if (a == b) { + continue; + } + + bool must_equal = false; + for (uint64_t idx = MIN(a->log_end, b->log_end) - 1; + idx >= MAX(a->log_start, b->log_start); + idx--) { + const struct raft_entry *ae = &a->entries[idx - a->log_start]; + const struct raft_entry *be = &b->entries[idx - b->log_start]; + if (ae->term == be->term) { + must_equal = true; + } + if (!must_equal || raft_entry_equals(ae, be)) { + continue; + } + char *as = json_to_string(raft_entry_to_json(ae), JSSF_SORT); + char *bs = json_to_string(raft_entry_to_json(be), JSSF_SORT); + ovs_fatal(0, "log entries with index %"PRIu64" differ:\n" + "%s has %s\n" + "%s has %s\n", + idx, a->filename, as, b->filename, bs); + } + } + + } +} + +static void do_help(struct ovs_cmdl_context *ctx OVS_UNUSED) { usage(); @@ -612,18 +1429,25 @@ do_list_commands(struct ovs_cmdl_context *ctx OVS_UNUSED) static const struct ovs_cmdl_command all_commands[] = { { "create", "[db [schema]]", 0, 2, do_create, OVS_RW }, + { "create-cluster", "db contents local", 3, 3, do_create_cluster, OVS_RW }, + { "join-cluster", "db name local remote...", 4, INT_MAX, do_join_cluster, + OVS_RW }, { "compact", "[db [dst]]", 0, 2, do_compact, OVS_RW }, { "convert", "[db [schema [dst]]]", 0, 3, do_convert, OVS_RW }, { "needs-conversion", NULL, 0, 2, do_needs_conversion, OVS_RO }, { "db-name", "[db]", 0, 1, do_db_name, OVS_RO }, { "db-version", "[db]", 0, 1, do_db_version, OVS_RO }, { "db-cksum", "[db]", 0, 1, do_db_cksum, OVS_RO }, + { "db-cid", "db", 1, 1, do_db_cid, OVS_RO }, + { "db-sid", "db", 1, 1, do_db_sid, OVS_RO }, + { "db-local-address", "db", 1, 1, do_db_local_address, OVS_RO }, { "schema-name", "[schema]", 0, 1, do_schema_name, OVS_RO }, { 
"schema-version", "[schema]", 0, 1, do_schema_version, OVS_RO }, { "schema-cksum", "[schema]", 0, 1, do_schema_cksum, OVS_RO }, { "query", "[db] trns", 1, 2, do_query, OVS_RO }, { "transact", "[db] trns", 1, 2, do_transact, OVS_RO }, { "show-log", "[db]", 0, 1, do_show_log, OVS_RO }, + { "check-cluster", "db...", 1, INT_MAX, do_check_cluster, OVS_RO }, { "help", NULL, 0, INT_MAX, do_help, OVS_RO }, { "list-commands", NULL, 0, INT_MAX, do_list_commands, OVS_RO }, { NULL, NULL, 0, 0, NULL, OVS_RO }, diff --git a/ovsdb/ovsdb-util.c b/ovsdb/ovsdb-util.c index 06d25af49a18..6757e92ec0b4 100644 --- a/ovsdb/ovsdb-util.c +++ b/ovsdb/ovsdb-util.c @@ -22,7 +22,7 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_util); -static void +void ovsdb_util_clear_column(struct ovsdb_row *row, const char *column_name) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); @@ -235,6 +235,15 @@ ovsdb_util_write_singleton(struct ovsdb_row *row, const char *column_name, } void +ovsdb_util_write_integer_column(struct ovsdb_row *row, + const char *column_name, + long long int integer) +{ + const union ovsdb_atom atom = { .integer = integer }; + ovsdb_util_write_singleton(row, column_name, &atom, OVSDB_TYPE_INTEGER); +} + +void ovsdb_util_write_bool_column(struct ovsdb_row *row, const char *column_name, bool value) { diff --git a/ovsdb/ovsdb-util.h b/ovsdb/ovsdb-util.h index a0404a3a7ff0..992a7442cb22 100644 --- a/ovsdb/ovsdb-util.h +++ b/ovsdb/ovsdb-util.h @@ -17,6 +17,7 @@ #define OVSDB_UTIL_H 1 /* Database access utility functions. 
*/ +void ovsdb_util_clear_column(struct ovsdb_row *, const char *column_name); struct ovsdb_datum *ovsdb_util_get_datum(struct ovsdb_row *row, const char *column_name, const enum ovsdb_atomic_type keytype, @@ -35,6 +36,9 @@ const union ovsdb_atom *ovsdb_util_read_column(const struct ovsdb_row *row, bool ovsdb_util_read_integer_column(const struct ovsdb_row *row, const char *column_name, long long int *integerp); +void ovsdb_util_write_integer_column(struct ovsdb_row *row, + const char *column_name, + long long int integer); bool ovsdb_util_read_string_column(const struct ovsdb_row *row, const char *column_name, const char **stringp); diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 89f530bcccfb..2ee2e51b4b80 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,10 +25,14 @@ #include "ovsdb-parser.h" #include "ovsdb-types.h" #include "simap.h" +#include "storage.h" #include "table.h" #include "transaction.h" #include "trigger.h" +#include "openvswitch/vlog.h" +VLOG_DEFINE_THIS_MODULE(ovsdb); + struct ovsdb_schema * ovsdb_schema_create(const char *name, const char *version, const char *cksum) { @@ -310,6 +314,61 @@ ovsdb_schema_equal(const struct ovsdb_schema *a, return equals; } + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_schema_check_for_ephemeral_columns(const struct ovsdb_schema *schema) +{ + struct shash_node *node; + SHASH_FOR_EACH (node, &schema->tables) { + struct ovsdb_table_schema *table = node->data; + struct shash_node *node2; + + SHASH_FOR_EACH (node2, &table->columns) { + struct ovsdb_column *column = node2->data; + + if (column->index >= OVSDB_N_STD_COLUMNS && !column->persistent) { + return ovsdb_syntax_error( + NULL, NULL, "Table %s column %s is ephemeral but " + "clustered databases do not support ephemeral columns.", + table->name, column->name); + } + } + } + return NULL; +} + +void +ovsdb_schema_persist_ephemeral_columns(struct ovsdb_schema *schema, + const char *filename) +{ + int n = 0; + const char *example_table = NULL; + const char *example_column = NULL; + + struct shash_node *node; + SHASH_FOR_EACH (node, &schema->tables) { + struct ovsdb_table_schema *table = node->data; + struct shash_node *node2; + + SHASH_FOR_EACH (node2, &table->columns) { + struct ovsdb_column *column = node2->data; + + if (column->index >= OVSDB_N_STD_COLUMNS && !column->persistent) { + column->persistent = true; + example_table = table->name; + example_column = column->name; + n++; + } + } + } + + if (n) { + VLOG_WARN("%s: changed %d columns in '%s' database from ephemeral to " + "persistent, including '%s' column in '%s' table, because " + "clusters do not support ephemeral columns", + filename, n, schema->name, example_column, example_table); + } +} static void ovsdb_set_ref_table(const struct shash 
*tables, @@ -323,35 +382,41 @@ ovsdb_set_ref_table(const struct shash *tables, } } +/* XXX add prereq parameter? */ struct ovsdb * -ovsdb_create(struct ovsdb_schema *schema) +ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage) { struct shash_node *node; struct ovsdb *db; - db = xmalloc(sizeof *db); + db = xzalloc(sizeof *db); + db->name = xstrdup(schema + ? schema->name + : ovsdb_storage_get_name(storage)); db->schema = schema; - db->file = NULL; + db->storage = storage; ovs_list_init(&db->monitors); ovs_list_init(&db->triggers); db->run_triggers = false; shash_init(&db->tables); - SHASH_FOR_EACH (node, &schema->tables) { - struct ovsdb_table_schema *ts = node->data; - shash_add(&db->tables, node->name, ovsdb_table_create(ts)); - } + if (schema) { + SHASH_FOR_EACH (node, &schema->tables) { + struct ovsdb_table_schema *ts = node->data; + shash_add(&db->tables, node->name, ovsdb_table_create(ts)); + } - /* Set all the refTables. */ - SHASH_FOR_EACH (node, &schema->tables) { - struct ovsdb_table_schema *table = node->data; - struct shash_node *node2; + /* Set all the refTables. */ + SHASH_FOR_EACH (node, &schema->tables) { + struct ovsdb_table_schema *table = node->data; + struct shash_node *node2; - SHASH_FOR_EACH (node2, &table->columns) { - struct ovsdb_column *column = node2->data; + SHASH_FOR_EACH (node2, &table->columns) { + struct ovsdb_column *column = node2->data; - ovsdb_set_ref_table(&db->tables, &column->type.key); - ovsdb_set_ref_table(&db->tables, &column->type.value); + ovsdb_set_ref_table(&db->tables, &column->type.key); + ovsdb_set_ref_table(&db->tables, &column->type.value); + } } } @@ -362,38 +427,13 @@ ovsdb_create(struct ovsdb_schema *schema) } void -ovsdb_replace(struct ovsdb *dst, struct ovsdb *src) -{ - /* Cancel monitors. */ - ovsdb_monitor_prereplace_db(dst); - - /* Cancel triggers. 
*/ - struct ovsdb_trigger *trigger, *next; - LIST_FOR_EACH_SAFE (trigger, next, node, &dst->triggers) { - ovsdb_trigger_prereplace_db(trigger); - } - - struct ovsdb_schema *tmp_schema = dst->schema; - dst->schema = src->schema; - src->schema = tmp_schema; - - shash_swap(&dst->tables, &src->tables); - - dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role"); - - ovsdb_destroy(src); -} - -void ovsdb_destroy(struct ovsdb *db) { if (db) { struct shash_node *node; /* Close the log. */ - if (db->file) { - ovsdb_file_destroy(db->file); - } + ovsdb_storage_close(db->storage); /* Remove all the monitors. */ ovsdb_monitors_remove(db); @@ -408,9 +448,12 @@ ovsdb_destroy(struct ovsdb *db) /* The schemas, but not the table that points to them, were deleted in * the previous step, so we need to clear out the table. We can't * destroy the table, because ovsdb_schema_destroy() will do that. */ - shash_clear(&db->schema->tables); + if (db->schema) { + shash_clear(&db->schema->tables); + ovsdb_schema_destroy(db->schema); + } - ovsdb_schema_destroy(db->schema); + free(db->name); free(db); } } @@ -420,6 +463,10 @@ ovsdb_destroy(struct ovsdb *db) void ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage) { + if (!db->schema) { + return; + } + const struct shash_node *node; unsigned int cells = 0; @@ -439,3 +486,42 @@ ovsdb_get_table(const struct ovsdb *db, const char *name) { return shash_find_data(&db->tables, name); } + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_snapshot(struct ovsdb *db) +{ + if (!db->storage) { + return NULL; + } + + struct json *schema = ovsdb_schema_to_json(db->schema); + struct json *data = ovsdb_to_txn_json(db, "compacting database online"); + struct ovsdb_error *error = ovsdb_storage_store_snapshot(db->storage, + schema, data); + json_destroy(schema); + json_destroy(data); + return error; +} + +void +ovsdb_replace(struct ovsdb *dst, struct ovsdb *src) +{ + /* Cancel monitors. */ + ovsdb_monitor_prereplace_db(dst); + + /* Cancel triggers. 
*/ + struct ovsdb_trigger *trigger, *next; + LIST_FOR_EACH_SAFE (trigger, next, node, &dst->triggers) { + ovsdb_trigger_prereplace_db(trigger); + } + + struct ovsdb_schema *tmp_schema = dst->schema; + dst->schema = src->schema; + src->schema = tmp_schema; + + shash_swap(&dst->tables, &src->tables); + + dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role"); + + ovsdb_destroy(src); +} diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index c3e8f2091e35..c5dad832c45d 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,13 +20,13 @@ #include "openvswitch/hmap.h" #include "openvswitch/list.h" #include "openvswitch/shash.h" +#include "openvswitch/uuid.h" struct json; struct ovsdb_log; struct ovsdb_session; struct ovsdb_txn; struct simap; -struct uuid; /* Database schema. */ struct ovsdb_schema { @@ -52,11 +52,23 @@ struct json *ovsdb_schema_to_json(const struct ovsdb_schema *); bool ovsdb_schema_equal(const struct ovsdb_schema *, const struct ovsdb_schema *); + +struct ovsdb_error *ovsdb_schema_check_for_ephemeral_columns( + const struct ovsdb_schema *) OVS_WARN_UNUSED_RESULT; +void ovsdb_schema_persist_ephemeral_columns(struct ovsdb_schema *, + const char *filename); /* Database. */ +enum ovsdb_state { + OVSDB_LOADING, + OVSDB_RUNNING +}; + struct ovsdb { + char *name; struct ovsdb_schema *schema; - struct ovsdb_file *file; /* If nonnull, log for transactions. */ + struct ovsdb_storage *storage; /* If nonnull, log for transactions. */ + struct uuid prereq; struct ovs_list monitors; /* Contains "struct ovsdb_monitor"s. */ struct shash tables; /* Contains "struct ovsdb_table *"s. 
*/ @@ -67,18 +79,27 @@ struct ovsdb { struct ovsdb_table *rbac_role; }; -struct ovsdb *ovsdb_create(struct ovsdb_schema *); -void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src); +struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *); void ovsdb_destroy(struct ovsdb *); void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage); struct ovsdb_table *ovsdb_get_table(const struct ovsdb *, const char *); +struct ovsdb_txn *ovsdb_execute_compose( + struct ovsdb *, const struct ovsdb_session *, const struct json *params, + bool read_only, const char *role, const char *id, + long long int elapsed_msec, long long int *timeout_msec, + bool *durable, struct json **); + struct json *ovsdb_execute(struct ovsdb *, const struct ovsdb_session *, const struct json *params, bool read_only, const char *role, const char *id, long long int elapsed_msec, long long int *timeout_msec); +struct ovsdb_error *ovsdb_snapshot(struct ovsdb *) OVS_WARN_UNUSED_RESULT; + +void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src); + #endif /* ovsdb/ovsdb.h */ diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c new file mode 100644 index 000000000000..457d1292a949 --- /dev/null +++ b/ovsdb/raft-private.c @@ -0,0 +1,735 @@ +/* + * Copyright (c) 2014, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "raft-private.h" + +#include "openvswitch/dynamic-string.h" +#include "ovsdb-error.h" +#include "ovsdb-parser.h" +#include "socket-util.h" +#include "sset.h" + +/* Addresses of Raft servers. */ + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_address_validate(const char *address) +{ + if (!strncmp(address, "unix:", 5)) { + return NULL; + } else if (!strncmp(address, "ssl:", 4) || !strncmp(address, "tcp:", 4)) { + struct sockaddr_storage ss; + if (!inet_parse_active(address + 4, 0, &ss)) { + return ovsdb_error(NULL, "%s: syntax error in address", address); + } + return NULL; + } else { + return ovsdb_error(NULL, "%s: expected \"tcp\" or \"ssl\" address", + address); + } +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_address_validate_json(const struct json *address) +{ + if (address->type != JSON_STRING) { + return ovsdb_syntax_error(address, NULL, + "server address is not string"); + } + return raft_address_validate(json_string(address)); +} + +char * +raft_address_to_nickname(const char *address, const struct uuid *sid) +{ + if (!strncmp(address, "unix:", 5)) { + const char *p = address + 5; + + const char *slash = strrchr(p, '/'); + if (slash) { + p = slash + 1; + } + + int len = strcspn(p, "."); + if (len) { + return xmemdup0(p, len); + } + } + + return xasprintf(SID_FMT, SID_ARGS(sid)); +} + +/* Sets of Raft server addresses. 
*/ + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_addresses_from_json(const struct json *json, struct sset *addresses) +{ + sset_init(addresses); + + const struct json_array *array = json_array(json); + if (!array->n) { + return ovsdb_syntax_error(json, NULL, + "at least one remote address is required"); + } + for (size_t i = 0; i < array->n; i++) { + const struct json *address = array->elems[i]; + struct ovsdb_error *error = raft_address_validate_json(address); + if (error) { + sset_destroy(addresses); + sset_init(addresses); + return error; + } + sset_add(addresses, json_string(address)); + } + return NULL; +} + +struct json * +raft_addresses_to_json(const struct sset *sset) +{ + struct json *array; + const char *s; + + array = json_array_create_empty(); + SSET_FOR_EACH (s, sset) { + json_array_add(array, json_string_create(s)); + } + return array; +} + +/* raft_server. */ + +const char * +raft_server_phase_to_string(enum raft_server_phase phase) +{ + switch (phase) { + case RAFT_PHASE_STABLE: return "stable"; + case RAFT_PHASE_CATCHUP: return "adding: catchup"; + case RAFT_PHASE_CAUGHT_UP: return "adding: caught up"; + case RAFT_PHASE_COMMITTING: return "adding: committing"; + case RAFT_PHASE_REMOVE: return "removing"; + default: return ""; + } +} + +void +raft_server_destroy(struct raft_server *s) +{ + if (s) { + free(s->address); + free(s->nickname); + free(s); + } +} + +void +raft_servers_destroy(struct hmap *servers) +{ + struct raft_server *s, *next; + HMAP_FOR_EACH_SAFE (s, next, hmap_node, servers) { + hmap_remove(servers, &s->hmap_node); + raft_server_destroy(s); + } + hmap_destroy(servers); +} + +struct raft_server * +raft_server_add(struct hmap *servers, const struct uuid *sid, + const char *address) +{ + struct raft_server *s = xzalloc(sizeof *s); + s->sid = *sid; + s->address = xstrdup(address); + s->nickname = raft_address_to_nickname(address, sid); + s->phase = RAFT_PHASE_STABLE; + hmap_insert(servers, &s->hmap_node, uuid_hash(sid)); + return 
s; +} + + +struct raft_server * +raft_server_find(const struct hmap *servers, const struct uuid *sid) +{ + struct raft_server *s; + HMAP_FOR_EACH_IN_BUCKET (s, hmap_node, uuid_hash(sid), servers) { + if (uuid_equals(sid, &s->sid)) { + return s; + } + } + return NULL; +} + +const char * +raft_servers_get_nickname__(const struct hmap *servers, const struct uuid *sid) +{ + const struct raft_server *s = raft_server_find(servers, sid); + return s ? s->nickname : NULL; +} + +const char * +raft_servers_get_nickname(const struct hmap *servers, + const struct uuid *sid, + char buf[SID_LEN + 1], size_t bufsize) +{ + const char *s = raft_servers_get_nickname__(servers, sid); + if (s) { + return s; + } + snprintf(buf, bufsize, SID_FMT, SID_ARGS(sid)); + return buf; +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_servers_from_json__(const struct json *json, struct hmap *servers) +{ + if (!json || json->type != JSON_OBJECT) { + return ovsdb_syntax_error(json, NULL, "servers must be JSON object"); + } else if (shash_is_empty(json_object(json))) { + return ovsdb_syntax_error(json, NULL, "must have at least one server"); + } + + /* Parse new servers. */ + struct shash_node *node; + SHASH_FOR_EACH (node, json_object(json)) { + /* Parse server UUID. 
*/ + struct uuid sid; + if (!uuid_from_string(&sid, node->name)) { + return ovsdb_syntax_error(json, NULL, "%s is a not a UUID", + node->name); + } + + const struct json *address = node->data; + struct ovsdb_error *error = raft_address_validate_json(address); + if (error) { + return error; + } + + raft_server_add(servers, &sid, json_string(address)); + } + + return NULL; +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_servers_from_json(const struct json *json, struct hmap *servers) +{ + hmap_init(servers); + struct ovsdb_error *error = raft_servers_from_json__(json, servers); + if (error) { + raft_servers_destroy(servers); + } + return error; +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_servers_validate_json(const struct json *json) +{ + struct hmap servers = HMAP_INITIALIZER(&servers); + struct ovsdb_error *error = raft_servers_from_json__(json, &servers); + raft_servers_destroy(&servers); + return error; +} + +struct json * +raft_servers_to_json(const struct hmap *servers) +{ + struct json *json = json_object_create(); + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, servers) { + char sid_s[UUID_LEN + 1]; + sprintf(sid_s, UUID_FMT, UUID_ARGS(&s->sid)); + json_object_put_string(json, sid_s, s->address); + } + return json; +} + +void +raft_servers_format(const struct hmap *servers, struct ds *ds) +{ + int i = 0; + const struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, servers) { + if (i++) { + ds_put_cstr(ds, ", "); + } + ds_put_format(ds, SID_FMT"(%s)", SID_ARGS(&s->sid), s->address); + } +} + +/* Raft log entries. 
*/ + +void +raft_entry_clone(struct raft_entry *dst, const struct raft_entry *src) +{ + dst->term = src->term; + dst->data = json_nullable_clone(src->data); + dst->eid = src->eid; + dst->servers = json_nullable_clone(src->servers); +} + +void +raft_entry_uninit(struct raft_entry *e) +{ + if (e) { + json_destroy(e->data); + json_destroy(e->servers); + } +} + +struct json * +raft_entry_to_json(const struct raft_entry *e) +{ + struct json *json = json_object_create(); + raft_put_uint64(json, "term", e->term); + if (e->data) { + json_object_put(json, "data", json_clone(e->data)); + json_object_put_format(json, "eid", UUID_FMT, UUID_ARGS(&e->eid)); + } + if (e->servers) { + json_object_put(json, "servers", json_clone(e->servers)); + } + return json; +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_entry_from_json(struct json *json, struct raft_entry *e) +{ + memset(e, 0, sizeof *e); + + struct ovsdb_parser p; + ovsdb_parser_init(&p, json, "raft log entry"); + e->term = raft_parse_required_uint64(&p, "term"); + e->data = json_nullable_clone( + ovsdb_parser_member(&p, "data", OP_OBJECT | OP_ARRAY | OP_OPTIONAL)); + e->eid = e->data ? 
raft_parse_required_uuid(&p, "eid") : UUID_ZERO; + e->servers = json_nullable_clone( + ovsdb_parser_member(&p, "servers", OP_OBJECT | OP_OPTIONAL)); + if (e->servers) { + ovsdb_parser_put_error(&p, raft_servers_validate_json(e->servers)); + } + + struct ovsdb_error *error = ovsdb_parser_finish(&p); + if (error) { + raft_entry_uninit(e); + } + return error; +} + +bool +raft_entry_equals(const struct raft_entry *a, const struct raft_entry *b) +{ + return (a->term == b->term + && json_equal(a->data, b->data) + && uuid_equals(&a->eid, &b->eid) + && json_equal(a->servers, b->servers)); +} + +void +raft_header_uninit(struct raft_header *h) +{ + if (!h) { + return; + } + + free(h->name); + free(h->local_address); + sset_destroy(&h->remote_addresses); + raft_entry_uninit(&h->snap); +} + +static void +raft_header_from_json__(struct raft_header *h, struct ovsdb_parser *p) +{ + /* Parse always-required fields. */ + h->sid = raft_parse_required_uuid(p, "server_id"); + h->name = nullable_xstrdup(raft_parse_required_string(p, "name")); + h->local_address = nullable_xstrdup( + raft_parse_required_string(p, "local_address")); + + /* Parse "remotes", if present. + * + * If this is present, then this database file is for the special case of a + * server that was created with "ovsdb-tool join-cluster" and has not yet + * joined its cluster, */ + const struct json *remote_addresses + = ovsdb_parser_member(p, "remote_addresses", OP_ARRAY | OP_OPTIONAL); + h->joining = remote_addresses != NULL; + if (h->joining) { + struct ovsdb_error *error = raft_addresses_from_json( + remote_addresses, &h->remote_addresses); + if (error) { + ovsdb_parser_put_error(p, error); + } else if (sset_find_and_delete(&h->remote_addresses, h->local_address) + && sset_is_empty(&h->remote_addresses)) { + ovsdb_parser_raise_error(p, "at least one remote address (other " + "than the local address) is required"); + } + } else { + /* The set of servers is mandatory. 
*/ + h->snap.servers = json_nullable_clone( + ovsdb_parser_member(p, "prev_servers", OP_OBJECT)); + if (h->snap.servers) { + ovsdb_parser_put_error(p, raft_servers_validate_json( + h->snap.servers)); + } + + /* Term, index, and snapshot are optional, but if any of them is + * present, all of them must be. */ + h->snap_index = raft_parse_optional_uint64(p, "prev_index"); + if (h->snap_index) { + h->snap.data = json_nullable_clone( + ovsdb_parser_member(p, "prev_data", OP_ANY)); + h->snap.eid = raft_parse_required_uuid(p, "prev_eid"); + h->snap.term = raft_parse_required_uint64(p, "prev_term"); + } + } + + /* Parse cluster ID. If we're joining a cluster, this is optional, + * otherwise it is mandatory. */ + raft_parse_uuid__(p, "cluster_id", h->joining, &h->cid); +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_header_from_json(struct raft_header *h, const struct json *json) +{ + struct ovsdb_parser p; + ovsdb_parser_init(&p, json, "raft header"); + memset(h, 0, sizeof *h); + sset_init(&h->remote_addresses); + raft_header_from_json__(h, &p); + struct ovsdb_error *error = ovsdb_parser_finish(&p); + if (error) { + raft_header_uninit(h); + } + return error; +} + +struct json * +raft_header_to_json(const struct raft_header *h) +{ + struct json *json = json_object_create(); + + json_object_put_format(json, "server_id", UUID_FMT, UUID_ARGS(&h->sid)); + if (!uuid_is_zero(&h->cid)) { + json_object_put_format(json, "cluster_id", + UUID_FMT, UUID_ARGS(&h->cid)); + } + json_object_put_string(json, "local_address", h->local_address); + json_object_put_string(json, "name", h->name); + + if (!sset_is_empty(&h->remote_addresses)) { + json_object_put(json, "remote_addresses", + raft_addresses_to_json(&h->remote_addresses)); + } + + if (h->snap.servers) { + json_object_put(json, "prev_servers", json_clone(h->snap.servers)); + } + if (h->snap_index) { + raft_put_uint64(json, "prev_index", h->snap_index); + raft_put_uint64(json, "prev_term", h->snap.term); + if (h->snap.data) { 
+ json_object_put(json, "prev_data", json_clone(h->snap.data)); + } + json_object_put_format(json, "prev_eid", + UUID_FMT, UUID_ARGS(&h->snap.eid)); + } + + return json; +} + +void +raft_record_uninit(struct raft_record *r) +{ + if (!r) { + return; + } + + free(r->comment); + + switch (r->type) { + case RAFT_REC_ENTRY: + json_destroy(r->entry.data); + json_destroy(r->entry.servers); + break; + + case RAFT_REC_NOTE: + free(r->note); + break; + + case RAFT_REC_TERM: + case RAFT_REC_VOTE: + case RAFT_REC_COMMIT_INDEX: + case RAFT_REC_LEADER: + break; + } +} + +static void +raft_record_from_json__(struct raft_record *r, struct ovsdb_parser *p) +{ + r->comment = nullable_xstrdup(raft_parse_optional_string(p, "comment")); + + /* Parse "note". */ + const char *note = raft_parse_optional_string(p, "note"); + if (note) { + r->type = RAFT_REC_NOTE; + r->term = 0; + r->note = xstrdup(note); + return; + } + + /* Parse "commit_index". */ + r->commit_index = raft_parse_optional_uint64(p, "commit_index"); + if (r->commit_index) { + r->type = RAFT_REC_COMMIT_INDEX; + r->term = 0; + return; + } + + /* All remaining types of log records include "term", plus at most one of: + * + * - "index" plus zero or more of "data" and "servers". If "data" is + * present then "eid" may also be present. + * + * - "vote". + * + * - "leader". + */ + + /* Parse "term". + * + * A Raft leader can replicate entries from previous terms to the other + * servers in the cluster, retaining the original terms on those entries + * (see section 3.6.2 "Committing entries from previous terms" for more + * information), so it's OK for the term in a log record to precede the + * current term. */ + r->term = raft_parse_required_uint64(p, "term"); + + /* Parse "leader". */ + if (raft_parse_optional_uuid(p, "leader", &r->sid)) { + r->type = RAFT_REC_LEADER; + if (uuid_is_zero(&r->sid)) { + ovsdb_parser_raise_error(p, "record says leader is all-zeros SID"); + } + return; + } + + /* Parse "vote". 
*/ + if (raft_parse_optional_uuid(p, "vote", &r->sid)) { + r->type = RAFT_REC_VOTE; + if (uuid_is_zero(&r->sid)) { + ovsdb_parser_raise_error(p, "record votes for all-zeros SID"); + } + return; + } + + /* If "index" is present parse the rest of the entry, otherwise it's just a + * term update. */ + r->entry.index = raft_parse_optional_uint64(p, "index"); + if (!r->entry.index) { + r->type = RAFT_REC_TERM; + } else { + r->type = RAFT_REC_ENTRY; + r->entry.servers = json_nullable_clone( + ovsdb_parser_member(p, "servers", OP_OBJECT | OP_OPTIONAL)); + if (r->entry.servers) { + ovsdb_parser_put_error( + p, raft_servers_validate_json(r->entry.servers)); + } + r->entry.data = json_nullable_clone( + ovsdb_parser_member(p, "data", + OP_OBJECT | OP_ARRAY | OP_OPTIONAL)); + r->entry.eid = (r->entry.data + ? raft_parse_required_uuid(p, "eid") + : UUID_ZERO); + } +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_record_from_json(struct raft_record *r, const struct json *json) +{ + struct ovsdb_parser p; + ovsdb_parser_init(&p, json, "raft log record"); + raft_record_from_json__(r, &p); + struct ovsdb_error *error = ovsdb_parser_finish(&p); + if (error) { + raft_record_uninit(r); + } + return error; +} + +struct json * +raft_record_to_json(const struct raft_record *r) +{ + struct json *json = json_object_create(); + + if (r->comment && *r->comment) { + json_object_put_string(json, "comment", r->comment); + } + + switch (r->type) { + case RAFT_REC_ENTRY: + raft_put_uint64(json, "term", r->term); + raft_put_uint64(json, "index", r->entry.index); + if (r->entry.data) { + json_object_put(json, "data", json_clone(r->entry.data)); + } + if (r->entry.servers) { + json_object_put(json, "servers", json_clone(r->entry.servers)); + } + if (!uuid_is_zero(&r->entry.eid)) { + json_object_put_format(json, "eid", + UUID_FMT, UUID_ARGS(&r->entry.eid)); + } + break; + + case RAFT_REC_TERM: + raft_put_uint64(json, "term", r->term); + break; + + case RAFT_REC_VOTE: + raft_put_uint64(json, 
"term", r->term); + json_object_put_format(json, "vote", UUID_FMT, UUID_ARGS(&r->sid)); + break; + + case RAFT_REC_NOTE: + json_object_put(json, "note", json_string_create(r->note)); + break; + + case RAFT_REC_COMMIT_INDEX: + raft_put_uint64(json, "commit_index", r->commit_index); + break; + + case RAFT_REC_LEADER: + raft_put_uint64(json, "term", r->term); + json_object_put_format(json, "leader", UUID_FMT, UUID_ARGS(&r->sid)); + break; + + default: + OVS_NOT_REACHED(); + } + return json; +} + +/* Puts 'integer' into JSON 'object' with the given 'name'. + * + * The OVS JSON implementation only supports integers in the range + * INT64_MIN...INT64_MAX, which causes trouble for values from INT64_MAX+1 to + * UINT64_MAX. We map those into the negative range. */ +void +raft_put_uint64(struct json *object, const char *name, uint64_t integer) +{ + json_object_put(object, name, json_integer_create(integer)); +} + +/* Parses an integer from parser 'p' with the given 'name'. + * + * The OVS JSON implementation only supports integers in the range + * INT64_MIN...INT64_MAX, which causes trouble for values from INT64_MAX+1 to + * UINT64_MAX. We map the negative range back into positive numbers. */ +static uint64_t +raft_parse_uint64__(struct ovsdb_parser *p, const char *name, bool optional) +{ + enum ovsdb_parser_types types = OP_INTEGER | (optional ? OP_OPTIONAL : 0); + const struct json *json = ovsdb_parser_member(p, name, types); + return json ? json_integer(json) : 0; +} + +uint64_t +raft_parse_optional_uint64(struct ovsdb_parser *p, const char *name) +{ + return raft_parse_uint64__(p, name, true); +} + +uint64_t +raft_parse_required_uint64(struct ovsdb_parser *p, const char *name) +{ + return raft_parse_uint64__(p, name, false); +} + +static int +raft_parse_boolean__(struct ovsdb_parser *p, const char *name, bool optional) +{ + enum ovsdb_parser_types types = OP_BOOLEAN | (optional ? 
OP_OPTIONAL : 0); + const struct json *json = ovsdb_parser_member(p, name, types); + return json ? json_boolean(json) : -1; +} + +bool +raft_parse_required_boolean(struct ovsdb_parser *p, const char *name) +{ + return raft_parse_boolean__(p, name, false); +} + +/* Returns true or false if present, -1 if absent. */ +int +raft_parse_optional_boolean(struct ovsdb_parser *p, const char *name) +{ + return raft_parse_boolean__(p, name, true); +} + +static const char * +raft_parse_string__(struct ovsdb_parser *p, const char *name, bool optional) +{ + enum ovsdb_parser_types types = OP_STRING | (optional ? OP_OPTIONAL : 0); + const struct json *json = ovsdb_parser_member(p, name, types); + return json ? json_string(json) : NULL; +} + +const char * +raft_parse_required_string(struct ovsdb_parser *p, const char *name) +{ + return raft_parse_string__(p, name, false); +} + +const char * +raft_parse_optional_string(struct ovsdb_parser *p, const char *name) +{ + return raft_parse_string__(p, name, true); +} + +bool +raft_parse_uuid__(struct ovsdb_parser *p, const char *name, bool optional, + struct uuid *uuid) +{ + const char *s = raft_parse_string__(p, name, optional); + if (s) { + if (uuid_from_string(uuid, s)) { + return true; + } + ovsdb_parser_raise_error(p, "%s is not a valid UUID", name); + } + *uuid = UUID_ZERO; + return false; +} + +struct uuid +raft_parse_required_uuid(struct ovsdb_parser *p, const char *name) +{ + struct uuid uuid; + raft_parse_uuid__(p, name, false, &uuid); + return uuid; +} + +bool +raft_parse_optional_uuid(struct ovsdb_parser *p, const char *name, + struct uuid *uuid) +{ + return raft_parse_uuid__(p, name, true, uuid); +} + diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h new file mode 100644 index 000000000000..6e147fadb0ac --- /dev/null +++ b/ovsdb/raft-private.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2014, 2016, 2017 Nicira, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RAFT_PRIVATE_H +#define RAFT_PRIVATE_H 1 + +/* Data structures for use internally within the Raft implementation. */ + +#include "raft.h" +#include +#include "openvswitch/hmap.h" +#include "openvswitch/uuid.h" +#include "sset.h" + +struct ds; +struct ovsdb_parser; + +/* Formatting server IDs and cluster IDs for use in human-readable logs. Do + * not use these in cases where the whole server or cluster ID is needed; use + * UUID_FMT and UUID_ARGS in that case.*/ + +#define SID_FMT "%04x" +#define SID_ARGS(SID) uuid_prefix(SID, 4) +#define SID_LEN 4 + +#define CID_FMT "%04x" +#define CID_ARGS(CID) uuid_prefix(CID, 4) +#define CID_LEN 4 + +struct ovsdb_error *raft_address_validate(const char *address) + OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *raft_address_validate_json(const struct json *address) + OVS_WARN_UNUSED_RESULT; + +struct ovsdb_error *raft_addresses_from_json(const struct json *, + struct sset *addresses) + OVS_WARN_UNUSED_RESULT; +struct json *raft_addresses_to_json(const struct sset *addresses); + +char *raft_address_to_nickname(const char *address, const struct uuid *sid); + +enum raft_server_phase { + RAFT_PHASE_STABLE, /* Not being changed. */ + + /* Phases for servers being added. */ + RAFT_PHASE_CATCHUP, /* Populating new server's log. */ + RAFT_PHASE_CAUGHT_UP, /* Waiting for prev configuration to commit. 
*/ + RAFT_PHASE_COMMITTING, /* Waiting for new configuration to commit. */ + + /* Phases for servers to be removed. */ + RAFT_PHASE_REMOVE, /* To be removed. */ +}; + +const char *raft_server_phase_to_string(enum raft_server_phase); + +struct raft_server { + struct hmap_node hmap_node; /* Hashed based on 'sid'. */ + + struct uuid sid; /* Server ID. */ + char *address; /* "(tcp|ssl):1.2.3.4:5678" */ + char *nickname; /* 1ab3(s3) */ + + /* Volatile state on candidates. Reinitialized at start of election. */ + struct uuid vote; /* Server ID of vote, or all-zeros. */ + + /* Volatile state on leaders. Reinitialized after election. */ + uint64_t next_index; /* Index of next log entry to send this server. */ + uint64_t match_index; /* Index of max log entry server known to have. */ + enum raft_server_phase phase; + /* For use in adding and removing servers: */ + struct uuid requester_sid; /* Nonzero if requested via RPC. */ + struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */ +}; + +void raft_server_destroy(struct raft_server *); +void raft_servers_destroy(struct hmap *servers); +struct raft_server *raft_server_add(struct hmap *servers, + const struct uuid *sid, + const char *address); +struct raft_server *raft_server_find(const struct hmap *servers, + const struct uuid *sid); +const char *raft_servers_get_nickname__(const struct hmap *servers, + const struct uuid *sid); +const char *raft_servers_get_nickname(const struct hmap *servers, + const struct uuid *sid, + char buf[SID_LEN + 1], size_t bufsize); +struct ovsdb_error *raft_servers_from_json(const struct json *, + struct hmap *servers) + OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *raft_servers_validate_json(const struct json *) + OVS_WARN_UNUSED_RESULT; +struct json *raft_servers_to_json(const struct hmap *servers); +void raft_servers_format(const struct hmap *servers, struct ds *ds); + +/* A raft_entry is an in-memory data structure that represents a Raft log + * entry. 
*/ +struct raft_entry { + uint64_t term; + struct json *data; + struct uuid eid; + struct json *servers; +}; + +void raft_entry_clone(struct raft_entry *, const struct raft_entry *); +void raft_entry_uninit(struct raft_entry *); +struct json *raft_entry_to_json(const struct raft_entry *); +struct ovsdb_error *raft_entry_from_json(struct json *, struct raft_entry *) + OVS_WARN_UNUSED_RESULT; +bool raft_entry_equals(const struct raft_entry *, const struct raft_entry *); + +/* On disk data serialization and deserialization. */ + +/* First record in a Raft log. */ +struct raft_header { + /* All servers. */ + struct uuid sid; /* Server ID. */ + struct uuid cid; /* Cluster ID. May be zero if 'joining'. */ + char *name; /* Database name. */ + char *local_address; /* Address for Raft server to listen. */ + bool joining; /* True iff cluster not joined yet. */ + + /* Only for servers that haven't joined the cluster yet. */ + struct sset remote_addresses; /* Address of other Raft servers. */ + + /* Only for servers that have joined the cluster. */ + uint64_t snap_index; /* Snapshot's index. */ + struct raft_entry snap; /* Snapshot. */ +}; + +void raft_header_uninit(struct raft_header *); +struct ovsdb_error *raft_header_from_json(struct raft_header *, + const struct json *) + OVS_WARN_UNUSED_RESULT; +struct json *raft_header_to_json(const struct raft_header *); + +enum raft_record_type { + /* Record types that match those in the Raft specification. */ + RAFT_REC_ENTRY, /* A log entry. */ + RAFT_REC_TERM, /* A new term. */ + RAFT_REC_VOTE, /* A vote. */ + + /* Extensions. */ + RAFT_REC_NOTE, /* A note about some significant event. */ + RAFT_REC_COMMIT_INDEX, /* An update to the local commit_index. */ + RAFT_REC_LEADER, /* A server has become leader for this term. */ +}; + +/* Type used for the second and subsequent records in a Raft log. 
*/ +struct raft_record { + enum raft_record_type type; + char *comment; + + /* Valid in RAFT_REC_ENTRY, RAFT_REC_TERM, RAFT_REC_LEADER, and + * RAFT_REC_VOTE, and otherwise 0. */ + uint64_t term; + + union { + char *note; /* RAFT_REC_NOTE. */ + + uint64_t commit_index; /* RAFT_REC_COMMIT_INDEX. */ + + struct uuid sid; /* RAFT_REC_VOTE, RAFT_REC_LEADER. */ + + struct { /* RAFT_REC_ENTRY. */ + uint64_t index; + struct json *data; + struct json *servers; + struct uuid eid; + } entry; + }; +}; + +void raft_record_uninit(struct raft_record *); +struct ovsdb_error *raft_record_from_json(struct raft_record *, + const struct json *) + OVS_WARN_UNUSED_RESULT; +struct json *raft_record_to_json(const struct raft_record *); + +void raft_put_uint64(struct json *object, const char *name, uint64_t integer); +uint64_t raft_parse_optional_uint64(struct ovsdb_parser *, const char *name); +uint64_t raft_parse_required_uint64(struct ovsdb_parser *, const char *name); + +bool raft_parse_required_boolean(struct ovsdb_parser *, const char *name); +int raft_parse_optional_boolean(struct ovsdb_parser *, const char *name); +const char *raft_parse_required_string(struct ovsdb_parser *, + const char *name); +const char *raft_parse_optional_string(struct ovsdb_parser *, + const char *name); +bool raft_parse_uuid__(struct ovsdb_parser *, const char *name, bool optional, + struct uuid *); +struct uuid raft_parse_required_uuid(struct ovsdb_parser *, const char *name); +bool raft_parse_optional_uuid(struct ovsdb_parser *, const char *name, + struct uuid *); + +#endif /* raft-private.h */ diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c new file mode 100644 index 000000000000..617d4aa4eaad --- /dev/null +++ b/ovsdb/raft-rpc.c @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2014, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "raft-rpc.h" +#include +#include +#include "compiler.h" +#include "jsonrpc.h" +#include "ovsdb-error.h" +#include "ovsdb-parser.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/json.h" +#include "openvswitch/vlog.h" +#include "sset.h" + +VLOG_DEFINE_THIS_MODULE(raft_rpc); + +#define RAFT_RPC(ENUM, NAME) \ + static void raft_##NAME##_uninit(struct raft_##NAME *); \ + static void raft_##NAME##_clone(struct raft_##NAME *, \ + const struct raft_##NAME *); \ + static void raft_##NAME##_to_jsonrpc(const struct raft_##NAME *, \ + struct json *); \ + static void raft_##NAME##_from_jsonrpc(struct ovsdb_parser *, \ + struct raft_##NAME *); \ + static void raft_format_##NAME(const struct raft_##NAME *, struct ds *); +RAFT_RPC_TYPES +#undef RAFT_RPC + +/* raft_rpc_type. */ +const char * +raft_rpc_type_to_string(enum raft_rpc_type status) +{ + switch (status) { +#define RAFT_RPC(ENUM, NAME) case ENUM: return #NAME; + RAFT_RPC_TYPES +#undef RAFT_RPC + } + return ""; +} + +bool +raft_rpc_type_from_string(const char *s, enum raft_rpc_type *status) +{ +#define RAFT_RPC(ENUM, NAME) \ + if (!strcmp(s, #NAME)) { \ + *status = ENUM; \ + return true; \ + } + RAFT_RPC_TYPES +#undef RAFT_RPC + return false; +} + +/* raft_hello_request. 
*/ + +static void +raft_hello_request_uninit(struct raft_hello_request *rq) +{ + free(rq->address); +} + +static void +raft_hello_request_clone(struct raft_hello_request *dst, + const struct raft_hello_request *src) +{ + dst->address = nullable_xstrdup(src->address); +} + +static void +raft_hello_request_to_jsonrpc(const struct raft_hello_request *rq, + struct json *args) +{ + json_object_put_string(args, "address", rq->address); +} + +static void +raft_hello_request_from_jsonrpc(struct ovsdb_parser *p, + struct raft_hello_request *rq) +{ + rq->address = nullable_xstrdup(raft_parse_required_string(p, "address")); +} + +static void +raft_format_hello_request(const struct raft_hello_request *rq, + struct ds *s) +{ + ds_put_format(s, " address=\"%s\"", rq->address); +} + +/* raft_append_request. */ + +static void +raft_append_request_uninit(struct raft_append_request *rq) +{ + for (size_t i = 0; i < rq->n_entries; i++) { + json_destroy(rq->entries[i].data); + } + free(rq->entries); +} + +static void +raft_append_request_clone(struct raft_append_request *dst, + const struct raft_append_request *src) +{ + dst->entries = xmalloc(src->n_entries * sizeof *dst->entries); + for (size_t i = 0; i < src->n_entries; i++) { + raft_entry_clone(&dst->entries[i], &src->entries[i]); + } +} + +static void +raft_append_request_to_jsonrpc(const struct raft_append_request *rq, + struct json *args) +{ + raft_put_uint64(args, "term", rq->term); + raft_put_uint64(args, "prev_log_index", rq->prev_log_index); + raft_put_uint64(args, "prev_log_term", rq->prev_log_term); + raft_put_uint64(args, "leader_commit", rq->leader_commit); + + struct json **entries = xmalloc(rq->n_entries * sizeof *entries); + for (size_t i = 0; i < rq->n_entries; i++) { + entries[i] = raft_entry_to_json(&rq->entries[i]); + } + json_object_put(args, "log", json_array_create(entries, rq->n_entries)); +} + +static void +raft_append_request_from_jsonrpc(struct ovsdb_parser *p, + struct raft_append_request *rq) +{ + 
rq->term = raft_parse_required_uint64(p, "term"); + rq->prev_log_index = raft_parse_required_uint64(p, "prev_log_index"); + rq->prev_log_term = raft_parse_required_uint64(p, "prev_log_term"); + rq->leader_commit = raft_parse_required_uint64(p, "leader_commit"); + + const struct json *log = ovsdb_parser_member(p, "log", OP_ARRAY); + if (!log) { + return; + } + const struct json_array *entries = json_array(log); + rq->entries = xmalloc(entries->n * sizeof *rq->entries); + rq->n_entries = 0; + for (size_t i = 0; i < entries->n; i++) { + struct ovsdb_error *error = raft_entry_from_json(entries->elems[i], + &rq->entries[i]); + if (error) { + ovsdb_parser_put_error(p, error); + break; + } + rq->n_entries++; + } +} + +static void +raft_format_append_request(const struct raft_append_request *rq, + struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rq->term); + ds_put_format(s, " prev_log_index=%"PRIu64, rq->prev_log_index); + ds_put_format(s, " prev_log_term=%"PRIu64, rq->prev_log_term); + ds_put_format(s, " leader_commit=%"PRIu64, rq->leader_commit); + ds_put_format(s, " n_entries=%u", rq->n_entries); +} + +/* raft_append_reply. 
*/ + +const char * +raft_append_result_to_string(enum raft_append_result result) +{ + switch (result) { + case RAFT_APPEND_OK: + return "OK"; + case RAFT_APPEND_INCONSISTENCY: + return "inconsistency"; + case RAFT_APPEND_IO_ERROR: + return "I/O error"; + default: + return NULL; + } +} + +bool +raft_append_result_from_string(const char *s, enum raft_append_result *resultp) +{ + for (enum raft_append_result result = 0; ; result++) { + const char *s2 = raft_append_result_to_string(result); + if (!s2) { + *resultp = 0; + return false; + } else if (!strcmp(s, s2)) { + *resultp = result; + return true; + } + } +} + +static void +raft_append_reply_uninit(struct raft_append_reply *rpy OVS_UNUSED) +{ +} + +static void +raft_append_reply_clone(struct raft_append_reply *dst OVS_UNUSED, + const struct raft_append_reply *src OVS_UNUSED) +{ +} + +static void +raft_append_reply_to_jsonrpc(const struct raft_append_reply *rpy, + struct json *args) +{ + raft_put_uint64(args, "term", rpy->term); + raft_put_uint64(args, "log_end", rpy->log_end); + raft_put_uint64(args, "prev_log_index", rpy->prev_log_index); + raft_put_uint64(args, "prev_log_term", rpy->prev_log_term); + raft_put_uint64(args, "n_entries", rpy->n_entries); + json_object_put_string(args, "result", + raft_append_result_to_string(rpy->result)); +} + +static void +raft_append_reply_from_jsonrpc(struct ovsdb_parser *p, + struct raft_append_reply *rpy) +{ + rpy->term = raft_parse_required_uint64(p, "term"); + rpy->log_end = raft_parse_required_uint64(p, "log_end"); + rpy->prev_log_index = raft_parse_required_uint64(p, "prev_log_index"); + rpy->prev_log_term = raft_parse_required_uint64(p, "prev_log_term"); + rpy->n_entries = raft_parse_required_uint64(p, "n_entries"); + + const char *result = raft_parse_required_string(p, "result"); + if (result && !raft_append_result_from_string(result, &rpy->result)) { + ovsdb_parser_raise_error(p, "unknown result \"%s\"", result); + } +} + +static void +raft_format_append_reply(const 
struct raft_append_reply *rpy, struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rpy->term); + ds_put_format(s, " log_end=%"PRIu64, rpy->log_end); + ds_put_format(s, " result=\"%s\"", + raft_append_result_to_string(rpy->result)); +} + +/* raft_vote_request. */ + +static void +raft_vote_request_uninit(struct raft_vote_request *rq OVS_UNUSED) +{ +} + +static void +raft_vote_request_clone(struct raft_vote_request *dst OVS_UNUSED, + const struct raft_vote_request *src OVS_UNUSED) +{ +} + +static void +raft_vote_request_to_jsonrpc(const struct raft_vote_request *rq, + struct json *args) +{ + raft_put_uint64(args, "term", rq->term); + raft_put_uint64(args, "last_log_index", rq->last_log_index); + raft_put_uint64(args, "last_log_term", rq->last_log_term); + if (rq->leadership_transfer) { + json_object_put(args, "leadership_transfer", + json_boolean_create(true)); + } +} + +static void +raft_vote_request_from_jsonrpc(struct ovsdb_parser *p, + struct raft_vote_request *rq) +{ + rq->term = raft_parse_required_uint64(p, "term"); + rq->last_log_index = raft_parse_required_uint64(p, "last_log_index"); + rq->last_log_term = raft_parse_required_uint64(p, "last_log_term"); + rq->leadership_transfer + = raft_parse_optional_boolean(p, "leadership_transfer") == 1; +} + +static void +raft_format_vote_request(const struct raft_vote_request *rq, struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rq->term); + ds_put_format(s, " last_log_index=%"PRIu64, rq->last_log_index); + ds_put_format(s, " last_log_term=%"PRIu64, rq->last_log_term); +} + +/* raft_vote_reply. 
*/ + +static void +raft_vote_reply_uninit(struct raft_vote_reply *rpy OVS_UNUSED) +{ +} + +static void +raft_vote_reply_clone(struct raft_vote_reply *dst OVS_UNUSED, + const struct raft_vote_reply *src OVS_UNUSED) +{ +} + +static void +raft_vote_reply_to_jsonrpc(const struct raft_vote_reply *rpy, + struct json *args) +{ + raft_put_uint64(args, "term", rpy->term); + json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote)); +} + +static void +raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p, + struct raft_vote_reply *rpy) +{ + rpy->term = raft_parse_required_uint64(p, "term"); + rpy->vote = raft_parse_required_uuid(p, "vote"); +} + +static void +raft_format_vote_reply(const struct raft_vote_reply *rpy, struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rpy->term); + ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote)); +} + +/* raft_add_server_request */ + +static void +raft_add_server_request_uninit(struct raft_add_server_request *rq) +{ + free(rq->address); +} + +static void +raft_add_server_request_clone(struct raft_add_server_request *dst, + const struct raft_add_server_request *src) +{ + dst->address = nullable_xstrdup(src->address); +} + +static void +raft_add_server_request_to_jsonrpc(const struct raft_add_server_request *rq, + struct json *args) +{ + json_object_put_string(args, "address", rq->address); +} + +static void +raft_add_server_request_from_jsonrpc(struct ovsdb_parser *p, + struct raft_add_server_request *rq) +{ + rq->address = nullable_xstrdup(raft_parse_required_string(p, "address")); +} + +static void +raft_format_add_server_request(const struct raft_add_server_request *rq, + struct ds *s) +{ + ds_put_format(s, " address=\"%s\"", rq->address); +} + +/* raft_add_server_reply. 
*/ + +static void +raft_add_server_reply_uninit(struct raft_add_server_reply *rpy) +{ + sset_destroy(&rpy->remote_addresses); +} + +static void +raft_add_server_reply_clone(struct raft_add_server_reply *dst, + const struct raft_add_server_reply *src) +{ + sset_clone(&dst->remote_addresses, &src->remote_addresses); +} + +static void +raft_add_server_reply_to_jsonrpc(const struct raft_add_server_reply *rpy, + struct json *args) +{ + json_object_put(args, "success", json_boolean_create(rpy->success)); + if (!sset_is_empty(&rpy->remote_addresses)) { + json_object_put(args, "remote_addresses", + raft_addresses_to_json(&rpy->remote_addresses)); + } +} + +static void +raft_add_server_reply_from_jsonrpc(struct ovsdb_parser *p, + struct raft_add_server_reply *rpy) +{ + rpy->success = raft_parse_required_boolean(p, "success"); + + const struct json *json = ovsdb_parser_member(p, "remote_addresses", + OP_ARRAY | OP_OPTIONAL); + if (json) { + ovsdb_parser_put_error(p, raft_addresses_from_json( + json, &rpy->remote_addresses)); + } else { + sset_init(&rpy->remote_addresses); + } +} + +static void +raft_format_add_server_reply(const struct raft_add_server_reply *rpy, + struct ds *s) +{ + ds_put_format(s, " success=%s", rpy->success ? "true" : "false"); + if (!sset_is_empty(&rpy->remote_addresses)) { + ds_put_cstr(s, " remote_addresses=["); + + const char *address; + int i = 0; + SSET_FOR_EACH (address, &rpy->remote_addresses) { + if (i++ > 0) { + ds_put_cstr(s, ", "); + } + ds_put_cstr(s, address); + } + ds_put_char(s, ']'); + } +} + +/* raft_remove_server_reply. 
*/ + +static void +raft_remove_server_reply_uninit( + struct raft_remove_server_reply *rpy OVS_UNUSED) +{ +} + +static void +raft_remove_server_reply_clone( + struct raft_remove_server_reply *dst OVS_UNUSED, + const struct raft_remove_server_reply *src OVS_UNUSED) +{ +} + +static void +raft_remove_server_reply_to_jsonrpc(const struct raft_remove_server_reply *rpy, + struct json *args) +{ + json_object_put(args, "success", json_boolean_create(rpy->success)); +} + +static void +raft_remove_server_reply_from_jsonrpc(struct ovsdb_parser *p, + struct raft_remove_server_reply *rpy) +{ + rpy->success = raft_parse_required_boolean(p, "success"); +} + +static void +raft_format_remove_server_reply(const struct raft_remove_server_reply *rpy, + struct ds *s) +{ + ds_put_format(s, " success=%s", rpy->success ? "true" : "false"); +} + +/* raft_install_snapshot_request. */ + +static void +raft_install_snapshot_request_uninit( + struct raft_install_snapshot_request *rq) +{ + json_destroy(rq->last_servers); + json_destroy(rq->data); +} + +static void +raft_install_snapshot_request_clone( + struct raft_install_snapshot_request *dst, + const struct raft_install_snapshot_request *src) +{ + dst->last_servers = json_nullable_clone(src->last_servers); + dst->data = json_nullable_clone(src->data); +} + +static void +raft_install_snapshot_request_to_jsonrpc( + const struct raft_install_snapshot_request *rq, struct json *args) +{ + raft_put_uint64(args, "term", rq->term); + raft_put_uint64(args, "last_index", rq->last_index); + raft_put_uint64(args, "last_term", rq->last_term); + json_object_put(args, "last_servers", json_clone(rq->last_servers)); + json_object_put_format(args, "last_eid", + UUID_FMT, UUID_ARGS(&rq->last_eid)); + + json_object_put(args, "data", json_clone(rq->data)); +} + +static void +raft_install_snapshot_request_from_jsonrpc( + struct ovsdb_parser *p, struct raft_install_snapshot_request *rq) +{ + rq->last_servers = json_nullable_clone( + ovsdb_parser_member(p, 
"last_servers", OP_OBJECT)); + ovsdb_parser_put_error(p, raft_servers_validate_json(rq->last_servers)); + + rq->term = raft_parse_required_uint64(p, "term"); + rq->last_index = raft_parse_required_uint64(p, "last_index"); + rq->last_term = raft_parse_required_uint64(p, "last_term"); + rq->last_eid = raft_parse_required_uuid(p, "last_eid"); + + rq->data = json_nullable_clone( + ovsdb_parser_member(p, "data", OP_OBJECT | OP_ARRAY)); +} + +static void +raft_format_install_snapshot_request( + const struct raft_install_snapshot_request *rq, struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rq->term); + ds_put_format(s, " last_index=%"PRIu64, rq->last_index); + ds_put_format(s, " last_term=%"PRIu64, rq->last_term); + ds_put_format(s, " last_eid="UUID_FMT, UUID_ARGS(&rq->last_eid)); + ds_put_cstr(s, " last_servers="); + + struct hmap servers; + struct ovsdb_error *error = + raft_servers_from_json(rq->last_servers, &servers); + if (!error) { + raft_servers_format(&servers, s); + raft_servers_destroy(&servers); + } else { + ds_put_cstr(s, "***error***"); + ovsdb_error_destroy(error); + } +} + +/* raft_install_snapshot_reply. 
*/ + +static void +raft_install_snapshot_reply_uninit( + struct raft_install_snapshot_reply *rpy OVS_UNUSED) +{ +} + +static void +raft_install_snapshot_reply_clone( + struct raft_install_snapshot_reply *dst OVS_UNUSED, + const struct raft_install_snapshot_reply *src OVS_UNUSED) +{ +} + +static void +raft_install_snapshot_reply_to_jsonrpc( + const struct raft_install_snapshot_reply *rpy, struct json *args) +{ + raft_put_uint64(args, "term", rpy->term); + raft_put_uint64(args, "last_index", rpy->last_index); + raft_put_uint64(args, "last_term", rpy->last_term); +} + +static void +raft_install_snapshot_reply_from_jsonrpc( + struct ovsdb_parser *p, + struct raft_install_snapshot_reply *rpy) +{ + rpy->term = raft_parse_required_uint64(p, "term"); + rpy->last_index = raft_parse_required_uint64(p, "last_index"); + rpy->last_term = raft_parse_required_uint64(p, "last_term"); +} + +static void +raft_format_install_snapshot_reply( + const struct raft_install_snapshot_reply *rpy, struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rpy->term); +} + +/* raft_remove_server_request. */ + +static void +raft_remove_server_request_uninit( + struct raft_remove_server_request *rq OVS_UNUSED) +{ +} + +static void +raft_remove_server_request_clone( + struct raft_remove_server_request *dst OVS_UNUSED, + const struct raft_remove_server_request *src OVS_UNUSED) +{ +} + +static void +raft_remove_server_request_to_jsonrpc( + const struct raft_remove_server_request *rq, struct json *args) +{ + json_object_put_format(args, "server_id", UUID_FMT, UUID_ARGS(&rq->sid)); +} + +static void +raft_remove_server_request_from_jsonrpc(struct ovsdb_parser *p, + struct raft_remove_server_request *rq) +{ + rq->sid = raft_parse_required_uuid(p, "server_id"); +} + +static void +raft_format_remove_server_request(const struct raft_remove_server_request *rq, + struct ds *s) +{ + ds_put_format(s, " server="SID_FMT, SID_ARGS(&rq->sid)); +} + +/* raft_become_leader. 
*/ + +static void +raft_become_leader_uninit(struct raft_become_leader *rpc OVS_UNUSED) +{ +} + +static void +raft_become_leader_clone(struct raft_become_leader *dst OVS_UNUSED, + const struct raft_become_leader *src OVS_UNUSED) +{ +} + +static void +raft_become_leader_to_jsonrpc(const struct raft_become_leader *rpc, + struct json *args) +{ + raft_put_uint64(args, "term", rpc->term); +} + +static void +raft_become_leader_from_jsonrpc(struct ovsdb_parser *p, + struct raft_become_leader *rpc) +{ + rpc->term = raft_parse_required_uint64(p, "term"); +} + +static void +raft_format_become_leader(const struct raft_become_leader *rq, struct ds *s) +{ + ds_put_format(s, " term=%"PRIu64, rq->term); +} + +/* raft_execute_command_request. */ + +static void +raft_execute_command_request_uninit( + struct raft_execute_command_request *rq) +{ + json_destroy(rq->data); +} + +static void +raft_execute_command_request_clone( + struct raft_execute_command_request *dst, + const struct raft_execute_command_request *src) +{ + dst->data = json_nullable_clone(src->data); +} + +static void +raft_execute_command_request_to_jsonrpc( + const struct raft_execute_command_request *rq, struct json *args) +{ + json_object_put(args, "data", json_clone(rq->data)); + json_object_put_format(args, "prereq", UUID_FMT, UUID_ARGS(&rq->prereq)); + json_object_put_format(args, "result", UUID_FMT, UUID_ARGS(&rq->result)); +} + +static void +raft_execute_command_request_from_jsonrpc( + struct ovsdb_parser *p, struct raft_execute_command_request *rq) +{ + rq->data = json_nullable_clone(ovsdb_parser_member(p, "data", + OP_OBJECT | OP_ARRAY)); + rq->prereq = raft_parse_required_uuid(p, "prereq"); + rq->result = raft_parse_required_uuid(p, "result"); +} + +static void +raft_format_execute_command_request( + const struct raft_execute_command_request *rq, struct ds *s) +{ + ds_put_format(s, " prereq="UUID_FMT, UUID_ARGS(&rq->prereq)); + ds_put_format(s, " result="UUID_FMT, UUID_ARGS(&rq->result)); + ds_put_format(s, 
" data="); + json_to_ds(rq->data, JSSF_SORT, s); +} + +/* raft_execute_command_reply. */ + +static void +raft_execute_command_reply_uninit( + struct raft_execute_command_reply *rpy OVS_UNUSED) +{ +} + +static void +raft_execute_command_reply_clone( + struct raft_execute_command_reply *dst OVS_UNUSED, + const struct raft_execute_command_reply *src OVS_UNUSED) +{ +} + +static void +raft_execute_command_reply_to_jsonrpc( + const struct raft_execute_command_reply *rpy, struct json *args) +{ + json_object_put_format(args, "result", UUID_FMT, UUID_ARGS(&rpy->result)); + json_object_put_string(args, "status", + raft_command_status_to_string(rpy->status)); + if (rpy->commit_index) { + raft_put_uint64(args, "commit_index", rpy->commit_index); + } +} + +static void +raft_execute_command_reply_from_jsonrpc( + struct ovsdb_parser *p, struct raft_execute_command_reply *rpy) +{ + rpy->result = raft_parse_required_uuid(p, "result"); + + const char *status = raft_parse_required_string(p, "status"); + if (status && !raft_command_status_from_string(status, &rpy->status)) { + ovsdb_parser_raise_error(p, "unknown status \"%s\"", status); + } + + rpy->commit_index = raft_parse_optional_uint64(p, "commit_index"); +} + +static void +raft_format_execute_command_reply( + const struct raft_execute_command_reply *rpy, struct ds *s) +{ + ds_put_format(s, " result="UUID_FMT, UUID_ARGS(&rpy->result)); + ds_put_format(s, " status=\"%s\"", + raft_command_status_to_string(rpy->status)); + if (rpy->commit_index) { + ds_put_format(s, " commit_index=%"PRIu64, rpy->commit_index); + } +} + +void +raft_rpc_uninit(union raft_rpc *rpc) +{ + if (rpc) { + free(rpc->common.comment); + + switch (rpc->type) { +#define RAFT_RPC(ENUM, NAME) \ + case ENUM: \ + raft_##NAME##_uninit(&rpc->NAME); \ + break; + RAFT_RPC_TYPES +#undef RAFT_RPC + } + } +} + +union raft_rpc * +raft_rpc_clone(const union raft_rpc *src) +{ + union raft_rpc *dst = xmemdup(src, sizeof *src); + dst->common.comment = 
nullable_xstrdup(src->common.comment); + + switch (src->type) { +#define RAFT_RPC(ENUM, NAME) \ + case ENUM: \ + raft_##NAME##_clone(&dst->NAME, &src->NAME); \ + break; + RAFT_RPC_TYPES +#undef RAFT_RPC + } + + return dst; +} + +struct jsonrpc_msg * +raft_rpc_to_jsonrpc(const struct uuid *cid, + const struct uuid *sid, + const union raft_rpc *rpc) +{ + struct json *args = json_object_create(); + if (!uuid_is_zero(cid)) { + json_object_put_format(args, "cluster", UUID_FMT, UUID_ARGS(cid)); + } + if (!uuid_is_zero(&rpc->common.sid)) { + json_object_put_format(args, "to", UUID_FMT, + UUID_ARGS(&rpc->common.sid)); + } + json_object_put_format(args, "from", UUID_FMT, UUID_ARGS(sid)); + if (rpc->common.comment) { + json_object_put_string(args, "comment", rpc->common.comment); + } + + switch (rpc->type) { +#define RAFT_RPC(ENUM, NAME) \ + case ENUM: \ + raft_##NAME##_to_jsonrpc(&rpc->NAME, args); \ + break; + RAFT_RPC_TYPES +#undef RAFT_RPC + default: + OVS_NOT_REACHED(); + } + + return jsonrpc_create_notify(raft_rpc_type_to_string(rpc->type), + json_array_create_1(args)); +} + +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_rpc_from_jsonrpc(struct uuid *cidp, + const struct uuid *sid, + const struct jsonrpc_msg *msg, + union raft_rpc *rpc) +{ + memset(rpc, 0, sizeof *rpc); + if (msg->type != JSONRPC_NOTIFY) { + return ovsdb_error(NULL, "expecting notify RPC but received %s", + jsonrpc_msg_type_to_string(msg->type)); + } + + if (!raft_rpc_type_from_string(msg->method, &rpc->type)) { + return ovsdb_error(NULL, "unknown method %s", msg->method); + } + + if (json_array(msg->params)->n != 1) { + return ovsdb_error(NULL, + "%s RPC has %"PRIuSIZE" parameters (expected 1)", + msg->method, json_array(msg->params)->n); + } + + struct ovsdb_parser p; + ovsdb_parser_init(&p, json_array(msg->params)->elems[0], + "raft %s RPC", msg->method); + + bool is_hello = rpc->type == RAFT_RPC_HELLO_REQUEST; + bool is_add = rpc->type == RAFT_RPC_ADD_SERVER_REQUEST; + + struct uuid cid; + if 
(raft_parse_uuid__(&p, "cluster", is_add, &cid) + && !uuid_equals(&cid, cidp)) { + if (uuid_is_zero(cidp)) { + *cidp = cid; + VLOG_INFO("learned cluster ID "CID_FMT, CID_ARGS(&cid)); + } else { + ovsdb_parser_raise_error(&p, "wrong cluster "CID_FMT" " + "(expected "CID_FMT")", + CID_ARGS(&cid), CID_ARGS(cidp)); + } + } + + struct uuid to_sid; + if (raft_parse_uuid__(&p, "to", is_add || is_hello, &to_sid) + && !uuid_equals(&to_sid, sid)) { + ovsdb_parser_raise_error(&p, "misrouted message (addressed to " + SID_FMT" but we're "SID_FMT")", + SID_ARGS(&to_sid), SID_ARGS(sid)); + } + + rpc->common.sid = raft_parse_required_uuid(&p, "from"); + rpc->common.comment = nullable_xstrdup( + raft_parse_optional_string(&p, "comment")); + + switch (rpc->type) { +#define RAFT_RPC(ENUM, NAME) \ + case ENUM: \ + raft_##NAME##_from_jsonrpc(&p, &rpc->NAME); \ + break; + RAFT_RPC_TYPES +#undef RAFT_RPC + + default: + OVS_NOT_REACHED(); + } + + struct ovsdb_error *error = ovsdb_parser_finish(&p); + if (error) { + raft_rpc_uninit(rpc); + } + return error; +} + +void +raft_rpc_format(const union raft_rpc *rpc, struct ds *s) +{ + ds_put_format(s, "%s", raft_rpc_type_to_string(rpc->type)); + if (rpc->common.comment) { + ds_put_format(s, " \"%s\"", rpc->common.comment); + } + ds_put_char(s, ':'); + + switch (rpc->type) { +#define RAFT_RPC(ENUM, NAME) \ + case ENUM: \ + raft_format_##NAME(&rpc->NAME, s); \ + break; + RAFT_RPC_TYPES +#undef RAFT_RPC + default: + OVS_NOT_REACHED(); + } +} + +uint64_t +raft_rpc_get_term(const union raft_rpc *rpc) +{ + switch (rpc->type) { + case RAFT_RPC_HELLO_REQUEST: + case RAFT_RPC_ADD_SERVER_REQUEST: + case RAFT_RPC_ADD_SERVER_REPLY: + case RAFT_RPC_REMOVE_SERVER_REQUEST: + case RAFT_RPC_REMOVE_SERVER_REPLY: + case RAFT_RPC_EXECUTE_COMMAND_REQUEST: + case RAFT_RPC_EXECUTE_COMMAND_REPLY: + return 0; + + case RAFT_RPC_APPEND_REQUEST: + return rpc->append_request.term; + + case RAFT_RPC_APPEND_REPLY: + return rpc->append_reply.term; + + case 
RAFT_RPC_VOTE_REQUEST: + return rpc->vote_request.term; + + case RAFT_RPC_VOTE_REPLY: + return rpc->vote_reply.term; + + case RAFT_RPC_INSTALL_SNAPSHOT_REQUEST: + return rpc->install_snapshot_request.term; + + case RAFT_RPC_INSTALL_SNAPSHOT_REPLY: + return rpc->install_snapshot_reply.term; + + case RAFT_RPC_BECOME_LEADER: + return rpc->become_leader.term; + + default: + OVS_NOT_REACHED(); + } +} + +const struct uuid * +raft_rpc_get_vote(const union raft_rpc *rpc) +{ + switch (rpc->type) { + case RAFT_RPC_HELLO_REQUEST: + case RAFT_RPC_ADD_SERVER_REQUEST: + case RAFT_RPC_ADD_SERVER_REPLY: + case RAFT_RPC_REMOVE_SERVER_REQUEST: + case RAFT_RPC_REMOVE_SERVER_REPLY: + case RAFT_RPC_EXECUTE_COMMAND_REQUEST: + case RAFT_RPC_EXECUTE_COMMAND_REPLY: + case RAFT_RPC_APPEND_REQUEST: + case RAFT_RPC_APPEND_REPLY: + case RAFT_RPC_VOTE_REQUEST: + case RAFT_RPC_INSTALL_SNAPSHOT_REQUEST: + case RAFT_RPC_INSTALL_SNAPSHOT_REPLY: + case RAFT_RPC_BECOME_LEADER: + return NULL; + + case RAFT_RPC_VOTE_REPLY: + return &raft_vote_reply_cast(rpc)->vote; + + default: + OVS_NOT_REACHED(); + } +} + +/* Returns the minimum log index that must be synced to disk if 'rpc' is to be + * sent. (This is generally the biggest log index in the message but some + * messages, e.g. RAFT_RPC_APPEND_REQUEST, don't need their entries synced.) 
*/ +uint64_t +raft_rpc_get_min_sync_index(const union raft_rpc *rpc) +{ + switch (rpc->type) { + case RAFT_RPC_HELLO_REQUEST: + case RAFT_RPC_ADD_SERVER_REQUEST: + case RAFT_RPC_ADD_SERVER_REPLY: + case RAFT_RPC_REMOVE_SERVER_REQUEST: + case RAFT_RPC_REMOVE_SERVER_REPLY: + case RAFT_RPC_EXECUTE_COMMAND_REQUEST: + case RAFT_RPC_EXECUTE_COMMAND_REPLY: + case RAFT_RPC_APPEND_REQUEST: + case RAFT_RPC_BECOME_LEADER: + case RAFT_RPC_VOTE_REPLY: + return 0; + + case RAFT_RPC_APPEND_REPLY: + return raft_append_reply_cast(rpc)->log_end - 1; + + case RAFT_RPC_VOTE_REQUEST: + return raft_vote_request_cast(rpc)->last_log_index; + + case RAFT_RPC_INSTALL_SNAPSHOT_REQUEST: + return raft_install_snapshot_request_cast(rpc)->last_index; + + case RAFT_RPC_INSTALL_SNAPSHOT_REPLY: + /* XXX This will need to change if install_snapshot_reply becomes able + * to report an error */ + return raft_install_snapshot_reply_cast(rpc)->last_index; + + default: + OVS_NOT_REACHED(); + } +} diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h new file mode 100644 index 000000000000..dd95aa1201f9 --- /dev/null +++ b/ovsdb/raft-rpc.h @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2014, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RAFT_RPC_H +#define RAFT_RPC_H 1 + +/* Data structures used internally by Raft implementation for JSON-RPC. 
*/ + +#include +#include +#include "openvswitch/uuid.h" +#include "raft.h" +#include "raft-private.h" +#include "sset.h" + +struct ds; + +#define RAFT_RPC_TYPES \ + /* Hello RPC. */ \ + RAFT_RPC(RAFT_RPC_HELLO_REQUEST, hello_request) \ + \ + /* AppendEntries RPC. */ \ + RAFT_RPC(RAFT_RPC_APPEND_REQUEST, append_request) \ + RAFT_RPC(RAFT_RPC_APPEND_REPLY, append_reply) \ + \ + /* RequestVote RPC. */ \ + RAFT_RPC(RAFT_RPC_VOTE_REQUEST, vote_request) \ + RAFT_RPC(RAFT_RPC_VOTE_REPLY, vote_reply) \ + \ + /* AddServer RPC. */ \ + RAFT_RPC(RAFT_RPC_ADD_SERVER_REQUEST, add_server_request) \ + RAFT_RPC(RAFT_RPC_ADD_SERVER_REPLY, add_server_reply) \ + \ + /* RemoveServer RPC. */ \ + RAFT_RPC(RAFT_RPC_REMOVE_SERVER_REQUEST, remove_server_request) \ + RAFT_RPC(RAFT_RPC_REMOVE_SERVER_REPLY, remove_server_reply) \ + \ + /* InstallSnapshot RPC. */ \ + RAFT_RPC(RAFT_RPC_INSTALL_SNAPSHOT_REQUEST, install_snapshot_request) \ + RAFT_RPC(RAFT_RPC_INSTALL_SNAPSHOT_REPLY, install_snapshot_reply) \ + \ + /* BecomeLeader RPC. */ \ + RAFT_RPC(RAFT_RPC_BECOME_LEADER, become_leader) \ + \ + /* ExecuteCommand RPC. */ \ + RAFT_RPC(RAFT_RPC_EXECUTE_COMMAND_REQUEST, execute_command_request) \ + RAFT_RPC(RAFT_RPC_EXECUTE_COMMAND_REPLY, execute_command_reply) + +enum raft_rpc_type { +#define RAFT_RPC(ENUM, NAME) ENUM, + RAFT_RPC_TYPES +#undef RAFT_RPC +}; + +const char *raft_rpc_type_to_string(enum raft_rpc_type); +bool raft_rpc_type_from_string(const char *, enum raft_rpc_type *); + +struct raft_rpc_common { + enum raft_rpc_type type; + struct uuid sid; /* SID of peer server. */ + char *comment; +}; + +struct raft_hello_request { + struct raft_rpc_common common; + char *address; /* Sender's address. */ +}; + +struct raft_append_request { + struct raft_rpc_common common; + uint64_t term; /* Leader's term. */ + uint64_t prev_log_index; /* Log entry just before new ones. */ + uint64_t prev_log_term; /* Term of prev_log_index entry. */ + uint64_t leader_commit; /* Leader's commit_index. 
*/ + + /* The append request includes 0 or more log entries. entries[0] is for + * log entry 'prev_log_index + 1', and so on. + * + * A heartbeat append_request has no terms. */ + struct raft_entry *entries; + unsigned int n_entries; +}; + +enum raft_append_result { + RAFT_APPEND_OK, /* Success. */ + RAFT_APPEND_INCONSISTENCY, /* Failure due to log inconsistency. */ + RAFT_APPEND_IO_ERROR, /* Failure due to I/O error. */ +}; + +const char *raft_append_result_to_string(enum raft_append_result); +bool raft_append_result_from_string(const char *, enum raft_append_result *); + +struct raft_append_reply { + struct raft_rpc_common common; + + /* Copied from the state machine of the reply's sender. */ + uint64_t term; /* Current term, for leader to update itself. */ + uint64_t log_end; /* To allow capping next_index, see 4.2.1. */ + + /* Copied from request. */ + uint64_t prev_log_index; /* Log entry just before new ones. */ + uint64_t prev_log_term; /* Term of prev_log_index entry. */ + unsigned int n_entries; + + /* Result. */ + enum raft_append_result result; +}; + +struct raft_vote_request { + struct raft_rpc_common common; + uint64_t term; /* Candidate's term. */ + uint64_t last_log_index; /* Index of candidate's last log entry. */ + uint64_t last_log_term; /* Term of candidate's last log entry. */ + bool leadership_transfer; /* True to override minimum election timeout. */ +}; + +struct raft_vote_reply { + struct raft_rpc_common common; + uint64_t term; /* Current term, for candidate to update itself. */ + struct uuid vote; /* Server ID of vote. */ +}; + +struct raft_add_server_request { + struct raft_rpc_common common; + char *address; /* Address of new server. */ +}; + +struct raft_remove_server_request { + struct raft_rpc_common common; + struct uuid sid; /* Server to remove. */ + + /* Nonnull if request was received via unixctl. */ + struct unixctl_conn *requester_conn; +}; + +/* The operation committed and is now complete. 
*/ +#define RAFT_SERVER_COMPLETED "completed" + +/* The operation could not be initiated because this server is not the current + * leader. Only the leader can add or remove servers. */ +#define RAFT_SERVER_NOT_LEADER "not leader" + +/* An operation to add a server succeeded without any change because the server + * was already part of the cluster. */ +#define RAFT_SERVER_ALREADY_PRESENT "already in cluster" + +/* An operation to remove a server succeeded without any change because the + * server was not part of the cluster. */ +#define RAFT_SERVER_ALREADY_GONE "already not in cluster" + +/* The operation could not be initiated because an identical + * operation was already in progress. */ +#define RAFT_SERVER_IN_PROGRESS "in progress" + +/* Adding a server failed because of a timeout. This could mean that the + * server was entirely unreachable, or that it became unreachable partway + * through populating it with an initial copy of the log. In the latter case, + * retrying the operation should resume where it left off. */ +#define RAFT_SERVER_TIMEOUT "timeout" + +/* The operation was initiated but it later failed because this server lost + * cluster leadership. The operation may be retried against the new cluster + * leader. For adding a server, if the log was already partially copied to the + * new server, retrying the operation should resume where it left off. */ +#define RAFT_SERVER_LOST_LEADERSHIP "lost leadership" + +/* Adding a server was canceled by submission of an operation to remove the + * same server, or removing a server was canceled by submission of an operation + * to add the same server. */ +#define RAFT_SERVER_CANCELED "canceled" + +/* Adding or removing a server could not be initiated because the operation to + * remove or add the server, respectively, has been logged but not committed. + * The new operation may be retried once the former operation commits. 
*/ +#define RAFT_SERVER_COMMITTING "committing" + +/* Adding or removing a server was canceled because the leader shut down. */ +#define RAFT_SERVER_SHUTDOWN "shutdown" + +/* Removing a server could not be initiated because, taken together with any + * other scheduled server removals, the cluster would be empty. (This + * calculation ignores scheduled or uncommitted add server operations because + * of the possibility that they could fail.) */ +#define RAFT_SERVER_EMPTY "empty" + +struct raft_add_server_reply { + struct raft_rpc_common common; + bool success; + struct sset remote_addresses; +}; + +struct raft_remove_server_reply { + struct raft_rpc_common common; + bool success; +}; + +struct raft_install_snapshot_request { + struct raft_rpc_common common; + + uint64_t term; /* Leader's term. */ + + uint64_t last_index; /* Covers everything up & including this. */ + uint64_t last_term; /* Term of last_index. */ + struct uuid last_eid; /* Last entry ID. */ + struct json *last_servers; + + /* Data. */ + struct json *data; +}; + +struct raft_install_snapshot_reply { + struct raft_rpc_common common; + + uint64_t term; /* For leader to update itself. */ + + /* Repeated from the install_snapshot request. */ + uint64_t last_index; + uint64_t last_term; +}; + +struct raft_become_leader { + struct raft_rpc_common common; + + uint64_t term; /* Leader's term. 
*/ +}; + +struct raft_execute_command_request { + struct raft_rpc_common common; + + struct json *data; + struct uuid prereq; + struct uuid result; +}; + +struct raft_execute_command_reply { + struct raft_rpc_common common; + + struct uuid result; + enum raft_command_status status; + uint64_t commit_index; +}; + +union raft_rpc { + enum raft_rpc_type type; + struct raft_rpc_common common; +#define RAFT_RPC(ENUM, NAME) struct raft_##NAME NAME; + RAFT_RPC_TYPES +#undef RAFT_RPC +}; + +#define RAFT_RPC(ENUM, NAME) \ + static inline const struct raft_##NAME * \ + raft_##NAME##_cast(const union raft_rpc *rpc) \ + { \ + ovs_assert(rpc->type == ENUM); \ + return &rpc->NAME; \ + } +RAFT_RPC_TYPES +#undef RAFT_RPC + +void raft_rpc_uninit(union raft_rpc *); +union raft_rpc *raft_rpc_clone(const union raft_rpc *); + +struct jsonrpc_msg *raft_rpc_to_jsonrpc(const struct uuid *cid, + const struct uuid *sid, + const union raft_rpc *); +struct ovsdb_error *raft_rpc_from_jsonrpc(struct uuid *cid, + const struct uuid *sid, + const struct jsonrpc_msg *, + union raft_rpc *) + OVS_WARN_UNUSED_RESULT; + +void raft_rpc_format(const union raft_rpc *, struct ds *); + +uint64_t raft_rpc_get_term(const union raft_rpc *); +const struct uuid *raft_rpc_get_vote(const union raft_rpc *); +uint64_t raft_rpc_get_min_sync_index(const union raft_rpc *); + +#endif /* lib/raft-rpc.h */ diff --git a/ovsdb/raft.c b/ovsdb/raft.c new file mode 100644 index 000000000000..9c358bf35687 --- /dev/null +++ b/ovsdb/raft.c @@ -0,0 +1,4321 @@ +/* + * Copyright (c) 2014, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "raft.h" +#include "raft-private.h" + +#include +#include + +#include "hash.h" +#include "jsonrpc.h" +#include "lockfile.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/hmap.h" +#include "openvswitch/json.h" +#include "openvswitch/list.h" +#include "openvswitch/poll-loop.h" +#include "openvswitch/vlog.h" +#include "ovsdb-error.h" +#include "ovsdb-parser.h" +#include "ovsdb/log.h" +#include "raft-rpc.h" +#include "random.h" +#include "socket-util.h" +#include "stream.h" +#include "timeval.h" +#include "unicode.h" +#include "unixctl.h" +#include "util.h" +#include "uuid.h" + +VLOG_DEFINE_THIS_MODULE(raft); + +/* Roles for a Raft server: + * + * - Followers: Servers in touch with the current leader. + * + * - Candidate: Servers unaware of a current leader and seeking election to + * leader. + * + * - Leader: Handles all client requests. At most one at a time. + * + * In normal operation there is exactly one leader and all of the other servers + * are followers. */ +enum raft_role { + RAFT_FOLLOWER, + RAFT_CANDIDATE, + RAFT_LEADER +}; + +struct raft_conn { + struct ovs_list list_node; + struct jsonrpc_session *js; + struct uuid sid; + char *nickname; + bool incoming; /* True if incoming, false if outgoing. */ + + /* Join. */ + unsigned int js_seqno; +}; + +/* A "command", that is, a request to append an entry to the log. + * + * The Raft specification only allows clients to issue commands to the leader. 
+ * With this implementation, clients may issue a command on any server, which + * then relays the command to the leader if necessary. + * + * This structure is thus used in three cases: + * + * 1. We are the leader and the command was issued to us directly. + * + * 2. We are a follower and relayed the command to the leader. + * + * 3. We are the leader and a follower relayed the command to us. + */ +struct raft_command { + /* All cases. */ + struct hmap_node hmap_node; /* In struct raft's 'commands' hmap. */ + unsigned int n_refs; /* Reference count. */ + enum raft_command_status status; /* Execution status. */ + + /* Case 1 only. */ + uint64_t index; /* Index in log (0 if being relayed). */ + + /* Cases 2 and 3. */ + struct uuid eid; /* Entry ID of result. */ + + /* Case 2 only. */ + long long int timestamp; /* Issue or last ping time, for expiration. */ + + /* Case 3 only. */ + struct uuid sid; /* The follower (otherwise UUID_ZERO). */ +}; + +static void raft_command_complete(struct raft *, struct raft_command *, + enum raft_command_status); + +static void raft_complete_all_commands(struct raft *, + enum raft_command_status); + +/* Type of deferred action, see struct raft_waiter. */ +enum raft_waiter_type { + RAFT_W_ENTRY, + RAFT_W_TERM, + RAFT_W_RPC, +}; + +/* An action deferred until a log write commits to disk. */ +struct raft_waiter { + struct ovs_list list_node; + uint64_t commit_ticket; + + enum raft_waiter_type type; + union { + /* RAFT_W_ENTRY. + * + * Waits for a RAFT_REC_ENTRY write to our local log to commit. Upon + * completion, updates log_synced to indicate that the new log entry or + * entries are committed and, if we are leader, also updates our local + * match_index. */ + struct { + uint64_t index; + } entry; + + /* RAFT_W_TERM. + * + * Waits for a RAFT_REC_TERM or RAFT_REC_VOTE record write to commit. + * Upon completion, updates synced_term and synced_vote, which triggers + * sending RPCs deferred by the uncommitted term and vote. 
*/ + struct { + uint64_t term; + struct uuid vote; + } term; + + /* RAFT_W_RPC. + * + * Sometimes, sending an RPC to a peer must be delayed until an entry, + * a term, or a vote mentioned in the RPC is synced to disk. This + * waiter keeps a copy of such an RPC until the previous waiters have + * committed. */ + union raft_rpc *rpc; + }; +}; + +static struct raft_waiter *raft_waiter_create(struct raft *, + enum raft_waiter_type, + bool start_commit); + +/* The Raft state machine. */ +struct raft { + struct hmap_node hmap_node; /* In 'all_rafts'. */ + struct ovsdb_log *log; + +/* Persistent derived state. + * + * This must be updated on stable storage before responding to RPCs. It can be + * derived from the header, snapshot, and log in 'log'. */ + + struct uuid cid; /* Cluster ID (immutable for the cluster). */ + struct uuid sid; /* Server ID (immutable for the server). */ + char *local_address; /* Local address (immutable for the server). */ + char *local_nickname; /* Used for local server in log messages. */ + char *name; /* Cluster name (immutable for the cluster). */ + + /* Contains "struct raft_server"s and represents the server configuration + * most recently added to 'log'. */ + struct hmap servers; + +/* Persistent state on all servers. + * + * Must be updated on stable storage before responding to RPCs. */ + + /* Current term and vote, which might be on the way to disk now. */ + uint64_t term; /* Initialized to 0 and only increases. */ + struct uuid vote; /* In 'term', or all-zeros if none. */ + + /* The term and vote that have been synced to disk. */ + uint64_t synced_term; + struct uuid synced_vote; + + /* The log. + * + * A log entry with index 1 never really exists; the initial snapshot for a + * Raft is considered to include this index. The first real log entry has + * index 2. + * + * A new Raft instance contains an empty log: log_start=2, log_end=2. + * Over time, the log grows: log_start=2, log_end=N. 
+ * At some point, the server takes a snapshot: log_start=N, log_end=N. + * The log continues to grow: log_start=N, log_end=N+1... + * + * Must be updated on stable storage before responding to RPCs. */ + struct raft_entry *entries; /* Log entry i is in log[i - log_start]. */ + uint64_t log_start; /* Index of first entry in log. */ + uint64_t log_end; /* Index of last entry in log, plus 1. */ + uint64_t log_synced; /* Index of last synced entry. */ + size_t allocated_log; /* Allocated entries in 'log'. */ + + /* Snapshot state (see Figure 5.1) + * + * This is the state of the cluster as of the last discarded log entry, + * that is, at log index 'log_start - 1' (called prevIndex in Figure 5.1). + * Only committed log entries can be included in a snapshot. */ + struct raft_entry snap; + +/* Volatile state. + * + * The snapshot is always committed, but the rest of the log might not be yet. + * 'last_applied' tracks what entries have been passed to the client. If the + * client hasn't yet read the latest snapshot, then even the snapshot isn't + * applied yet. Thus, the invariants are different for these members: + * + * log_start - 2 <= last_applied <= commit_index < log_end. + * log_start - 1 <= commit_index < log_end. + */ + + enum raft_role role; /* Current role. */ + uint64_t commit_index; /* Max log index known to be committed. */ + uint64_t last_applied; /* Max log index applied to state machine. */ + struct uuid leader_sid; /* Server ID of leader (zero, if unknown). */ + + /* Followers and candidates only. */ +#define ELECTION_BASE_MSEC 1024 +#define ELECTION_RANGE_MSEC 1024 + long long int election_base; /* Time of last heartbeat from leader. */ + long long int election_timeout; /* Time at which we start an election. */ + + /* Used for joining a cluster. */ + bool joining; /* Attempting to join the cluster? */ + struct sset remote_addresses; /* Addresses to try to find other servers. */ + long long int join_timeout; /* Time to re-send add server request. 
*/ + + /* Used for leaving a cluster. */ + bool leaving; + bool left; + long long int leave_timeout; /* Time to re-send remove server request. */ + + /* Failure. */ + bool failed; + + /* File synchronization. */ + struct ovs_list waiters; /* Contains "struct raft_waiter"s. */ + + /* Network connections. */ + struct pstream *listener; + long long int listen_backoff; + struct ovs_list conns; + + /* Leaders only. Reinitialized after becoming leader. */ + struct hmap add_servers; /* Contains "struct raft_server"s to add. */ + struct raft_server *remove_server; /* Server being removed. */ + struct hmap commands; /* Contains "struct raft_command"s. */ +#define PING_TIME_MSEC (ELECTION_BASE_MSEC / 3) + long long int ping_timeout; /* Time at which to send a heartbeat */ + + /* Candidates only. Reinitialized at start of election. */ + int n_votes; /* Number of votes for me. */ +}; + +/* All Raft structures. */ +static struct hmap all_rafts = HMAP_INITIALIZER(&all_rafts); + +static void raft_init(void); + +static struct ovsdb_error *raft_read_header(struct raft *) + OVS_WARN_UNUSED_RESULT; + +static void raft_send_execute_command_reply(struct raft *, + const struct uuid *sid, + const struct uuid *eid, + enum raft_command_status, + uint64_t commit_index); + +static void raft_update_our_match_index(struct raft *, uint64_t min_index); + +static void raft_send_remove_server_reply__( + struct raft *, const struct uuid *target_sid, + const struct uuid *requester_sid, struct unixctl_conn *requester_conn, + bool success, const char *comment); + +static void raft_server_init_leader(struct raft *, struct raft_server *); + +static bool raft_rpc_is_heartbeat(const union raft_rpc *); +static bool raft_is_rpc_synced(const struct raft *, const union raft_rpc *); + +static void raft_handle_rpc(struct raft *, const union raft_rpc *); +static bool raft_send(struct raft *, const union raft_rpc *); +static bool raft_send__(struct raft *, const union raft_rpc *, + struct raft_conn *); +static 
void raft_send_append_request(struct raft *, + struct raft_server *, unsigned int n, + const char *comment); + +static void raft_become_leader(struct raft *); +static void raft_become_follower(struct raft *); +static void raft_reset_timer(struct raft *); +static void raft_send_heartbeats(struct raft *); +static void raft_start_election(struct raft *, bool leadership_transfer); +static bool raft_truncate(struct raft *, uint64_t new_end); +static void raft_get_servers_from_log(struct raft *, enum vlog_level); + +static bool raft_handle_write_error(struct raft *, struct ovsdb_error *); + +static void raft_run_reconfigure(struct raft *); + +static struct raft_server * +raft_find_server(const struct raft *raft, const struct uuid *sid) +{ + return raft_server_find(&raft->servers, sid); +} + +static char * +raft_make_address_passive(const char *address_) +{ + if (!strncmp(address_, "unix:", 5)) { + return xasprintf("p%s", address_); + } else { + char *address = xstrdup(address_); + char *p = strchr(address, ':') + 1; + char *host = inet_parse_token(&p); + char *port = inet_parse_token(&p); + + struct ds paddr = DS_EMPTY_INITIALIZER; + ds_put_format(&paddr, "p%.3s:%s:", address, port); + if (strchr(host, ':')) { + ds_put_format(&paddr, "[%s]", host); + } else { + ds_put_cstr(&paddr, host); + } + free(address); + return ds_steal_cstr(&paddr); + } +} + +static struct raft * +raft_alloc(void) +{ + raft_init(); + + struct raft *raft = xzalloc(sizeof *raft); + hmap_node_nullify(&raft->hmap_node); + hmap_init(&raft->servers); + raft->log_start = raft->log_end = 1; + raft->role = RAFT_FOLLOWER; + sset_init(&raft->remote_addresses); + raft->join_timeout = LLONG_MAX; + ovs_list_init(&raft->waiters); + raft->listen_backoff = LLONG_MIN; + ovs_list_init(&raft->conns); + hmap_init(&raft->add_servers); + hmap_init(&raft->commands); + + raft->ping_timeout = time_msec() + PING_TIME_MSEC; + raft_reset_timer(raft); + + return raft; +} + +/* Creates an on-disk file that represents a new Raft 
cluster and initializes + * it to consist of a single server, the one on which this function is called. + * + * Creates the local copy of the cluster's log in 'file_name', which must not + * already exist. Gives it the name 'name', which should be the database + * schema name and which is used only to match up this database with server + * added to the cluster later if the cluster ID is unavailable. + * + * The new server is located at 'local_address', which must take one of the + * forms "tcp:IP[:PORT]" or "ssl:IP[:PORT]", where IP is an IPv4 address or a + * square bracket enclosed IPv6 address. PORT, if present, is a port number + * that defaults to RAFT_PORT. + * + * This only creates the on-disk file. Use raft_open() to start operating the + * new server. + * + * Returns null if successful, otherwise an ovsdb_error describing the + * problem. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_create_cluster(const char *file_name, const char *name, + const char *local_address, const struct json *data) +{ + /* Parse and verify validity of the local address. */ + struct ovsdb_error *error = raft_address_validate(local_address); + if (error) { + return error; + } + + /* Create log file. */ + struct ovsdb_log *log; + error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_CREATE_EXCL, + -1, &log); + if (error) { + return error; + } + + /* Write log file. 
*/ + struct raft_header h = { + .sid = uuid_random(), + .cid = uuid_random(), + .name = xstrdup(name), + .local_address = xstrdup(local_address), + .joining = false, + .remote_addresses = SSET_INITIALIZER(&h.remote_addresses), + .snap_index = 1, + .snap = { + .term = 1, + .data = json_nullable_clone(data), + .eid = uuid_random(), + .servers = json_object_create(), + }, + }; + shash_add_nocopy(json_object(h.snap.servers), + xasprintf(UUID_FMT, UUID_ARGS(&h.sid)), + json_string_create(local_address)); + error = ovsdb_log_write_and_free(log, raft_header_to_json(&h)); + raft_header_uninit(&h); + if (!error) { + error = ovsdb_log_commit_block(log); + } + ovsdb_log_close(log); + + return error; +} + +/* Creates a database file that represents a new server in an existing Raft + * cluster. + * + * Creates the local copy of the cluster's log in 'file_name', which must not + * already exist. Gives it the name 'name', which must be the same name + * passed in to raft_create_cluster() earlier. + * + * 'cid' is optional. If specified, the new server will join only the cluster + * with the given cluster ID. + * + * The new server is located at 'local_address', which must take one of the + * forms "tcp:IP[:PORT]" or "ssl:IP[:PORT]", where IP is an IPv4 address or a + * square bracket enclosed IPv6 address. PORT, if present, is a port number + * that defaults to RAFT_PORT. + * + * Joining the cluster requiring contacting it. Thus, 'remote_addresses' + * specifies the addresses of existing servers in the cluster. One server out + * of the existing cluster is sufficient, as long as that server is reachable + * and not partitioned from the current cluster leader. If multiple servers + * from the cluster are specified, then it is sufficient for any of them to + * meet this criterion. + * + * This only creates the on-disk file and does no network access. Use + * raft_open() to start operating the new server. (Until this happens, the + * new server has not joined the cluster.) 
+ * + * Returns null if successful, otherwise an ovsdb_error describing the + * problem. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_join_cluster(const char *file_name, + const char *name, const char *local_address, + const struct sset *remote_addresses, + const struct uuid *cid) +{ + ovs_assert(!sset_is_empty(remote_addresses)); + + /* Parse and verify validity of the addresses. */ + struct ovsdb_error *error = raft_address_validate(local_address); + if (error) { + return error; + } + const char *addr; + SSET_FOR_EACH (addr, remote_addresses) { + error = raft_address_validate(addr); + if (error) { + return error; + } + if (!strcmp(addr, local_address)) { + return ovsdb_error(NULL, "remote addresses cannot be the same " + "as the local address"); + } + } + + /* Verify validity of the cluster ID (if provided). */ + if (cid && uuid_is_zero(cid)) { + return ovsdb_error(NULL, "all-zero UUID is not valid cluster ID"); + } + + /* Create log file. */ + struct ovsdb_log *log; + error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_CREATE_EXCL, + -1, &log); + if (error) { + return error; + } + + /* Write log file. */ + struct raft_header h = { + .sid = uuid_random(), + .cid = cid ? *cid : UUID_ZERO, + .name = xstrdup(name), + .local_address = xstrdup(local_address), + .joining = true, + }; + sset_clone(&h.remote_addresses, remote_addresses); + error = ovsdb_log_write_and_free(log, raft_header_to_json(&h)); + raft_header_uninit(&h); + if (!error) { + error = ovsdb_log_commit_block(log); + } + ovsdb_log_close(log); + + return error; +} + +/* Reads the initial header record from 'log', which must be a Raft clustered + * database log, and populates '*md' with the information read from it. The + * caller must eventually destroy 'md'. + * + * On success, returns NULL. On failure, returns a error that the caller must + * eventually destroy and zeros '*md'. 
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_read_metadata(struct ovsdb_log *log, struct raft_metadata *md) +{ + struct raft *raft = raft_alloc(); + raft->log = log; + + struct ovsdb_error *error = raft_read_header(raft); + if (!error) { + md->sid = raft->sid; + md->name = xstrdup(raft->name); + md->local = xstrdup(raft->local_address); + md->cid = raft->cid; + } else { + memset(md, 0, sizeof *md); + } + + raft->log = NULL; + raft_close(raft); + return error; +} + +/* Frees the metadata in 'md'. */ +void +raft_metadata_destroy(struct raft_metadata *md) +{ + if (md) { + free(md->name); + free(md->local); + } +} + +static const struct raft_entry * +raft_get_entry(const struct raft *raft, uint64_t index) +{ + ovs_assert(index >= raft->log_start); + ovs_assert(index < raft->log_end); + return &raft->entries[index - raft->log_start]; +} + +static uint64_t +raft_get_term(const struct raft *raft, uint64_t index) +{ + return (index == raft->log_start - 1 + ? raft->snap.term + : raft_get_entry(raft, index)->term); +} + +static struct json * +raft_servers_for_index(const struct raft *raft, uint64_t index) +{ + ovs_assert(index >= raft->log_start - 1); + ovs_assert(index < raft->log_end); + + const struct json *servers = raft->snap.servers; + for (uint64_t i = raft->log_start; i <= index; i++) { + const struct raft_entry *e = raft_get_entry(raft, i); + if (e->servers) { + servers = e->servers; + } + } + return json_clone(servers); +} + +static void +raft_set_servers(struct raft *raft, const struct hmap *new_servers, + enum vlog_level level) +{ + struct raft_server *s, *next; + HMAP_FOR_EACH_SAFE (s, next, hmap_node, &raft->servers) { + if (!raft_server_find(new_servers, &s->sid)) { + ovs_assert(s != raft->remove_server); + + hmap_remove(&raft->servers, &s->hmap_node); + VLOG(level, "server %s removed from configuration", s->nickname); + raft_server_destroy(s); + } + } + + HMAP_FOR_EACH_SAFE (s, next, hmap_node, new_servers) { + if (!raft_find_server(raft, &s->sid)) { 
+ VLOG(level, "server %s added to configuration", s->nickname); + + struct raft_server *new + = raft_server_add(&raft->servers, &s->sid, s->address); + raft_server_init_leader(raft, new); + } + } +} + +static uint64_t +raft_add_entry(struct raft *raft, + uint64_t term, struct json *data, const struct uuid *eid, + struct json *servers) +{ + if (raft->log_end - raft->log_start >= raft->allocated_log) { + raft->entries = x2nrealloc(raft->entries, &raft->allocated_log, + sizeof *raft->entries); + } + + uint64_t index = raft->log_end++; + struct raft_entry *entry = &raft->entries[index - raft->log_start]; + entry->term = term; + entry->data = data; + entry->eid = eid ? *eid : UUID_ZERO; + entry->servers = servers; + return index; +} + +/* Writes a RAFT_REC_ENTRY record for 'term', 'data', 'eid', 'servers' to + * 'raft''s log and returns an error indication. */ +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_write_entry(struct raft *raft, uint64_t term, struct json *data, + const struct uuid *eid, struct json *servers) +{ + struct raft_record r = { + .type = RAFT_REC_ENTRY, + .term = term, + .entry = { + .index = raft_add_entry(raft, term, data, eid, servers), + .data = data, + .servers = servers, + .eid = eid ? *eid : UUID_ZERO, + }, + }; + return ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)); +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_write_state(struct ovsdb_log *log, + uint64_t term, const struct uuid *vote) +{ + struct raft_record r = { .term = term }; + if (vote && !uuid_is_zero(vote)) { + r.type = RAFT_REC_VOTE; + r.sid = *vote; + } else { + r.type = RAFT_REC_TERM; + } + return ovsdb_log_write_and_free(log, raft_record_to_json(&r)); +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_apply_record(struct raft *raft, unsigned long long int rec_idx, + const struct raft_record *r) +{ + /* Apply "term", which is present in most kinds of records (and otherwise + * 0). 
+ * + * A Raft leader can replicate entries from previous terms to the other + * servers in the cluster, retaining the original terms on those entries + * (see section 3.6.2 "Committing entries from previous terms" for more + * information), so it's OK for the term in a log record to precede the + * current term. */ + if (r->term > raft->term) { + raft->term = raft->synced_term = r->term; + raft->vote = raft->synced_vote = UUID_ZERO; + } + + switch (r->type) { + case RAFT_REC_ENTRY: + if (r->entry.index < raft->commit_index) { + return ovsdb_error(NULL, "record %llu attempts to truncate log " + "from %"PRIu64" to %"PRIu64" entries, but " + "commit index is already %"PRIu64, + rec_idx, raft->log_end, r->entry.index, + raft->commit_index); + } else if (r->entry.index > raft->log_end) { + return ovsdb_error(NULL, "record %llu with index %"PRIu64" skips " + "past expected index %"PRIu64, + rec_idx, r->entry.index, raft->log_end); + } + + if (r->entry.index < raft->log_end) { + /* This can happen, but it is notable. */ + VLOG_DBG("record %llu truncates log from %"PRIu64" to %"PRIu64 + " entries", rec_idx, raft->log_end, r->entry.index); + raft_truncate(raft, r->entry.index); + } + + uint64_t prev_term = (raft->log_end > raft->log_start + ? 
raft->entries[raft->log_end + - raft->log_start - 1].term + : raft->snap.term); + if (r->term < prev_term) { + return ovsdb_error(NULL, "record %llu with index %"PRIu64" term " + "%"PRIu64" precedes previous entry's term " + "%"PRIu64, + rec_idx, r->entry.index, r->term, prev_term); + } + + raft->log_synced = raft_add_entry( + raft, r->term, + json_nullable_clone(r->entry.data), &r->entry.eid, + json_nullable_clone(r->entry.servers)); + return NULL; + + case RAFT_REC_TERM: + return NULL; + + case RAFT_REC_VOTE: + if (r->term < raft->term) { + return ovsdb_error(NULL, "record %llu votes for term %"PRIu64" " + "but current term is %"PRIu64, + rec_idx, r->term, raft->term); + } else if (!uuid_is_zero(&raft->vote) + && !uuid_equals(&raft->vote, &r->sid)) { + return ovsdb_error(NULL, "record %llu votes for "SID_FMT" in term " + "%"PRIu64" but a previous record for the " + "same term voted for "SID_FMT, rec_idx, + SID_ARGS(&raft->vote), r->term, + SID_ARGS(&r->sid)); + } else { + raft->vote = raft->synced_vote = r->sid; + return NULL; + } + break; + + case RAFT_REC_NOTE: + if (!strcmp(r->note, "left")) { + return ovsdb_error(NULL, "record %llu indicates server has left " + "the cluster; it cannot be added back (use " + "\"ovsdb-tool join-cluster\" to add a new " + "server)", rec_idx); + } + return NULL; + + case RAFT_REC_COMMIT_INDEX: + if (r->commit_index < raft->commit_index) { + return ovsdb_error(NULL, "record %llu regresses commit index " + "from %"PRIu64 " to %"PRIu64, + rec_idx, raft->commit_index, r->commit_index); + } else if (r->commit_index >= raft->log_end) { + return ovsdb_error(NULL, "record %llu advances commit index to " + "%"PRIu64 " but last log index is %"PRIu64, + rec_idx, r->commit_index, raft->log_end - 1); + } else { + raft->commit_index = r->commit_index; + return NULL; + } + break; + + case RAFT_REC_LEADER: + /* XXX we could use this to take back leadership for quick restart */ + return NULL; + + default: + OVS_NOT_REACHED(); + } +} + +static 
struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_read_header(struct raft *raft) +{ + /* Read header record. */ + struct json *json; + struct ovsdb_error *error = ovsdb_log_read(raft->log, &json); + if (error || !json) { + /* Report error or end-of-file. */ + return error; + } + ovsdb_log_mark_base(raft->log); + + struct raft_header h; + error = raft_header_from_json(&h, json); + if (error) { + return error; + } + + raft->sid = h.sid; + raft->cid = h.cid; + raft->name = xstrdup(h.name); + raft->local_address = xstrdup(h.local_address); + raft->local_nickname = raft_address_to_nickname(h.local_address, &h.sid); + raft->joining = h.joining; + + if (h.joining) { + sset_clone(&raft->remote_addresses, &h.remote_addresses); + } else { + raft_entry_clone(&raft->snap, &h.snap); + raft->log_start = raft->log_end = h.snap_index + 1; + raft->commit_index = h.snap_index; + raft->last_applied = h.snap_index - 1; + } + + raft_header_uninit(&h); + + return NULL; +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_read_log(struct raft *raft) +{ + for (unsigned long long int i = 1; ; i++) { + struct json *json; + struct ovsdb_error *error = ovsdb_log_read(raft->log, &json); + if (!json) { + if (error) { + /* We assume that the error is due to a partial write while + * appending to the file before a crash, so log it and + * continue. */ + char *error_string = ovsdb_error_to_string_free(error); + VLOG_WARN("%s", error_string); + free(error_string); + error = NULL; + } + break; + } + + struct raft_record r; + error = raft_record_from_json(&r, json); + if (!error) { + error = raft_apply_record(raft, i, &r); + raft_record_uninit(&r); + } + if (error) { + return ovsdb_wrap_error(error, "error reading record %llu from " + "%s log", i, raft->name); + } + } + + /* Set the most recent servers. 
*/ + raft_get_servers_from_log(raft, VLL_DBG); + + return NULL; +} + +static void +raft_reset_timer(struct raft *raft) +{ + unsigned int duration = (ELECTION_BASE_MSEC + + random_range(ELECTION_RANGE_MSEC)); + raft->election_base = time_msec(); + raft->election_timeout = raft->election_base + duration; +} + +static void +raft_add_conn(struct raft *raft, struct jsonrpc_session *js, + const struct uuid *sid, bool incoming) +{ + struct raft_conn *conn = xzalloc(sizeof *conn); + ovs_list_push_back(&raft->conns, &conn->list_node); + conn->js = js; + if (sid) { + conn->sid = *sid; + } + conn->nickname = raft_address_to_nickname(jsonrpc_session_get_name(js), + &conn->sid); + conn->incoming = incoming; + conn->js_seqno = jsonrpc_session_get_seqno(conn->js); +} + +/* Starts the local server in an existing Raft cluster, using the local copy of + * the cluster's log in 'file_name'. Takes ownership of 'log', whether + * successful or not. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_open(struct ovsdb_log *log, struct raft **raftp) +{ + struct raft *raft = raft_alloc(); + raft->log = log; + + struct ovsdb_error *error = raft_read_header(raft); + if (error) { + goto error; + } + + if (!raft->joining) { + error = raft_read_log(raft); + if (error) { + goto error; + } + + /* Find our own server. */ + if (!raft_find_server(raft, &raft->sid)) { + error = ovsdb_error(NULL, "server does not belong to cluster"); + goto error; + } + + /* If there's only one server, start an election right away so that the + * cluster bootstraps quickly. */ + if (hmap_count(&raft->servers) == 1) { + raft_start_election(raft, false); + } + } else { + raft->join_timeout = time_msec() + 1000; + } + + *raftp = raft; + hmap_insert(&all_rafts, &raft->hmap_node, hash_string(raft->name, 0)); + return NULL; + +error: + raft_close(raft); + *raftp = NULL; + return error; +} + +/* Returns the name of 'raft', which in OVSDB is the database schema name. 
*/ +const char * +raft_get_name(const struct raft *raft) +{ + return raft->name; +} + +/* Returns the cluster ID of 'raft'. If 'raft' has not yet completed joining + * its cluster, then 'cid' will be all-zeros (unless the administrator + * specified a cluster ID running "ovsdb-tool join-cluster"). + * + * Each cluster has a unique cluster ID. */ +const struct uuid * +raft_get_cid(const struct raft *raft) +{ + return &raft->cid; +} + +/* Returns the server ID of 'raft'. Each server has a unique server ID. */ +const struct uuid * +raft_get_sid(const struct raft *raft) +{ + return &raft->sid; +} + +/* Returns true if 'raft' has completed joining its cluster, has not left or + * initiated leaving the cluster, does not have failed disk storage, and is + * apparently connected to the leader in a healthy way (or is itself the + * leader).*/ +bool +raft_is_connected(const struct raft *raft) +{ + return (raft->role != RAFT_CANDIDATE + && !raft->joining + && !raft->leaving + && !raft->left + && !raft->failed); +} + +/* Returns true if 'raft' is the cluster leader. */ +bool +raft_is_leader(const struct raft *raft) +{ + return raft->role == RAFT_LEADER; +} + +/* Returns true if 'raft' is the process of joining its cluster. */ +bool +raft_is_joining(const struct raft *raft) +{ + return raft->joining; +} + +/* Only returns *connected* connections. 
*/ +static struct raft_conn * +raft_find_conn_by_sid(struct raft *raft, const struct uuid *sid) +{ + if (!uuid_is_zero(sid)) { + struct raft_conn *conn; + LIST_FOR_EACH (conn, list_node, &raft->conns) { + if (uuid_equals(sid, &conn->sid) + && jsonrpc_session_is_connected(conn->js)) { + return conn; + } + } + } + return NULL; +} + +static struct raft_conn * +raft_find_conn_by_address(struct raft *raft, const char *address) +{ + struct raft_conn *conn; + LIST_FOR_EACH (conn, list_node, &raft->conns) { + if (!strcmp(jsonrpc_session_get_name(conn->js), address)) { + return conn; + } + } + return NULL; +} + +static void OVS_PRINTF_FORMAT(3, 4) +raft_record_note(struct raft *raft, const char *note, + const char *comment_format, ...) +{ + va_list args; + va_start(args, comment_format); + char *comment = xvasprintf(comment_format, args); + va_end(args); + + struct raft_record r = { + .type = RAFT_REC_NOTE, + .comment = comment, + .note = CONST_CAST(char *, note), + }; + ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); + + free(comment); +} + +/* If we're leader, try to transfer leadership to another server, logging + * 'reason' as the human-readable reason (it should be a phrase suitable for + * following "because") . 
*/ +void +raft_transfer_leadership(struct raft *raft, const char *reason) +{ + if (raft->role != RAFT_LEADER) { + return; + } + + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (!uuid_equals(&raft->sid, &s->sid) + && s->phase == RAFT_PHASE_STABLE) { + struct raft_conn *conn = raft_find_conn_by_sid(raft, &s->sid); + if (!conn) { + continue; + } + + union raft_rpc rpc = { + .become_leader = { + .common = { + .comment = CONST_CAST(char *, reason), + .type = RAFT_RPC_BECOME_LEADER, + .sid = s->sid, + }, + .term = raft->term, + } + }; + raft_send__(raft, &rpc, conn); + + raft_record_note(raft, "transfer leadership", + "transferring leadership to %s because %s", + s->nickname, reason); + break; + } + } +} + +/* Send a RemoveServerRequest to the rest of the servers in the cluster. + * + * If we know which server is the leader, we can just send the request to it. + * However, we might not know which server is the leader, and we might never + * find out if the remove request was actually previously committed by a + * majority of the servers (because in that case the new leader will not send + * AppendRequests or heartbeats to us). Therefore, we instead send + * RemoveRequests to every server. This theoretically has the same problem, if + * the current cluster leader was not previously a member of the cluster, but + * it seems likely to be more robust in practice. */ +static void +raft_send_remove_server_requests(struct raft *raft) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_INFO_RL(&rl, "sending remove request (joining=%s, leaving=%s)", + raft->joining ? "true" : "false", + raft->leaving ? 
"true" : "false"); + const struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (!uuid_equals(&s->sid, &raft->sid)) { + union raft_rpc rpc = (union raft_rpc) { + .remove_server_request = { + .common = { + .type = RAFT_RPC_REMOVE_SERVER_REQUEST, + .sid = s->sid, + }, + .sid = raft->sid, + }, + }; + raft_send(raft, &rpc); + } + } + + raft->leave_timeout = time_msec() + ELECTION_BASE_MSEC; +} + +/* Attempts to start 'raft' leaving its cluster. The caller can check progress + * using raft_is_leaving() and raft_left(). */ +void +raft_leave(struct raft *raft) +{ + if (raft->joining || raft->failed || raft->leaving || raft->left) { + return; + } + VLOG_INFO(SID_FMT": starting to leave cluster "CID_FMT, + SID_ARGS(&raft->sid), CID_ARGS(&raft->cid)); + raft->leaving = true; + raft_transfer_leadership(raft, "this server is leaving the cluster"); + raft_become_follower(raft); + raft_send_remove_server_requests(raft); + raft->leave_timeout = time_msec() + ELECTION_BASE_MSEC; + raft->leaving = true; +} + +/* Returns true if 'raft' is currently attempting to leave its cluster. */ +bool +raft_is_leaving(const struct raft *raft) +{ + return raft->leaving; +} + +/* Returns true if 'raft' is successfully left its cluster. */ +bool +raft_left(const struct raft *raft) +{ + return raft->left; +} + +/* Returns true if 'raft' has experienced a disk I/O failure. When this + * returns true, only closing and reopening 'raft' allows for recovery. */ +bool +raft_failed(const struct raft *raft) +{ + return raft->failed; +} + +/* Forces 'raft' to attempt to take leadership of the cluster by deposing the + * current cluster. 
*/ +void +raft_take_leadership(struct raft *raft) +{ + if (raft->role != RAFT_LEADER) { + raft_start_election(raft, true); + } +} + +static void +raft_close__(struct raft *raft) +{ + if (!hmap_node_is_null(&raft->hmap_node)) { + hmap_remove(&all_rafts, &raft->hmap_node); + hmap_node_nullify(&raft->hmap_node); + } + + raft_complete_all_commands(raft, RAFT_CMD_SHUTDOWN); + + struct raft_server *rs = raft->remove_server; + if (rs) { + raft_send_remove_server_reply__(raft, &rs->sid, &rs->requester_sid, + rs->requester_conn, false, + RAFT_SERVER_SHUTDOWN); + raft_server_destroy(raft->remove_server); + } + + struct raft_conn *conn, *next; + LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) { + jsonrpc_session_close(conn->js); + ovs_list_remove(&conn->list_node); + free(conn->nickname); + free(conn); + } +} + +/* Closes and frees 'raft'. + * + * A server's cluster membership is independent of whether the server is + * actually running. When a server that is a member of a cluster closes, the + * cluster treats this as a server failure. 
*/ +void +raft_close(struct raft *raft) +{ + if (!raft) { + return; + } + + raft_transfer_leadership(raft, "this server is shutting down"); + + raft_close__(raft); + + ovsdb_log_close(raft->log); + + raft_servers_destroy(&raft->servers); + + for (uint64_t index = raft->log_start; index < raft->log_end; index++) { + struct raft_entry *e = &raft->entries[index - raft->log_start]; + raft_entry_uninit(e); + } + free(raft->entries); + + raft_entry_uninit(&raft->snap); + + raft_servers_destroy(&raft->add_servers); + + sset_destroy(&raft->remote_addresses); + free(raft->local_address); + free(raft->local_nickname); + free(raft->name); + + free(raft); +} + +static bool +raft_conn_receive(struct raft *raft, struct raft_conn *conn, + union raft_rpc *rpc) +{ + struct jsonrpc_msg *msg = jsonrpc_session_recv(conn->js); + if (!msg) { + return false; + } + + struct ovsdb_error *error = raft_rpc_from_jsonrpc(&raft->cid, &raft->sid, + msg, rpc); + jsonrpc_msg_destroy(msg); + if (error) { + char *s = ovsdb_error_to_string_free(error); + VLOG_INFO("%s: %s", jsonrpc_session_get_name(conn->js), s); + free(s); + return false; + } + + if (uuid_is_zero(&conn->sid)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(50, 50); + conn->sid = rpc->common.sid; + VLOG_INFO_RL(&rl, "%s: learned server ID "SID_FMT, + jsonrpc_session_get_name(conn->js), SID_ARGS(&conn->sid)); + } else if (!uuid_equals(&conn->sid, &rpc->common.sid)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, + "%s: remote server ID changed from "SID_FMT" to "SID_FMT, + jsonrpc_session_get_name(conn->js), + SID_ARGS(&conn->sid), SID_ARGS(&rpc->common.sid)); + } + + const char *address = (rpc->type == RAFT_RPC_HELLO_REQUEST + ? rpc->hello_request.address + : rpc->type == RAFT_RPC_ADD_SERVER_REQUEST + ? 
rpc->add_server_request.address + : NULL); + if (address) { + char *new_nickname = raft_address_to_nickname(address, &conn->sid); + if (strcmp(conn->nickname, new_nickname)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(50, 50); + VLOG_INFO_RL(&rl, "%s: learned remote address %s", + jsonrpc_session_get_name(conn->js), address); + + free(conn->nickname); + conn->nickname = new_nickname; + } else { + free(new_nickname); + } + } + + return true; +} + +static const char * +raft_get_nickname(const struct raft *raft, const struct uuid *sid, + char buf[SID_LEN + 1], size_t bufsize) +{ + if (uuid_equals(sid, &raft->sid)) { + return raft->local_nickname; + } + + const char *s = raft_servers_get_nickname__(&raft->servers, sid); + if (s) { + return s; + } + + return raft_servers_get_nickname(&raft->add_servers, sid, buf, bufsize); +} + +static void +log_rpc(const union raft_rpc *rpc, + const char *direction, const struct raft_conn *conn) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(600, 600); + if (!raft_rpc_is_heartbeat(rpc) && !VLOG_DROP_DBG(&rl)) { + struct ds s = DS_EMPTY_INITIALIZER; + raft_rpc_format(rpc, &s); + VLOG_DBG("%s%s %s", direction, conn->nickname, ds_cstr(&s)); + ds_destroy(&s); + } +} + +static void +raft_send_add_server_request(struct raft *raft, struct raft_conn *conn) +{ + union raft_rpc rq = { + .add_server_request = { + .common = { + .type = RAFT_RPC_ADD_SERVER_REQUEST, + .sid = UUID_ZERO, + .comment = NULL, + }, + .address = raft->local_address, + }, + }; + raft_send__(raft, &rq, conn); +} + +static void +raft_conn_run(struct raft *raft, struct raft_conn *conn) +{ + jsonrpc_session_run(conn->js); + + unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js); + bool just_connected = (new_seqno != conn->js_seqno + && jsonrpc_session_is_connected(conn->js)); + conn->js_seqno = new_seqno; + if (just_connected) { + if (raft->joining) { + raft_send_add_server_request(raft, conn); + } else if (raft->leaving) { + union raft_rpc 
rq = { + .remove_server_request = { + .common = { + .type = RAFT_RPC_REMOVE_SERVER_REQUEST, + .sid = conn->sid, + }, + .sid = raft->sid, + }, + }; + raft_send__(raft, &rq, conn); + } else { + union raft_rpc rq = (union raft_rpc) { + .hello_request = { + .common = { + .type = RAFT_RPC_HELLO_REQUEST, + .sid = conn->sid, + }, + .address = raft->local_address, + }, + }; + raft_send__(raft, &rq, conn); + } + } + + for (size_t i = 0; i < 50; i++) { + union raft_rpc rpc; + if (!raft_conn_receive(raft, conn, &rpc)) { + break; + } + + log_rpc(&rpc, "<--", conn); + raft_handle_rpc(raft, &rpc); + raft_rpc_uninit(&rpc); + } +} + +static void +raft_waiter_complete_rpc(struct raft *raft, const union raft_rpc *rpc) +{ + uint64_t term = raft_rpc_get_term(rpc); + if (term && term < raft->term) { + /* Drop the message because it's for an expired term. */ + return; + } + + if (!raft_is_rpc_synced(raft, rpc)) { + /* This is a bug. A reply message is deferred because some state in + * the message, such as a term or index, has not been committed to + * disk, and they should only be completed when that commit is done. + * But this message is being completed before the commit is finished. + * Complain, and hope that someone reports the bug. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + if (VLOG_DROP_ERR(&rl)) { + return; + } + + struct ds s = DS_EMPTY_INITIALIZER; + + if (term > raft->synced_term) { + ds_put_format(&s, " because message term %"PRIu64" is " + "past synced term %"PRIu64, + term, raft->synced_term); + } + + uint64_t index = raft_rpc_get_min_sync_index(rpc); + if (index > raft->log_synced) { + ds_put_format(&s, " %s message index %"PRIu64" is past last " + "synced index %"PRIu64, + s.length ? 
"and" : "because", + index, raft->log_synced); + } + + const struct uuid *vote = raft_rpc_get_vote(rpc); + if (vote && !uuid_equals(vote, &raft->synced_vote)) { + char buf1[SID_LEN + 1]; + char buf2[SID_LEN + 1]; + ds_put_format(&s, " %s vote %s differs from synced vote %s", + s.length ? "and" : "because", + raft_get_nickname(raft, vote, buf1, sizeof buf1), + raft_get_nickname(raft, &raft->synced_vote, + buf2, sizeof buf2)); + } + + char buf[SID_LEN + 1]; + ds_put_format(&s, ": %s ", + raft_get_nickname(raft, &rpc->common.sid, + buf, sizeof buf)); + raft_rpc_format(rpc, &s); + VLOG_ERR("internal error: deferred %s message completed " + "but not ready to send%s", + raft_rpc_type_to_string(rpc->type), ds_cstr(&s)); + ds_destroy(&s); + + return; + } + + struct raft_conn *dst = raft_find_conn_by_sid(raft, &rpc->common.sid); + if (dst) { + raft_send__(raft, rpc, dst); + } +} + +static void +raft_waiter_complete(struct raft *raft, struct raft_waiter *w) +{ + switch (w->type) { + case RAFT_W_ENTRY: + if (raft->role == RAFT_LEADER) { + raft_update_our_match_index(raft, w->entry.index); + } + raft->log_synced = w->entry.index; + break; + + case RAFT_W_TERM: + raft->synced_term = w->term.term; + raft->synced_vote = w->term.vote; + break; + + case RAFT_W_RPC: + raft_waiter_complete_rpc(raft, w->rpc); + break; + } +} + +static void +raft_waiter_destroy(struct raft_waiter *w) +{ + if (!w) { + return; + } + + switch (w->type) { + case RAFT_W_ENTRY: + case RAFT_W_TERM: + break; + + case RAFT_W_RPC: + raft_rpc_uninit(w->rpc); + free(w->rpc); + break; + } + free(w); +} + +static void +raft_waiters_run(struct raft *raft) +{ + if (ovs_list_is_empty(&raft->waiters)) { + return; + } + + uint64_t cur = ovsdb_log_commit_progress(raft->log); + struct raft_waiter *w, *next; + LIST_FOR_EACH_SAFE (w, next, list_node, &raft->waiters) { + if (cur < w->commit_ticket) { + break; + } + raft_waiter_complete(raft, w); + ovs_list_remove(&w->list_node); + raft_waiter_destroy(w); + } +} + +static void 
+raft_waiters_wait(struct raft *raft) +{ + struct raft_waiter *w; + LIST_FOR_EACH (w, list_node, &raft->waiters) { + ovsdb_log_commit_wait(raft->log, w->commit_ticket); + break; + } +} + +static bool OVS_WARN_UNUSED_RESULT +raft_set_term(struct raft *raft, uint64_t term, const struct uuid *vote) +{ + struct ovsdb_error *error = raft_write_state(raft->log, term, vote); + if (!raft_handle_write_error(raft, error)) { + return false; + } + + struct raft_waiter *w = raft_waiter_create(raft, RAFT_W_TERM, true); + raft->term = w->term.term = term; + raft->vote = w->term.vote = vote ? *vote : UUID_ZERO; + return true; +} + +static void +raft_accept_vote(struct raft *raft, struct raft_server *s, + const struct uuid *vote) +{ + if (uuid_equals(&s->vote, vote)) { + return; + } + if (!uuid_is_zero(&s->vote)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + char buf1[SID_LEN + 1]; + char buf2[SID_LEN + 1]; + VLOG_WARN_RL(&rl, "server %s changed its vote from %s to %s", + s->nickname, + raft_get_nickname(raft, &s->vote, buf1, sizeof buf1), + raft_get_nickname(raft, vote, buf2, sizeof buf2)); + } + s->vote = *vote; + if (uuid_equals(vote, &raft->sid) + && ++raft->n_votes > hmap_count(&raft->servers) / 2) { + raft_become_leader(raft); + } +} + +static void +raft_start_election(struct raft *raft, bool leadership_transfer) +{ + if (raft->leaving) { + return; + } + + struct raft_server *me = raft_find_server(raft, &raft->sid); + if (!me) { + return; + } + + if (!raft_set_term(raft, raft->term + 1, &raft->sid)) { + return; + } + + raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP); + + ovs_assert(raft->role != RAFT_LEADER); + ovs_assert(hmap_is_empty(&raft->commands)); + raft->role = RAFT_CANDIDATE; + + raft->n_votes = 0; + + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + if (!VLOG_DROP_INFO(&rl)) { + long long int now = time_msec(); + if (now >= raft->election_timeout) { + VLOG_INFO("term %"PRIu64": %lld ms timeout expired, " + "starting 
election", + raft->term, now - raft->election_base); + } else { + VLOG_INFO("term %"PRIu64": starting election", raft->term); + } + } + raft_reset_timer(raft); + + struct raft_server *peer; + HMAP_FOR_EACH (peer, hmap_node, &raft->servers) { + peer->vote = UUID_ZERO; + if (uuid_equals(&raft->sid, &peer->sid)) { + continue; + } + + union raft_rpc rq = { + .vote_request = { + .common = { + .type = RAFT_RPC_VOTE_REQUEST, + .sid = peer->sid, + }, + .term = raft->term, + .last_log_index = raft->log_end - 1, + .last_log_term = ( + raft->log_end > raft->log_start + ? raft->entries[raft->log_end - raft->log_start - 1].term + : raft->snap.term), + .leadership_transfer = leadership_transfer, + }, + }; + raft_send(raft, &rq); + } + + /* Vote for ourselves. */ + raft_accept_vote(raft, me, &raft->sid); +} + +static void +raft_open_conn(struct raft *raft, const char *address, const struct uuid *sid) +{ + if (strcmp(address, raft->local_address) + && !raft_find_conn_by_address(raft, address)) { + raft_add_conn(raft, jsonrpc_session_open(address, true), sid, false); + } +} + +/* Returns true if 'conn' should stay open, 'conn' if it should be closed. */ +static bool +raft_conn_should_stay_open(struct raft *raft, struct raft_conn *conn) +{ + /* Close the connection if it's actually dead. If necessary, we'll + * initiate a new session later. */ + if (!jsonrpc_session_is_alive(conn->js)) { + return false; + } + + /* Keep incoming sessions. We trust the originator to decide to drop + * it. */ + if (conn->incoming) { + return true; + } + + /* If we are joining the cluster, keep sessions to the remote addresses + * that are supposed to be part of the cluster we're joining. */ + if (raft->joining && sset_contains(&raft->remote_addresses, + jsonrpc_session_get_name(conn->js))) { + return true; + } + + /* We have joined the cluster. If we did that "recently", then there is a + * chance that we do not have the most recent server configuration log + * entry. 
If so, it's a waste to disconnect from the servers that were in + * remote_addresses and that will probably appear in the configuration, + * just to reconnect to them a moment later when we do get the + * configuration update. If we are not ourselves in the configuration, + * then we know that there must be a new configuration coming up, so in + * that case keep the connection. */ + if (!raft_find_server(raft, &raft->sid)) { + return true; + } + + /* Keep the connection only if the server is part of the configuration. */ + return raft_find_server(raft, &conn->sid); +} + +/* Allows 'raft' to maintain the distributed log. Call this function as part + * of the process's main loop. */ +void +raft_run(struct raft *raft) +{ + if (raft->left || raft->failed) { + return; + } + + raft_waiters_run(raft); + + if (!raft->listener && time_msec() >= raft->listen_backoff) { + char *paddr = raft_make_address_passive(raft->local_address); + int error = pstream_open(paddr, &raft->listener, DSCP_DEFAULT); + if (error) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "%s: listen failed (%s)", + paddr, ovs_strerror(error)); + raft->listen_backoff = time_msec() + 1000; + } + free(paddr); + } + + if (raft->listener) { + struct stream *stream; + int error = pstream_accept(raft->listener, &stream); + if (!error) { + raft_add_conn(raft, jsonrpc_session_open_unreliably( + jsonrpc_open(stream), DSCP_DEFAULT), NULL, + true); + } else if (error != EAGAIN) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + VLOG_WARN_RL(&rl, "%s: accept failed: %s", + pstream_get_name(raft->listener), + ovs_strerror(error)); + } + } + + /* Run RPCs for all open sessions. */ + struct raft_conn *conn; + LIST_FOR_EACH (conn, list_node, &raft->conns) { + raft_conn_run(raft, conn); + } + + /* Close unneeded sessions. 
*/ + struct raft_conn *next; + LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) { + if (!raft_conn_should_stay_open(raft, conn)) { + jsonrpc_session_close(conn->js); + ovs_list_remove(&conn->list_node); + free(conn); + } + } + + /* Open needed sessions. */ + struct raft_server *server; + HMAP_FOR_EACH (server, hmap_node, &raft->servers) { + raft_open_conn(raft, server->address, &server->sid); + } + if (raft->joining) { + const char *address; + SSET_FOR_EACH (address, &raft->remote_addresses) { + raft_open_conn(raft, address, NULL); + } + } + + if (!raft->joining && time_msec() >= raft->election_timeout) { + raft_start_election(raft, false); + } + + if (raft->leaving && time_msec() >= raft->leave_timeout) { + raft_send_remove_server_requests(raft); + } + + if (raft->joining && time_msec() >= raft->join_timeout) { + raft->join_timeout = time_msec() + 1000; + struct raft_conn *conn; + LIST_FOR_EACH (conn, list_node, &raft->conns) { + raft_send_add_server_request(raft, conn); + } + } + + if (time_msec() >= raft->ping_timeout) { + if (raft->role == RAFT_LEADER) { + raft_send_heartbeats(raft); + } else { + long long int now = time_msec(); + struct raft_command *cmd, *next; + HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) { + if (cmd->timestamp + && now - cmd->timestamp > ELECTION_BASE_MSEC) { + raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT); + } + } + } + raft->ping_timeout = time_msec() + PING_TIME_MSEC; + } + + /* Do this only at the end; if we did it as soon as we set raft->left or + * raft->failed in handling the RemoveServerReply, then it could easily + * cause references to freed memory in RPC sessions, etc. */ + if (raft->left || raft->failed) { + raft_close__(raft); + } +} + +static void +raft_wait_session(struct jsonrpc_session *js) +{ + if (js) { + jsonrpc_session_wait(js); + jsonrpc_session_recv_wait(js); + } +} + +/* Causes the next call to poll_block() to wake up when 'raft' needs to do + * something. 
*/ +void +raft_wait(struct raft *raft) +{ + if (raft->left || raft->failed) { + return; + } + + raft_waiters_wait(raft); + + if (raft->listener) { + pstream_wait(raft->listener); + } else { + poll_timer_wait_until(raft->listen_backoff); + } + + struct raft_conn *conn; + LIST_FOR_EACH (conn, list_node, &raft->conns) { + raft_wait_session(conn->js); + } + + if (!raft->joining) { + poll_timer_wait_until(raft->election_timeout); + } else { + poll_timer_wait_until(raft->join_timeout); + } + if (raft->leaving) { + poll_timer_wait_until(raft->leave_timeout); + } + if (raft->role == RAFT_LEADER || !hmap_is_empty(&raft->commands)) { + poll_timer_wait_until(raft->ping_timeout); + } +} + +static struct raft_waiter * +raft_waiter_create(struct raft *raft, enum raft_waiter_type type, + bool start_commit) +{ + struct raft_waiter *w = xzalloc(sizeof *w); + ovs_list_push_back(&raft->waiters, &w->list_node); + w->commit_ticket = start_commit ? ovsdb_log_commit_start(raft->log) : 0; + w->type = type; + return w; +} + +/* Returns a human-readable representation of 'status' (or NULL if 'status' is + * invalid). */ +const char * +raft_command_status_to_string(enum raft_command_status status) +{ + switch (status) { + case RAFT_CMD_INCOMPLETE: + return "operation still in progress"; + case RAFT_CMD_SUCCESS: + return "success"; + case RAFT_CMD_NOT_LEADER: + return "not leader"; + case RAFT_CMD_BAD_PREREQ: + return "prerequisite check failed"; + case RAFT_CMD_LOST_LEADERSHIP: + return "lost leadership"; + case RAFT_CMD_SHUTDOWN: + return "server shutdown"; + case RAFT_CMD_IO_ERROR: + return "I/O error"; + case RAFT_CMD_TIMEOUT: + return "timeout"; + default: + return NULL; + } +} + +/* Converts human-readable status in 's' into status code in '*statusp'. + * Returns true if successful, false if 's' is unknown. 
*/ +bool +raft_command_status_from_string(const char *s, + enum raft_command_status *statusp) +{ + for (enum raft_command_status status = 0; ; status++) { + const char *s2 = raft_command_status_to_string(status); + if (!s2) { + *statusp = 0; + return false; + } else if (!strcmp(s, s2)) { + *statusp = status; + return true; + } + } +} + +static const struct uuid * +raft_get_eid(const struct raft *raft, uint64_t index) +{ + for (; index >= raft->log_start; index--) { + const struct raft_entry *e = raft_get_entry(raft, index); + if (e->data) { + return &e->eid; + } + } + return &raft->snap.eid; +} + +static const struct uuid * +raft_current_eid(const struct raft *raft) +{ + return raft_get_eid(raft, raft->log_end - 1); +} + +static struct raft_command * +raft_command_create_completed(enum raft_command_status status) +{ + ovs_assert(status != RAFT_CMD_INCOMPLETE); + + struct raft_command *cmd = xzalloc(sizeof *cmd); + cmd->n_refs = 1; + cmd->status = status; + return cmd; +} + +static struct raft_command * +raft_command_create_incomplete(struct raft *raft, uint64_t index) +{ + struct raft_command *cmd = xzalloc(sizeof *cmd); + cmd->n_refs = 2; /* One for client, one for raft->commands. */ + cmd->index = index; + cmd->status = RAFT_CMD_INCOMPLETE; + hmap_insert(&raft->commands, &cmd->hmap_node, cmd->index); + return cmd; +} + +static struct raft_command * OVS_WARN_UNUSED_RESULT +raft_command_initiate(struct raft *raft, + const struct json *data, const struct json *servers, + const struct uuid *eid) +{ + /* Write to local log. 
*/ + uint64_t index = raft->log_end; + if (!raft_handle_write_error( + raft, raft_write_entry( + raft, raft->term, json_nullable_clone(data), eid, + json_nullable_clone(servers)))) { + return raft_command_create_completed(RAFT_CMD_IO_ERROR); + } + + struct raft_command *cmd = raft_command_create_incomplete(raft, index); + if (eid) { + cmd->eid = *eid; + } + + raft_waiter_create(raft, RAFT_W_ENTRY, true)->entry.index = cmd->index; + + /* Write to remote logs. */ + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (!uuid_equals(&s->sid, &raft->sid) && s->next_index == index) { + raft_send_append_request(raft, s, 1, "execute command"); + s->next_index++; + } + } + + return cmd; +} + +static struct raft_command * OVS_WARN_UNUSED_RESULT +raft_command_execute__(struct raft *raft, + const struct json *data, const struct json *servers, + const struct uuid *prereq, struct uuid *result) +{ + if (raft->joining || raft->leaving || raft->left || raft->failed) { + return raft_command_create_completed(RAFT_CMD_SHUTDOWN); + } + + if (raft->role != RAFT_LEADER) { + /* Consider proxying the command to the leader. We can only do that if + * we know the leader and the command does not change the set of + * servers. We do not proxy commands without prerequisites, even + * though we could, because in an OVSDB context a log entry doesn't + * make sense without context. */ + if (servers || !data + || raft->role != RAFT_FOLLOWER || uuid_is_zero(&raft->leader_sid) + || !prereq) { + return raft_command_create_completed(RAFT_CMD_NOT_LEADER); + } + } + + struct uuid eid = data ? 
uuid_random() : UUID_ZERO; + if (result) { + *result = eid; + } + + if (raft->role != RAFT_LEADER) { + const union raft_rpc rpc = { + .execute_command_request = { + .common = { + .type = RAFT_RPC_EXECUTE_COMMAND_REQUEST, + .sid = raft->leader_sid, + }, + .data = CONST_CAST(struct json *, data), + .prereq = *prereq, + .result = eid, + } + }; + if (!raft_send(raft, &rpc)) { + /* Couldn't send command, so it definitely failed. */ + return raft_command_create_completed(RAFT_CMD_NOT_LEADER); + } + + struct raft_command *cmd = raft_command_create_incomplete(raft, 0); + cmd->timestamp = time_msec(); + cmd->eid = eid; + return cmd; + } + + const struct uuid *current_eid = raft_current_eid(raft); + if (prereq && !uuid_equals(prereq, current_eid)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_INFO_RL(&rl, "current entry eid "UUID_FMT" does not match " + "prerequisite "UUID_FMT, + UUID_ARGS(current_eid), UUID_ARGS(prereq)); + return raft_command_create_completed(RAFT_CMD_BAD_PREREQ); + } + + return raft_command_initiate(raft, data, servers, &eid); +} + +/* Initiates appending a log entry to 'raft'. The log entry consists of 'data' + * and, if 'prereq' is nonnull, it is only added to the log if the previous + * entry in the log has entry ID 'prereq'. If 'result' is nonnull, it is + * populated with the entry ID for the new log entry. + * + * Returns a "struct raft_command" that may be used to track progress adding + * the log entry. The caller must eventually free the returned structure, with + * raft_command_unref(). */ +struct raft_command * OVS_WARN_UNUSED_RESULT +raft_command_execute(struct raft *raft, const struct json *data, + const struct uuid *prereq, struct uuid *result) +{ + return raft_command_execute__(raft, data, NULL, prereq, result); +} + +/* Returns the status of 'cmd'. 
*/
+enum raft_command_status
+raft_command_get_status(const struct raft_command *cmd)
+{
+    /* The caller must still hold a reference to 'cmd'. */
+    ovs_assert(cmd->n_refs > 0);
+    return cmd->status;
+}
+
+/* Returns the index of the log entry at which 'cmd' was committed.
+ *
+ * This function works only with successful commands: it asserts that the
+ * status of 'cmd' is RAFT_CMD_SUCCESS. */
+uint64_t
+raft_command_get_commit_index(const struct raft_command *cmd)
+{
+    ovs_assert(cmd->n_refs > 0);
+    ovs_assert(cmd->status == RAFT_CMD_SUCCESS);
+    return cmd->index;
+}
+
+/* Drops one reference to 'cmd' and frees it when the last reference is gone.
+ * A null 'cmd' is accepted and ignored. */
+void
+raft_command_unref(struct raft_command *cmd)
+{
+    if (cmd) {
+        ovs_assert(cmd->n_refs > 0);
+        if (!--cmd->n_refs) {
+            free(cmd);
+        }
+    }
+}
+
+/* Causes poll_block() to wake up when 'cmd' has status to report.
+ *
+ * Only the already-finished case is handled here: if 'cmd' is still
+ * RAFT_CMD_INCOMPLETE, this function does nothing. */
+void
+raft_command_wait(const struct raft_command *cmd)
+{
+    if (cmd->status != RAFT_CMD_INCOMPLETE) {
+        poll_immediate_wake();
+    }
+}
+
+/* Finishes 'cmd' with 'status'. If 'cmd->sid' is nonzero, first calls
+ * raft_send_execute_command_reply() for that server and 'cmd->eid', passing
+ * 'cmd->index' as the commit index on RAFT_CMD_SUCCESS and 0 otherwise. Then
+ * removes 'cmd' from 'raft->commands', stores 'status' in it, and drops one
+ * reference.
+ *
+ * 'cmd' must still be RAFT_CMD_INCOMPLETE. */
+static void
+raft_command_complete(struct raft *raft,
+                      struct raft_command *cmd,
+                      enum raft_command_status status)
+{
+    if (!uuid_is_zero(&cmd->sid)) {
+        uint64_t commit_index = status == RAFT_CMD_SUCCESS ?
cmd->index : 0; + raft_send_execute_command_reply(raft, &cmd->sid, &cmd->eid, status, + commit_index); + } + + ovs_assert(cmd->status == RAFT_CMD_INCOMPLETE); + ovs_assert(cmd->n_refs > 0); + hmap_remove(&raft->commands, &cmd->hmap_node); + cmd->status = status; + raft_command_unref(cmd); +} + +static void +raft_complete_all_commands(struct raft *raft, enum raft_command_status status) +{ + struct raft_command *cmd, *next; + HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) { + raft_command_complete(raft, cmd, status); + } +} + +static struct raft_command * +raft_find_command_by_index(struct raft *raft, uint64_t index) +{ + struct raft_command *cmd; + + HMAP_FOR_EACH_IN_BUCKET (cmd, hmap_node, index, &raft->commands) { + if (cmd->index == index) { + return cmd; + } + } + return NULL; +} + +static struct raft_command * +raft_find_command_by_eid(struct raft *raft, const struct uuid *eid) +{ + struct raft_command *cmd; + + HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) { + if (uuid_equals(&cmd->eid, eid)) { + return cmd; + } + } + return NULL; +} + +#define RAFT_RPC(ENUM, NAME) \ + static void raft_handle_##NAME(struct raft *, const struct raft_##NAME *); +RAFT_RPC_TYPES +#undef RAFT_RPC + +static void +raft_handle_hello_request(struct raft *raft OVS_UNUSED, + const struct raft_hello_request *hello OVS_UNUSED) +{ +} + +/* 'sid' is the server being added or removed. */ +static void +raft_send_add_server_reply__(struct raft *raft, const struct uuid *sid, + const char *address, + bool success, const char *comment) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + if (!VLOG_DROP_INFO(&rl)) { + struct ds s = DS_EMPTY_INITIALIZER; + char buf[SID_LEN + 1]; + ds_put_format(&s, "adding %s ("SID_FMT" at %s) " + "to cluster "CID_FMT" %s", + raft_get_nickname(raft, sid, buf, sizeof buf), + SID_ARGS(sid), address, CID_ARGS(&raft->cid), + success ? 
"succeeded" : "failed"); + if (comment) { + ds_put_format(&s, " (%s)", comment); + } + VLOG_INFO("%s", ds_cstr(&s)); + ds_destroy(&s); + } + + union raft_rpc rpy = { + .add_server_reply = { + .common = { + .type = RAFT_RPC_ADD_SERVER_REPLY, + .sid = *sid, + .comment = CONST_CAST(char *, comment), + }, + .success = success, + } + }; + + struct sset *remote_addresses = &rpy.add_server_reply.remote_addresses; + sset_init(remote_addresses); + if (!raft->joining) { + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (!uuid_equals(&s->sid, &raft->sid)) { + sset_add(remote_addresses, s->address); + } + } + } + + raft_send(raft, &rpy); + + sset_destroy(remote_addresses); +} + +static void +raft_send_remove_server_reply_rpc(struct raft *raft, const struct uuid *sid, + bool success, const char *comment) +{ + const union raft_rpc rpy = { + .remove_server_reply = { + .common = { + .type = RAFT_RPC_REMOVE_SERVER_REPLY, + .sid = *sid, + .comment = CONST_CAST(char *, comment), + }, + .success = success, + } + }; + raft_send(raft, &rpy); +} + +static void +raft_send_remove_server_reply__(struct raft *raft, + const struct uuid *target_sid, + const struct uuid *requester_sid, + struct unixctl_conn *requester_conn, + bool success, const char *comment) +{ + struct ds s = DS_EMPTY_INITIALIZER; + ds_put_format(&s, "request "); + if (!uuid_is_zero(requester_sid)) { + char buf[SID_LEN + 1]; + ds_put_format(&s, "by %s", + raft_get_nickname(raft, requester_sid, buf, sizeof buf)); + } else { + ds_put_cstr(&s, "via unixctl"); + } + ds_put_cstr(&s, " to remove "); + if (!requester_conn && uuid_equals(target_sid, requester_sid)) { + ds_put_cstr(&s, "itself"); + } else { + char buf[SID_LEN + 1]; + ds_put_cstr(&s, raft_get_nickname(raft, target_sid, buf, sizeof buf)); + } + ds_put_format(&s, " from cluster "CID_FMT" %s", + CID_ARGS(&raft->cid), + success ? 
"succeeded" : "failed"); + if (comment) { + ds_put_format(&s, " (%s)", comment); + } + + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + VLOG_INFO_RL(&rl, "%s", ds_cstr(&s)); + + /* Send RemoveServerReply to the requester (which could be a server or a + * unixctl connection. Also always send it to the removed server; this + * allows it to be sure that it's really removed and update its log and + * disconnect permanently. */ + if (!uuid_is_zero(requester_sid)) { + raft_send_remove_server_reply_rpc(raft, requester_sid, + success, comment); + } + if (!uuid_equals(requester_sid, target_sid)) { + raft_send_remove_server_reply_rpc(raft, target_sid, success, comment); + } + if (requester_conn) { + if (success) { + unixctl_command_reply(requester_conn, ds_cstr(&s)); + } else { + unixctl_command_reply_error(requester_conn, ds_cstr(&s)); + } + } + + ds_destroy(&s); +} + +static void +raft_send_add_server_reply(struct raft *raft, + const struct raft_add_server_request *rq, + bool success, const char *comment) +{ + return raft_send_add_server_reply__(raft, &rq->common.sid, rq->address, + success, comment); +} + +static void +raft_send_remove_server_reply(struct raft *raft, + const struct raft_remove_server_request *rq, + bool success, const char *comment) +{ + return raft_send_remove_server_reply__(raft, &rq->sid, &rq->common.sid, + rq->requester_conn, success, + comment); +} + +static void +raft_become_follower(struct raft *raft) +{ + raft->leader_sid = UUID_ZERO; + if (raft->role == RAFT_FOLLOWER) { + return; + } + + raft->role = RAFT_FOLLOWER; + raft_reset_timer(raft); + + /* Notify clients about lost leadership. + * + * We do not reverse our changes to 'raft->servers' because the new + * configuration is already part of the log. Possibly the configuration + * log entry will not be committed, but until we know that we must use the + * new configuration. Our AppendEntries processing will properly update + * the server configuration later, if necessary. 
*/ + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) { + raft_send_add_server_reply__(raft, &s->sid, s->address, false, + RAFT_SERVER_LOST_LEADERSHIP); + } + if (raft->remove_server) { + raft_send_remove_server_reply__(raft, &raft->remove_server->sid, + &raft->remove_server->requester_sid, + raft->remove_server->requester_conn, + false, RAFT_SERVER_LOST_LEADERSHIP); + raft_server_destroy(raft->remove_server); + raft->remove_server = NULL; + } + + raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP); +} + +static void +raft_send_append_request(struct raft *raft, + struct raft_server *peer, unsigned int n, + const char *comment) +{ + ovs_assert(raft->role == RAFT_LEADER); + + const union raft_rpc rq = { + .append_request = { + .common = { + .type = RAFT_RPC_APPEND_REQUEST, + .sid = peer->sid, + .comment = CONST_CAST(char *, comment), + }, + .term = raft->term, + .prev_log_index = peer->next_index - 1, + .prev_log_term = (peer->next_index - 1 >= raft->log_start + ? raft->entries[peer->next_index - 1 + - raft->log_start].term + : raft->snap.term), + .leader_commit = raft->commit_index, + .entries = &raft->entries[peer->next_index - raft->log_start], + .n_entries = n, + }, + }; + raft_send(raft, &rq); +} + +static void +raft_send_heartbeats(struct raft *raft) +{ + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (!uuid_equals(&raft->sid, &s->sid)) { + raft_send_append_request(raft, s, 0, "heartbeat"); + } + } + + /* Send anyone waiting for a command to complete a ping to let them + * know we're still working on it. 
*/ + struct raft_command *cmd; + HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) { + if (!uuid_is_zero(&cmd->sid)) { + raft_send_execute_command_reply(raft, &cmd->sid, + &cmd->eid, + RAFT_CMD_INCOMPLETE, 0); + } + } +} + +static void +raft_server_init_leader(struct raft *raft, struct raft_server *s) +{ + s->next_index = raft->log_end; + s->match_index = 0; + s->phase = RAFT_PHASE_STABLE; +} + +static void +raft_become_leader(struct raft *raft) +{ + raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP); + + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_INFO_RL(&rl, "term %"PRIu64": elected leader by %d+ of " + "%"PRIuSIZE" servers", raft->term, + raft->n_votes, hmap_count(&raft->servers)); + + ovs_assert(raft->role != RAFT_LEADER); + raft->role = RAFT_LEADER; + raft->leader_sid = raft->sid; + raft->election_timeout = LLONG_MAX; + raft->ping_timeout = time_msec() + PING_TIME_MSEC; + + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + raft_server_init_leader(raft, s); + } + + raft_update_our_match_index(raft, raft->log_end - 1); + raft_send_heartbeats(raft); + + /* Write the fact that we are leader to the log. This is not used by the + * algorithm (although it could be, for quick restart), but it is used for + * offline analysis to check for conformance with the properties that Raft + * guarantees. */ + struct raft_record r = { + .type = RAFT_REC_LEADER, + .term = raft->term, + .sid = raft->sid, + }; + ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); + + /* Initiate a no-op commit. Otherwise we might never find out what's in + * the log. See section 6.4 item 1: + * + * The Leader Completeness Property guarantees that a leader has all + * committed entries, but at the start of its term, it may not know + * which those are. To find out, it needs to commit an entry from its + * term. Raft handles this by having each leader commit a blank no-op + * entry into the log at the start of its term. 
As soon as this no-op + * entry is committed, the leader’s commit index will be at least as + * large as any other servers’ during its term. + */ + raft_command_unref(raft_command_execute__(raft, NULL, NULL, NULL, NULL)); +} + +/* Processes term 'term' received as part of RPC 'common'. Returns true if the + * caller should continue processing the RPC, false if the caller should reject + * it due to a stale term. */ +static bool +raft_receive_term__(struct raft *raft, const struct raft_rpc_common *common, + uint64_t term) +{ + /* Section 3.3 says: + * + * Current terms are exchanged whenever servers communicate; if one + * server’s current term is smaller than the other’s, then it updates + * its current term to the larger value. If a candidate or leader + * discovers that its term is out of date, it immediately reverts to + * follower state. If a server receives a request with a stale term + * number, it rejects the request. + */ + if (term > raft->term) { + if (!raft_set_term(raft, term, NULL)) { + return false; + } + raft_become_follower(raft); + } else if (term < raft->term) { + char buf[SID_LEN + 1]; + VLOG_INFO("rejecting term %"PRIu64" < current term %"PRIu64" received " + "in %s message from server %s", + term, raft->term, + raft_rpc_type_to_string(common->type), + raft_get_nickname(raft, &common->sid, buf, sizeof buf)); + return false; + } + return true; +} + +static void +raft_get_servers_from_log(struct raft *raft, enum vlog_level level) +{ + const struct json *servers_json = raft->snap.servers; + for (uint64_t index = raft->log_end - 1; index >= raft->log_start; + index--) { + struct raft_entry *e = &raft->entries[index - raft->log_start]; + if (e->servers) { + servers_json = e->servers; + break; + } + } + + struct hmap servers; + struct ovsdb_error *error = raft_servers_from_json(servers_json, &servers); + ovs_assert(!error); + raft_set_servers(raft, &servers, level); + raft_servers_destroy(&servers); +} + +/* Truncates the log, so that raft->log_end 
becomes 'new_end'. + * + * Doesn't write anything to disk. In theory, we could truncate the on-disk + * log file, but we don't have the right information to know how long it should + * be. What we actually do is to append entries for older indexes to the + * on-disk log; when we re-read it later, these entries truncate the log. + * + * Returns true if any of the removed log entries were server configuration + * entries, false otherwise. */ +static bool +raft_truncate(struct raft *raft, uint64_t new_end) +{ + ovs_assert(new_end >= raft->log_start); + if (raft->log_end > new_end) { + char buf[SID_LEN + 1]; + VLOG_INFO("%s truncating %"PRIu64 " entries from end of log", + raft_get_nickname(raft, &raft->sid, buf, sizeof buf), + raft->log_end - new_end); + } + + bool servers_changed = false; + while (raft->log_end > new_end) { + struct raft_entry *entry = &raft->entries[--raft->log_end + - raft->log_start]; + if (entry->servers) { + servers_changed = true; + } + raft_entry_uninit(entry); + } + return servers_changed; +} + +static const struct json * +raft_peek_next_entry(struct raft *raft, struct uuid *eid) +{ + /* Invariant: log_start - 2 <= last_applied <= commit_index < log_end. */ + ovs_assert(raft->log_start <= raft->last_applied + 2); + ovs_assert(raft->last_applied <= raft->commit_index); + ovs_assert(raft->commit_index < raft->log_end); + + if (raft->joining || raft->failed) { /* XXX needed? 
*/ + return NULL; + } + + if (raft->log_start == raft->last_applied + 2) { + *eid = raft->snap.eid; + return raft->snap.data; + } + + while (raft->last_applied < raft->commit_index) { + const struct raft_entry *e = raft_get_entry(raft, + raft->last_applied + 1); + if (e->data) { + *eid = e->eid; + return e->data; + } + raft->last_applied++; + } + return NULL; +} + +static const struct json * +raft_get_next_entry(struct raft *raft, struct uuid *eid) +{ + const struct json *data = raft_peek_next_entry(raft, eid); + if (data) { + raft->last_applied++; + } + return data; +} + +static void +raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) +{ + if (new_commit_index <= raft->commit_index) { + return; + } + + if (raft->role == RAFT_LEADER) { + while (raft->commit_index < new_commit_index) { + uint64_t index = ++raft->commit_index; + const struct raft_entry *e = raft_get_entry(raft, index); + if (e->servers) { + raft_run_reconfigure(raft); + } + if (e->data) { + struct raft_command *cmd + = raft_find_command_by_index(raft, index); + if (cmd) { + raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS); + } + } + } + } else { + raft->commit_index = new_commit_index; + } + + /* Write the commit index to the log. The next time we restart, this + * allows us to start exporting a reasonably fresh log, instead of a log + * that only contains the snapshot. */ + struct raft_record r = { + .type = RAFT_REC_COMMIT_INDEX, + .commit_index = raft->commit_index, + }; + ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); +} + +/* This doesn't use rq->entries (but it does use rq->n_entries). 
*/
+static void
+raft_send_append_reply(struct raft *raft, const struct raft_append_request *rq,
+                       enum raft_append_result result, const char *comment)
+{
+    /* Figure 3.1: "If leaderCommit > commitIndex, set commitIndex =
+     * min(leaderCommit, index of last new entry)" */
+    if (result == RAFT_APPEND_OK && rq->leader_commit > raft->commit_index) {
+        raft_update_commit_index(
+            raft, MIN(rq->leader_commit, rq->prev_log_index + rq->n_entries));
+    }
+
+    /* Send reply. It echoes 'prev_log_index', 'prev_log_term', and
+     * 'n_entries' from 'rq' and reports our own term and log end. */
+    union raft_rpc reply = {
+        .append_reply = {
+            .common = {
+                .type = RAFT_RPC_APPEND_REPLY,
+                .sid = rq->common.sid,
+                .comment = CONST_CAST(char *, comment),
+            },
+            .term = raft->term,
+            .log_end = raft->log_end,
+            .prev_log_index = rq->prev_log_index,
+            .prev_log_term = rq->prev_log_term,
+            .n_entries = rq->n_entries,
+            .result = result,
+        }
+    };
+    raft_send(raft, &reply);
+}
+
+/* If 'prev_log_index' exists in 'raft''s log with term 'prev_log_term',
+ * returns NULL. Otherwise, returns an explanation for the mismatch.
+ *
+ * Index 'log_start - 1' is not stored in 'raft->entries'; its term is
+ * compared against 'raft->snap.term' instead. */
+static const char *
+match_index_and_term(const struct raft *raft,
+                     uint64_t prev_log_index, uint64_t prev_log_term)
+{
+    if (prev_log_index < raft->log_start - 1) {
+        /* Earlier than anything whose term is still recorded. */
+        return "mismatch before start of log";
+    } else if (prev_log_index == raft->log_start - 1) {
+        if (prev_log_term != raft->snap.term) {
+            return "prev_term mismatch";
+        }
+    } else if (prev_log_index < raft->log_end) {
+        if (raft->entries[prev_log_index - raft->log_start].term
+            != prev_log_term) {
+            return "term mismatch";
+        }
+    } else {
+        /* prev_log_index >= raft->log_end */
+        return "mismatch past end of log";
+    }
+    return NULL;
+}
+
+/* Handles the 'n_entries' entries in 'entries', which follow the entry with
+ * index 'prev_log_index' and term 'prev_log_term'. If that previous entry is
+ * not in the local log, refuses the entries. Otherwise, an entry whose term
+ * conflicts with the local log truncates the log from that index, and entries
+ * not already in the log are appended to it.
+ *
+ * Returns nothing: the outcome is always reported to the sender of 'rq' in an
+ * append reply.
*/ +static void +raft_handle_append_entries(struct raft *raft, + const struct raft_append_request *rq, + uint64_t prev_log_index, uint64_t prev_log_term, + const struct raft_entry *entries, + unsigned int n_entries) +{ + /* Section 3.5: "When sending an AppendEntries RPC, the leader includes + * the index and term of the entry in its log that immediately precedes + * the new entries. If the follower does not find an entry in its log + * with the same index and term, then it refuses the new entries." */ + const char *mismatch = match_index_and_term(raft, prev_log_index, + prev_log_term); + if (mismatch) { + VLOG_INFO("rejecting append_request because previous entry " + "%"PRIu64",%"PRIu64" not in local log (%s)", + prev_log_term, prev_log_index, mismatch); + raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY, mismatch); + return; + } + + /* Figure 3.1: "If an existing entry conflicts with a new one (same + * index but different terms), delete the existing entry and all that + * follow it." */ + unsigned int i; + bool servers_changed = false; + for (i = 0; ; i++) { + if (i >= n_entries) { + /* No change. */ + if (rq->common.comment + && !strcmp(rq->common.comment, "heartbeat")) { + raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "heartbeat"); + } else { + raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "no change"); + } + return; + } + + uint64_t log_index = (prev_log_index + 1) + i; + if (log_index >= raft->log_end) { + break; + } + if (raft->entries[log_index - raft->log_start].term + != entries[i].term) { + if (raft_truncate(raft, log_index)) { + servers_changed = true; + } + break; + } + } + + /* Figure 3.1: "Append any entries not already in the log." 
*/ + struct ovsdb_error *error = NULL; + bool any_written = false; + for (; i < n_entries; i++) { + const struct raft_entry *e = &entries[i]; + error = raft_write_entry(raft, e->term, + json_nullable_clone(e->data), &e->eid, + json_nullable_clone(e->servers)); + if (error) { + break; + } + any_written = true; + if (e->servers) { + servers_changed = true; + } + } + + if (any_written) { + raft_waiter_create(raft, RAFT_W_ENTRY, true)->entry.index + = raft->log_end - 1; + } + if (servers_changed) { + raft_get_servers_from_log(raft, VLL_INFO); + } + + if (error) { + char *s = ovsdb_error_to_string_free(error); + VLOG_ERR("%s", s); + free(s); + raft_send_append_reply(raft, rq, RAFT_APPEND_IO_ERROR, "I/O error"); + return; + } + + raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "log updated"); +} + +static bool +raft_update_leader(struct raft *raft, const struct uuid *sid) +{ + if (raft->role == RAFT_LEADER && !uuid_equals(sid, &raft->sid)) { + char buf[SID_LEN + 1]; + VLOG_ERR("this server is leader but server %s claims to be", + raft_get_nickname(raft, sid, buf, sizeof buf)); + return false; + } else if (!uuid_equals(sid, &raft->leader_sid)) { + if (!uuid_is_zero(&raft->leader_sid)) { + char buf1[SID_LEN + 1]; + char buf2[SID_LEN + 1]; + VLOG_ERR("leader for term %"PRIu64" changed from %s to %s", + raft->term, + raft_get_nickname(raft, &raft->leader_sid, + buf1, sizeof buf1), + raft_get_nickname(raft, sid, buf2, sizeof buf2)); + } else { + char buf[SID_LEN + 1]; + VLOG_INFO("server %s is leader for term %"PRIu64, + raft_get_nickname(raft, sid, buf, sizeof buf), + raft->term); + } + raft->leader_sid = *sid; + + /* Record the leader to the log. This is not used by the algorithm + * (although it could be, for quick restart), but it is used for + * offline analysis to check for conformance with the properties + * that Raft guarantees. 
*/ + struct raft_record r = { + .type = RAFT_REC_LEADER, + .term = raft->term, + .sid = *sid, + }; + ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); + } + return true; +} + +static void +raft_handle_append_request__(struct raft *raft, + const struct raft_append_request *rq) +{ + /* We do not check whether the server that sent the request is part of the + * cluster. As section 4.1 says, "A server accepts AppendEntries requests + * from a leader that is not part of the server’s latest configuration. + * Otherwise, a new server could never be added to the cluster (it would + * never accept any log entries preceding the configuration entry that adds + * the server)." */ + if (!raft_update_leader(raft, &rq->common.sid)) { + raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY, + "usurped leadership"); + return; + } + raft_reset_timer(raft); + + /* First check for the common case, where the AppendEntries request is + * entirely for indexes covered by 'log_start' ... 'log_end - 1', something + * like this: + * + * rq->prev_log_index + * | first_entry_index + * | | nth_entry_index + * | | | + * v v v + * +---+---+---+---+ + * T | T | T | T | T | + * +---+-------+---+ + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + * */ + uint64_t first_entry_index = rq->prev_log_index + 1; + uint64_t nth_entry_index = rq->prev_log_index + rq->n_entries; + if (OVS_LIKELY(first_entry_index >= raft->log_start)) { + raft_handle_append_entries(raft, rq, + rq->prev_log_index, rq->prev_log_term, + rq->entries, rq->n_entries); + return; + } + + /* Now a series of checks for odd cases, where the AppendEntries request + * extends earlier than the beginning of our log, into the log entries + * discarded by the most recent snapshot. */ + + /* + * Handle the case where the indexes covered by rq->entries[] are entirely + * disjoint with 'log_start - 1' ... 'log_end - 1', as shown below. 
So, + * everything in the AppendEntries request must already have been + * committed, and we might as well return true. + * + * rq->prev_log_index + * | first_entry_index + * | | nth_entry_index + * | | | + * v v v + * +---+---+---+---+ + * T | T | T | T | T | + * +---+-------+---+ + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + */ + if (nth_entry_index < raft->log_start - 1) { + raft_send_append_reply(raft, rq, RAFT_APPEND_OK, + "append before log start"); + return; + } + + /* + * Handle the case where the last entry in rq->entries[] has the same index + * as 'log_start - 1', so we can compare their terms: + * + * rq->prev_log_index + * | first_entry_index + * | | nth_entry_index + * | | | + * v v v + * +---+---+---+---+ + * T | T | T | T | T | + * +---+-------+---+ + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + * + * There's actually a sub-case where rq->n_entries == 0, in which we + * compare rq->prev_term: + * + * rq->prev_log_index + * | + * | + * | + * v + * T + * + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + */ + if (nth_entry_index == raft->log_start - 1) { + if (rq->n_entries + ? 
raft->snap.term == rq->entries[rq->n_entries - 1].term + : raft->snap.term == rq->prev_log_term) { + raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "no change"); + } else { + raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY, + "term mismatch"); + } + return; + } + + /* + * We now know that the data in rq->entries[] overlaps the data in + * raft->entries[], as shown below, with some positive 'ofs': + * + * rq->prev_log_index + * | first_entry_index + * | | nth_entry_index + * | | | + * v v v + * +---+---+---+---+---+ + * T | T | T | T | T | T | + * +---+-------+---+---+ + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + * + * |<-- ofs -->| + * + * We transform this into the following by trimming the first 'ofs' + * elements off of rq->entries[], ending up with the following. Notice how + * we retain the term but not the data for rq->entries[ofs - 1]: + * + * first_entry_index + ofs - 1 + * | first_entry_index + ofs + * | | nth_entry_index + ofs + * | | | + * v v v + * +---+---+ + * T | T | T | + * +---+---+ + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + */ + uint64_t ofs = raft->log_start - first_entry_index; + raft_handle_append_entries(raft, rq, + raft->log_start - 1, rq->entries[ofs - 1].term, + &rq->entries[ofs], rq->n_entries - ofs); +} + +/* Returns true if 'raft' has another log entry or snapshot to read. */ +bool +raft_has_next_entry(const struct raft *raft_) +{ + struct raft *raft = CONST_CAST(struct raft *, raft_); + struct uuid eid; + return raft_peek_next_entry(raft, &eid) != NULL; +} + +/* Returns the next log entry or snapshot from 'raft', or NULL if there are + * none left to read.. Stores the entry ID of the log entry in '*eid'. 
Stores
+ * true in '*is_snapshot' if the returned data is a snapshot, false if it is a
+ * log entry. */
+const struct json *
+raft_next_entry(struct raft *raft, struct uuid *eid, bool *is_snapshot)
+{
+    const struct json *data = raft_get_next_entry(raft, eid);
+    /* NOTE(review): when NULL is returned, '*is_snapshot' is just the result
+     * of comparing NULL with 'raft->snap.data'; callers should not rely on
+     * it in that case. */
+    *is_snapshot = data == raft->snap.data;
+    return data;
+}
+
+/* Returns the log index of the last-read snapshot or log entry. */
+uint64_t
+raft_get_applied_index(const struct raft *raft)
+{
+    return raft->last_applied;
+}
+
+/* Returns the log index of the last snapshot or log entry that is available to
+ * be read. */
+uint64_t
+raft_get_commit_index(const struct raft *raft)
+{
+    return raft->commit_index;
+}
+
+/* Handler for append requests; forwards 'rq' unchanged to
+ * raft_handle_append_request__(). */
+static void
+raft_handle_append_request(struct raft *raft,
+                           const struct raft_append_request *rq)
+{
+    raft_handle_append_request__(raft, rq);
+}
+
+/* Returns the server that raft_find_server() finds for 'uuid', or NULL if
+ * there is none or if that server is 'raft' itself. */
+static struct raft_server *
+raft_find_peer(struct raft *raft, const struct uuid *uuid)
+{
+    struct raft_server *s = raft_find_server(raft, uuid);
+    return s && !uuid_equals(&raft->sid, &s->sid) ? s : NULL;
+}
+
+/* Looks up 'uuid' in 'raft->add_servers' with raft_server_find() and returns
+ * the result. */
+static struct raft_server *
+raft_find_new_server(struct raft *raft, const struct uuid *uuid)
+{
+    return raft_server_find(&raft->add_servers, uuid);
+}
+
+/* Figure 3.1: "If there exists an N such that N > commitIndex, a
+ * majority of matchIndex[i] >= N, and log[N].term == currentTerm, set
+ * commitIndex = N (sections 3.5 and 3.6)." */
+static void
+raft_consider_updating_commit_index(struct raft *raft)
+{
+    /* This loop cannot just bail out when it comes across a log entry that
+     * does not match the criteria. For example, Figure 3.7(d2) shows a
+     * case where the log entry for term 2 cannot be committed directly
+     * (because it is not for the current term) but it can be committed as
+     * a side effect of committing the entry for term 4 (the current term).
+     * XXX Is there a more efficient way to do this?
*/ + ovs_assert(raft->role == RAFT_LEADER); + + uint64_t new_commit_index = raft->commit_index; + for (uint64_t idx = MAX(raft->commit_index + 1, raft->log_start); + idx < raft->log_end; idx++) { + if (raft->entries[idx - raft->log_start].term == raft->term) { + size_t count = 0; + struct raft_server *s2; + HMAP_FOR_EACH (s2, hmap_node, &raft->servers) { + if (s2->match_index >= idx) { + count++; + } + } + if (count > hmap_count(&raft->servers) / 2) { + VLOG_DBG("index %"PRIu64" committed to %"PRIuSIZE" servers, " + "applying", idx, count); + new_commit_index = idx; + } + } + } + raft_update_commit_index(raft, new_commit_index); +} + +static void +raft_update_match_index(struct raft *raft, struct raft_server *s, + uint64_t min_index) +{ + ovs_assert(raft->role == RAFT_LEADER); + if (min_index > s->match_index) { + s->match_index = min_index; + raft_consider_updating_commit_index(raft); + } +} + +static void +raft_update_our_match_index(struct raft *raft, uint64_t min_index) +{ + raft_update_match_index(raft, raft_find_server(raft, &raft->sid), + min_index); +} + +static void +raft_send_install_snapshot_request(struct raft *raft, + const struct raft_server *s, + const char *comment) +{ + union raft_rpc rpc = { + .install_snapshot_request = { + .common = { + .type = RAFT_RPC_INSTALL_SNAPSHOT_REQUEST, + .sid = s->sid, + .comment = CONST_CAST(char *, comment), + }, + .term = raft->term, + .last_index = raft->log_start - 1, + .last_term = raft->snap.term, + .last_servers = raft->snap.servers, + .last_eid = raft->snap.eid, + .data = raft->snap.data, + } + }; + raft_send(raft, &rpc); +} + +static void +raft_handle_append_reply(struct raft *raft, + const struct raft_append_reply *rpy) +{ + if (raft->role != RAFT_LEADER) { + VLOG_INFO("rejected append_reply (not leader)"); + return; + } + + /* Most commonly we'd be getting an AppendEntries reply from a configured + * server (e.g. a peer), but we can also get them from servers in the + * process of being added. 
*/ + struct raft_server *s = raft_find_peer(raft, &rpy->common.sid); + if (!s) { + s = raft_find_new_server(raft, &rpy->common.sid); + if (!s) { + VLOG_INFO("rejected append_reply from unknown server "SID_FMT, + SID_ARGS(&rpy->common.sid)); + return; + } + } + + if (rpy->result == RAFT_APPEND_OK) { + /* Figure 3.1: "If successful, update nextIndex and matchIndex for + * follower (section 3.5)." */ + uint64_t min_index = rpy->prev_log_index + rpy->n_entries + 1; + if (s->next_index < min_index) { + s->next_index = min_index; + } + raft_update_match_index(raft, s, min_index - 1); + } else { + /* Figure 3.1: "If AppendEntries fails because of log inconsistency, + * decrement nextIndex and retry (section 3.5)." + * + * We also implement the optimization suggested in section 4.2.1: + * "Various approaches can make nextIndex converge to its correct value + * more quickly, including those described in Chapter 3. The simplest + * approach to solving this particular problem of adding a new server, + * however, is to have followers return the length of their logs in the + * AppendEntries response; this allows the leader to cap the follower’s + * nextIndex accordingly." */ + s->next_index = (s->next_index > 0 + ? MIN(s->next_index - 1, rpy->log_end) + : 0); + + if (rpy->result == RAFT_APPEND_IO_ERROR) { + /* Append failed but not because of a log inconsistency. Because + * of the I/O error, there's no point in re-sending the append + * immediately. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_INFO_RL(&rl, "%s reported I/O error", s->nickname); + return; + } + } + + /* + * Our behavior here must depend on the value of next_index relative to + * log_start and log_end. 
There are three cases: + * + * Case 1 | Case 2 | Case 3 + * <---------------->|<------------->|<------------------> + * | | + * + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + */ + if (s->next_index < raft->log_start) { + /* Case 1. */ + raft_send_install_snapshot_request(raft, s, NULL); + } else if (s->next_index < raft->log_end) { + /* Case 2. */ + raft_send_append_request(raft, s, 1, NULL); + } else { + /* Case 3. */ + if (s->phase == RAFT_PHASE_CATCHUP) { + s->phase = RAFT_PHASE_CAUGHT_UP; + raft_run_reconfigure(raft); + } + } +} + +static bool +raft_should_suppress_disruptive_server(struct raft *raft, + const union raft_rpc *rpc) +{ + if (rpc->type != RAFT_RPC_VOTE_REQUEST) { + return false; + } + + /* Section 4.2.3 "Disruptive Servers" says: + * + * ...if a server receives a RequestVote request within the minimum + * election timeout of hearing from a current leader, it does not update + * its term or grant its vote... + * + * ...This change conflicts with the leadership transfer mechanism as + * described in Chapter 3, in which a server legitimately starts an + * election without waiting an election timeout. In that case, + * RequestVote messages should be processed by other servers even when + * they believe a current cluster leader exists. Those RequestVote + * requests can include a special flag to indicate this behavior (“I + * have permission to disrupt the leader--it told me to!”). + * + * This clearly describes how the followers should act, but not the leader. + * We just ignore vote requests that arrive at a current leader. This + * seems to be fairly safe, since a majority other than the current leader + * can still elect a new leader and the first AppendEntries from that new + * leader will depose the current leader. 
*/ + const struct raft_vote_request *rq = raft_vote_request_cast(rpc); + if (rq->leadership_transfer) { + return false; + } + + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + long long int now = time_msec(); + switch (raft->role) { + case RAFT_LEADER: + VLOG_WARN_RL(&rl, "ignoring vote request received as leader"); + return true; + + case RAFT_FOLLOWER: + if (now < raft->election_base + ELECTION_BASE_MSEC) { + VLOG_WARN_RL(&rl, "ignoring vote request received after only " + "%lld ms (minimum election time is %d ms)", + now - raft->election_base, ELECTION_BASE_MSEC); + return true; + } + return false; + + case RAFT_CANDIDATE: + return false; + + default: + OVS_NOT_REACHED(); + } +} + +/* Returns true if a reply should be sent. */ +static bool +raft_handle_vote_request__(struct raft *raft, + const struct raft_vote_request *rq) +{ + /* Figure 3.1: "If votedFor is null or candidateId, and candidate's vote is + * at least as up-to-date as receiver's log, grant vote (sections 3.4, + * 3.6)." */ + if (uuid_equals(&raft->vote, &rq->common.sid)) { + /* Already voted for this candidate in this term. Resend vote. */ + return true; + } else if (!uuid_is_zero(&raft->vote)) { + /* Already voted for different candidate in this term. Send a reply + * saying what candidate we did vote for. This isn't a necessary part + * of the Raft protocol but it can make debugging easier. */ + return true; + } + + /* Section 3.6.1: "The RequestVote RPC implements this restriction: the RPC + * includes information about the candidate’s log, and the voter denies its + * vote if its own log is more up-to-date than that of the candidate. Raft + * determines which of two logs is more up-to-date by comparing the index + * and term of the last entries in the logs. If the logs have last entries + * with different terms, then the log with the later term is more + * up-to-date. If the logs end with the same term, then whichever log is + * longer is more up-to-date." 
*/ + uint64_t last_term = (raft->log_end > raft->log_start + ? raft->entries[raft->log_end - 1 + - raft->log_start].term + : raft->snap.term); + if (last_term > rq->last_log_term + || (last_term == rq->last_log_term + && raft->log_end - 1 > rq->last_log_index)) { + /* Our log is more up-to-date than the peer's. Withhold vote. */ + return false; + } + + /* Record a vote for the peer. */ + if (!raft_set_term(raft, raft->term, &rq->common.sid)) { + return false; + } + + raft_reset_timer(raft); + + return true; +} + +static void +raft_send_vote_reply(struct raft *raft, const struct uuid *dst, + const struct uuid *vote) +{ + union raft_rpc rpy = { + .vote_reply = { + .common = { + .type = RAFT_RPC_VOTE_REPLY, + .sid = *dst, + }, + .term = raft->term, + .vote = *vote, + }, + }; + raft_send(raft, &rpy); +} + +static void +raft_handle_vote_request(struct raft *raft, + const struct raft_vote_request *rq) +{ + if (raft_handle_vote_request__(raft, rq)) { + raft_send_vote_reply(raft, &rq->common.sid, &raft->vote); + } +} + +static void +raft_handle_vote_reply(struct raft *raft, + const struct raft_vote_reply *rpy) +{ + if (!raft_receive_term__(raft, &rpy->common, rpy->term)) { + return; + } + + if (raft->role != RAFT_CANDIDATE) { + return; + } + + struct raft_server *s = raft_find_peer(raft, &rpy->common.sid); + if (s) { + raft_accept_vote(raft, s, &rpy->vote); + } +} + +/* Returns true if 'raft''s log contains reconfiguration entries that have not + * yet been committed. */ +static bool +raft_has_uncommitted_configuration(const struct raft *raft) +{ + for (uint64_t i = raft->commit_index + 1; i < raft->log_end; i++) { + ovs_assert(i >= raft->log_start); + const struct raft_entry *e = &raft->entries[i - raft->log_start]; + if (e->servers) { + return true; + } + } + return false; +} + +static void +raft_log_reconfiguration(struct raft *raft) +{ + /* Add the reconfiguration to the log. 
+ * + * We ignore any */ + struct json *servers_json = raft_servers_to_json(&raft->servers); + raft_command_unref(raft_command_execute__( + raft, NULL, servers_json, NULL, NULL)); + json_destroy(servers_json); +} + +static void +raft_run_reconfigure(struct raft *raft) +{ + ovs_assert(raft->role == RAFT_LEADER); + + /* Reconfiguration only progresses when configuration changes commit. */ + if (raft_has_uncommitted_configuration(raft)) { + return; + } + + /* If we were waiting for a configuration change to commit, it's done. */ + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (s->phase == RAFT_PHASE_COMMITTING) { + raft_send_add_server_reply__(raft, &s->sid, s->address, + true, RAFT_SERVER_COMPLETED); + s->phase = RAFT_PHASE_STABLE; + } + } + if (raft->remove_server) { + raft_send_remove_server_reply__(raft, &raft->remove_server->sid, + &raft->remove_server->requester_sid, + raft->remove_server->requester_conn, + true, RAFT_SERVER_COMPLETED); + raft_server_destroy(raft->remove_server); + raft->remove_server = NULL; + } + + /* If a new server is caught up, add it to the configuration. */ + HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) { + if (s->phase == RAFT_PHASE_CAUGHT_UP) { + /* Move 's' from 'raft->add_servers' to 'raft->servers'. */ + hmap_remove(&raft->add_servers, &s->hmap_node); + hmap_insert(&raft->servers, &s->hmap_node, uuid_hash(&s->sid)); + + /* Mark 's' as waiting for commit. */ + s->phase = RAFT_PHASE_COMMITTING; + + raft_log_reconfiguration(raft); + + /* When commit completes we'll transition to RAFT_PHASE_STABLE and + * send a RAFT_SERVER_OK reply. */ + + return; + } + } + + /* Remove a server, if one is scheduled for removal. 
*/ + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (s->phase == RAFT_PHASE_REMOVE) { + hmap_remove(&raft->servers, &s->hmap_node); + raft->remove_server = s; + + raft_log_reconfiguration(raft); + + return; + } + } +} + +static void +raft_handle_add_server_request(struct raft *raft, + const struct raft_add_server_request *rq) +{ + /* Figure 4.1: "1. Reply NOT_LEADER if not leader (section 6.2)." */ + if (raft->role != RAFT_LEADER) { + raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_NOT_LEADER); + return; + } + + /* Check for an existing server. */ + struct raft_server *s = raft_find_server(raft, &rq->common.sid); + if (s) { + /* If the server is scheduled to be removed, cancel it. */ + if (s->phase == RAFT_PHASE_REMOVE) { + s->phase = RAFT_PHASE_STABLE; + raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_CANCELED); + return; + } + + /* If the server is being added, then it's in progress. */ + if (s->phase != RAFT_PHASE_STABLE) { + raft_send_add_server_reply(raft, rq, + false, RAFT_SERVER_IN_PROGRESS); + } + + /* Nothing to do--server is already part of the configuration. */ + raft_send_add_server_reply(raft, rq, + true, RAFT_SERVER_ALREADY_PRESENT); + return; + } + + /* Check for a server being removed. */ + if (raft->remove_server + && uuid_equals(&rq->common.sid, &raft->remove_server->sid)) { + raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_COMMITTING); + return; + } + + /* Check for a server already being added. */ + if (raft_find_new_server(raft, &rq->common.sid)) { + raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_IN_PROGRESS); + return; + } + + /* Add server to 'add_servers'. */ + s = raft_server_add(&raft->add_servers, &rq->common.sid, rq->address); + raft_server_init_leader(raft, s); + s->requester_sid = rq->common.sid; + s->requester_conn = NULL; + s->phase = RAFT_PHASE_CATCHUP; + + /* Start sending the log. 
If this is the first time we've tried to add + * this server, then this will quickly degenerate into an InstallSnapshot + * followed by a series of AddEntries, but if it's a retry of an earlier + * AddRequest that was interrupted (e.g. by a timeout or a loss of + * leadership) then it will gracefully resume populating the log. + * + * See the last few paragraphs of section 4.2.1 for further insight. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + VLOG_INFO_RL(&rl, + "starting to add server %s ("SID_FMT" at %s) " + "to cluster "CID_FMT, s->nickname, SID_ARGS(&s->sid), + rq->address, CID_ARGS(&raft->cid)); + raft_send_append_request(raft, s, 0, "initialize new server"); +} + +static void +raft_handle_add_server_reply(struct raft *raft, + const struct raft_add_server_reply *rpy) +{ + if (!raft->joining) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_WARN_RL(&rl, "received add_server_reply even though we're " + "already part of the cluster"); + return; + } + + if (rpy->success) { + raft->joining = false; + + /* It is tempting, at this point, to check that this server is part of + * the current configuration. However, this is not necessarily the + * case, because the log entry that added this server to the cluster + * might have been committed by a majority of the cluster that does not + * include this one. This actually happens in testing. */ + } else { + const char *address; + SSET_FOR_EACH (address, &rpy->remote_addresses) { + if (sset_add(&raft->remote_addresses, address)) { + VLOG_INFO("%s: learned new server address for joining cluster", + address); + } + } + } +} + +/* This is called by raft_unixctl_kick() as well as via RPC. */ +static void +raft_handle_remove_server_request(struct raft *raft, + const struct raft_remove_server_request *rq) +{ + /* Figure 4.1: "1. Reply NOT_LEADER if not leader (section 6.2)." 
*/ + if (raft->role != RAFT_LEADER) { + raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_NOT_LEADER); + return; + } + + /* If the server to remove is currently waiting to be added, cancel it. */ + struct raft_server *target = raft_find_new_server(raft, &rq->sid); + if (target) { + raft_send_add_server_reply__(raft, &target->sid, target->address, + false, RAFT_SERVER_CANCELED); + hmap_remove(&raft->add_servers, &target->hmap_node); + raft_server_destroy(target); + return; + } + + /* If the server isn't configured, report that. */ + target = raft_find_server(raft, &rq->sid); + if (!target) { + raft_send_remove_server_reply(raft, rq, + true, RAFT_SERVER_ALREADY_GONE); + return; + } + + /* Check whether we're waiting for the addition of the server to commit. */ + if (target->phase == RAFT_PHASE_COMMITTING) { + raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_COMMITTING); + return; + } + + /* Check whether the server is already scheduled for removal. */ + if (target->phase == RAFT_PHASE_REMOVE) { + raft_send_remove_server_reply(raft, rq, + false, RAFT_SERVER_IN_PROGRESS); + return; + } + + /* Make sure that if we remove this server then that at least one other + * server will be left. We don't count servers currently being added (in + * 'add_servers') since those could fail. */ + struct raft_server *s; + int n = 0; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + if (s != target && s->phase != RAFT_PHASE_REMOVE) { + n++; + } + } + if (!n) { + raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_EMPTY); + return; + } + + /* Mark the server for removal. */ + target->phase = RAFT_PHASE_REMOVE; + if (rq->requester_conn) { + target->requester_sid = UUID_ZERO; + unixctl_command_reply(rq->requester_conn, "started removal"); + } else { + target->requester_sid = rq->common.sid; + target->requester_conn = NULL; + } + + raft_run_reconfigure(raft); + /* Operation in progress, reply will be sent later. 
*/ +} + +static void +raft_handle_remove_server_reply(struct raft *raft, + const struct raft_remove_server_reply *rpc) +{ + if (rpc->success) { + VLOG_INFO("%04x: finished leaving cluster %04x", + uuid_prefix(&raft->sid, 4), uuid_prefix(&raft->cid, 4)); + + raft_record_note(raft, "left", "this server left the cluster"); + + raft->leaving = false; + raft->left = true; + } +} + +static bool +raft_handle_write_error(struct raft *raft, struct ovsdb_error *error) +{ + if (error && !raft->failed) { + raft->failed = true; + + char *s = ovsdb_error_to_string_free(error); + VLOG_WARN("%s: entering failure mode due to I/O error (%s)", + raft->name, s); + free(s); + } + return !raft->failed; +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_write_snapshot(struct raft *raft, struct ovsdb_log *log, + uint64_t new_log_start, + const struct raft_entry *new_snapshot) +{ + struct raft_header h = { + .sid = raft->sid, + .cid = raft->cid, + .name = raft->name, + .local_address = raft->local_address, + .snap_index = new_log_start - 1, + .snap = *new_snapshot, + }; + struct ovsdb_error *error = ovsdb_log_write_and_free( + log, raft_header_to_json(&h)); + if (error) { + return error; + } + ovsdb_log_mark_base(raft->log); + + /* Write log records. */ + for (uint64_t index = new_log_start; index < raft->log_end; index++) { + const struct raft_entry *e = &raft->entries[index - raft->log_start]; + struct raft_record r = { + .type = RAFT_REC_ENTRY, + .term = e->term, + .entry = { + .index = index, + .data = e->data, + .servers = e->servers, + .eid = e->eid, + }, + }; + error = ovsdb_log_write_and_free(log, raft_record_to_json(&r)); + if (error) { + return error; + } + } + + /* Write term and vote (if any). + * + * The term is redundant if we wrote a log record for that term above. The + * vote, if any, is never redundant. 
+ */ + error = raft_write_state(log, raft->term, &raft->vote); + if (error) { + return error; + } + + /* Write commit_index if it's beyond the new start of the log. */ + if (raft->commit_index >= new_log_start) { + struct raft_record r = { + .type = RAFT_REC_COMMIT_INDEX, + .commit_index = raft->commit_index, + }; + return ovsdb_log_write_and_free(log, raft_record_to_json(&r)); + } + return NULL; +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_save_snapshot(struct raft *raft, + uint64_t new_start, const struct raft_entry *new_snapshot) + +{ + struct ovsdb_log *new_log; + struct ovsdb_error *error; + error = ovsdb_log_replace_start(raft->log, &new_log); + if (error) { + return error; + } + + error = raft_write_snapshot(raft, new_log, new_start, new_snapshot); + if (error) { + ovsdb_log_replace_abort(new_log); + return error; + } + + return ovsdb_log_replace_commit(raft->log, new_log); +} + +static bool +raft_handle_install_snapshot_request__( + struct raft *raft, const struct raft_install_snapshot_request *rq) +{ + raft_reset_timer(raft); + + /* + * Our behavior here depend on new_log_start in the snapshot compared to + * log_start and log_end. There are three cases: + * + * Case 1 | Case 2 | Case 3 + * <---------------->|<------------->|<------------------> + * | | + * + * +---+---+---+---+ + * T | T | T | T | T | + * +---+---+---+---+ + * ^ ^ + * | | + * log_start log_end + */ + uint64_t new_log_start = rq->last_index + 1; + if (new_log_start < raft->log_start) { + /* Case 1: The new snapshot covers less than our current one. Nothing + * to do. */ + return true; + } else if (new_log_start < raft->log_end) { + /* Case 2: The new snapshot starts in the middle of our log. We could + * discard the first 'new_log_start - raft->log_start' entries in the + * log. But there's not much value in that, since snapshotting is + * supposed to be a local decision. Just skip it. 
*/ + return true; + } + + /* Case 3: The new snapshot starts past the end of our current log, so + * discard all of our current log. */ + const struct raft_entry new_snapshot = { + .term = rq->last_term, + .data = rq->data, + .eid = rq->last_eid, + .servers = rq->last_servers, + }; + struct ovsdb_error *error = raft_save_snapshot(raft, new_log_start, + &new_snapshot); + if (error) { + char *error_s = ovsdb_error_to_string(error); + VLOG_WARN("could not save snapshot: %s", error_s); + free(error_s); + return false; + } + + for (size_t i = 0; i < raft->log_end - raft->log_start; i++) { + raft_entry_uninit(&raft->entries[i]); + } + raft->log_start = raft->log_end = new_log_start; + raft->log_synced = raft->log_end - 1; + raft->commit_index = raft->log_start - 1; + if (raft->last_applied < raft->commit_index) { + raft->last_applied = raft->log_start - 2; + } + + raft_entry_uninit(&raft->snap); + raft_entry_clone(&raft->snap, &new_snapshot); + + raft_get_servers_from_log(raft, VLL_INFO); + + return true; +} + +static void +raft_handle_install_snapshot_request( + struct raft *raft, const struct raft_install_snapshot_request *rq) +{ + if (raft_handle_install_snapshot_request__(raft, rq)) { + union raft_rpc rpy = { + .install_snapshot_reply = { + .common = { + .type = RAFT_RPC_INSTALL_SNAPSHOT_REPLY, + .sid = rq->common.sid, + }, + .term = raft->term, + .last_index = rq->last_index, + .last_term = rq->last_term, + }, + }; + raft_send(raft, &rpy); + } +} + +static void +raft_handle_install_snapshot_reply( + struct raft *raft, const struct raft_install_snapshot_reply *rpy) +{ + /* We might get an InstallSnapshot reply from a configured server (e.g. a + * peer) or a server in the process of being added. 
*/ + struct raft_server *s = raft_find_peer(raft, &rpy->common.sid); + if (!s) { + s = raft_find_new_server(raft, &rpy->common.sid); + if (!s) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_INFO_RL(&rl, "cluster "CID_FMT": received %s from " + "unknown server "SID_FMT, CID_ARGS(&raft->cid), + raft_rpc_type_to_string(rpy->common.type), + SID_ARGS(&rpy->common.sid)); + return; + } + } + + if (rpy->last_index != raft->log_start - 1 || + rpy->last_term != raft->snap.term) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_INFO_RL(&rl, "cluster "CID_FMT": server %s installed " + "out-of-date snapshot, starting over", + CID_ARGS(&raft->cid), s->nickname); + raft_send_install_snapshot_request(raft, s, + "installed obsolete snapshot"); + return; + } + + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + VLOG_INFO_RL(&rl, "cluster "CID_FMT": installed snapshot on server %s " + " up to %"PRIu64":%"PRIu64, CID_ARGS(&raft->cid), + s->nickname, rpy->last_term, rpy->last_index); + s->next_index = raft->log_end; + raft_send_append_request(raft, s, 0, "snapshot installed"); +} + +/* Returns true if 'raft' has grown enough that reducing the log to a snapshot + * would be valuable, false otherwise. When this function returns true, the + * client should consider using raft_store_snapshot() to reduce the log storage + * requirements. */ +bool +raft_grew_lots(const struct raft *raft) +{ + return (!raft->joining + && !raft->leaving + && !raft->left + && !raft->failed + && raft->last_applied - raft->log_start >= 100 + && ovsdb_log_grew_lots(raft->log)); +} + +/* Replaces the log for 'raft', up to the last log entry read, by + * 'new_snapshot_data'. Returns NULL if successful, otherwise an error that + * the caller must eventually free. 
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +raft_store_snapshot(struct raft *raft, const struct json *new_snapshot_data) +{ + if (raft->joining) { + return ovsdb_error(NULL, + "cannot store a snapshot while joining cluster"); + } else if (raft->leaving) { + return ovsdb_error(NULL, + "cannot store a snapshot while leaving cluster"); + } else if (raft->left) { + return ovsdb_error(NULL, + "cannot store a snapshot after leaving cluster"); + } else if (raft->failed) { + return ovsdb_error(NULL, + "cannot store a snapshot following failure"); + } + + if (raft->last_applied < raft->log_start) { + return ovsdb_error(NULL, "not storing a duplicate snapshot"); + } + + uint64_t new_log_start = raft->last_applied + 1; + const struct raft_entry new_snapshot = { + .term = raft_get_term(raft, new_log_start - 1), + .data = CONST_CAST(struct json *, new_snapshot_data), + .eid = *raft_get_eid(raft, new_log_start - 1), + .servers = raft_servers_for_index(raft, new_log_start - 1), + }; + struct ovsdb_error *error = raft_save_snapshot(raft, new_log_start, + &new_snapshot); + if (error) { + return error; + } + + raft->log_synced = raft->log_end - 1; + raft_entry_uninit(&raft->snap); + raft_entry_clone(&raft->snap, &new_snapshot); + for (size_t i = 0; i < new_log_start - raft->log_start; i++) { + raft_entry_uninit(&raft->entries[i]); + } + memmove(&raft->entries[0], &raft->entries[new_log_start - raft->log_start], + (raft->log_end - new_log_start) * sizeof *raft->entries); + raft->log_start = new_log_start; + return NULL; +} + +static void +raft_handle_become_leader(struct raft *raft, + const struct raft_become_leader *rq) +{ + if (raft->role == RAFT_FOLLOWER) { + char buf[SID_LEN + 1]; + VLOG_INFO("received leadership transfer from %s in term %"PRIu64, + raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf), + rq->term); + raft_start_election(raft, true); + } +} + +static void +raft_send_execute_command_reply(struct raft *raft, + const struct uuid *sid, + const struct uuid 
*eid, + enum raft_command_status status, + uint64_t commit_index) +{ + union raft_rpc rpc = { + .execute_command_reply = { + .common = { + .type = RAFT_RPC_EXECUTE_COMMAND_REPLY, + .sid = *sid, + }, + .result = *eid, + .status = status, + .commit_index = commit_index, + }, + }; + raft_send(raft, &rpc); +} + +static enum raft_command_status +raft_handle_execute_command_request__( + struct raft *raft, const struct raft_execute_command_request *rq) +{ + if (raft->role != RAFT_LEADER) { + return RAFT_CMD_NOT_LEADER; + } + + const struct uuid *current_eid = raft_current_eid(raft); + if (!uuid_equals(&rq->prereq, current_eid)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_INFO_RL(&rl, "current entry eid "UUID_FMT" does not match " + "prerequisite "UUID_FMT" in execute_command_request", + UUID_ARGS(current_eid), UUID_ARGS(&rq->prereq)); + return RAFT_CMD_BAD_PREREQ; + } + + struct raft_command *cmd = raft_command_initiate(raft, rq->data, + NULL, &rq->result); + cmd->sid = rq->common.sid; + + enum raft_command_status status = cmd->status; + if (status != RAFT_CMD_INCOMPLETE) { + raft_command_unref(cmd); + } + return status; +} + +static void +raft_handle_execute_command_request( + struct raft *raft, const struct raft_execute_command_request *rq) +{ + enum raft_command_status status + = raft_handle_execute_command_request__(raft, rq); + if (status != RAFT_CMD_INCOMPLETE) { + raft_send_execute_command_reply(raft, &rq->common.sid, &rq->result, + status, 0); + } +} + +static void +raft_handle_execute_command_reply( + struct raft *raft, const struct raft_execute_command_reply *rpy) +{ + struct raft_command *cmd = raft_find_command_by_eid(raft, &rpy->result); + if (!cmd) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + char buf[SID_LEN + 1]; + VLOG_INFO_RL(&rl, + "%s received \"%s\" reply from %s for unknown command", + raft->local_nickname, + raft_command_status_to_string(rpy->status), + raft_get_nickname(raft, &rpy->common.sid, + 
buf, sizeof buf)); + return; + } + + if (rpy->status == RAFT_CMD_INCOMPLETE) { + cmd->timestamp = time_msec(); + } else { + cmd->index = rpy->commit_index; + raft_command_complete(raft, cmd, rpy->status); + } +} + +static void +raft_handle_rpc(struct raft *raft, const union raft_rpc *rpc) +{ + uint64_t term = raft_rpc_get_term(rpc); + if (term + && !raft_should_suppress_disruptive_server(raft, rpc) + && !raft_receive_term__(raft, &rpc->common, term)) { + if (rpc->type == RAFT_RPC_APPEND_REQUEST) { + /* Section 3.3: "If a server receives a request with a stale term + * number, it rejects the request." */ + raft_send_append_reply(raft, raft_append_request_cast(rpc), + RAFT_APPEND_INCONSISTENCY, "stale term"); + } + return; + } + + switch (rpc->type) { +#define RAFT_RPC(ENUM, NAME) \ + case ENUM: \ + raft_handle_##NAME(raft, &rpc->NAME); \ + break; + RAFT_RPC_TYPES +#undef RAFT_RPC + default: + OVS_NOT_REACHED(); + } +} + +static bool +raft_rpc_is_heartbeat(const union raft_rpc *rpc) +{ + return ((rpc->type == RAFT_RPC_APPEND_REQUEST + || rpc->type == RAFT_RPC_APPEND_REPLY) + && rpc->common.comment + && !strcmp(rpc->common.comment, "heartbeat")); +} + + +static bool +raft_send__(struct raft *raft, const union raft_rpc *rpc, + struct raft_conn *conn) +{ + log_rpc(rpc, "-->", conn); + return !jsonrpc_session_send( + conn->js, raft_rpc_to_jsonrpc(&raft->cid, &raft->sid, rpc)); +} + +static bool +raft_is_rpc_synced(const struct raft *raft, const union raft_rpc *rpc) +{ + uint64_t term = raft_rpc_get_term(rpc); + uint64_t index = raft_rpc_get_min_sync_index(rpc); + const struct uuid *vote = raft_rpc_get_vote(rpc); + + return (term <= raft->synced_term + && index <= raft->log_synced + && (!vote || uuid_equals(vote, &raft->synced_vote))); +} + +static bool +raft_send(struct raft *raft, const union raft_rpc *rpc) +{ + const struct uuid *dst = &rpc->common.sid; + if (uuid_equals(dst, &raft->sid)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + 
VLOG_WARN_RL(&rl, "attempting to send RPC to self"); + return false; + } + + struct raft_conn *conn = raft_find_conn_by_sid(raft, dst); + if (!conn) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + char buf[SID_LEN + 1]; + VLOG_DBG_RL(&rl, "%s: no connection to %s, cannot send RPC", + raft->local_nickname, + raft_get_nickname(raft, dst, buf, sizeof buf)); + return false; + } + + if (!raft_is_rpc_synced(raft, rpc)) { + raft_waiter_create(raft, RAFT_W_RPC, false)->rpc = raft_rpc_clone(rpc); + return true; + } + + return raft_send__(raft, rpc, conn); +} + +static struct raft * +raft_lookup_by_name(const char *name) +{ + struct raft *raft; + + HMAP_FOR_EACH_WITH_HASH (raft, hmap_node, hash_string(name, 0), + &all_rafts) { + if (!strcmp(raft->name, name)) { + return raft; + } + } + return NULL; +} + +static void +raft_unixctl_cid(struct unixctl_conn *conn, + int argc OVS_UNUSED, const char *argv[], + void *aux OVS_UNUSED) +{ + struct raft *raft = raft_lookup_by_name(argv[1]); + if (!raft) { + unixctl_command_reply_error(conn, "unknown cluster"); + } else if (uuid_is_zero(&raft->cid)) { + unixctl_command_reply_error(conn, "cluster id not yet known"); + } else { + char *uuid = xasprintf(UUID_FMT, UUID_ARGS(&raft->cid)); + unixctl_command_reply(conn, uuid); + free(uuid); + } +} + +static void +raft_unixctl_sid(struct unixctl_conn *conn, + int argc OVS_UNUSED, const char *argv[], + void *aux OVS_UNUSED) +{ + struct raft *raft = raft_lookup_by_name(argv[1]); + if (!raft) { + unixctl_command_reply_error(conn, "unknown cluster"); + } else { + char *uuid = xasprintf(UUID_FMT, UUID_ARGS(&raft->sid)); + unixctl_command_reply(conn, uuid); + free(uuid); + } +} + +static void +raft_put_sid(const char *title, const struct uuid *sid, + const struct raft *raft, struct ds *s) +{ + ds_put_format(s, "%s: ", title); + if (uuid_equals(sid, &raft->sid)) { + ds_put_cstr(s, "self"); + } else if (uuid_is_zero(sid)) { + ds_put_cstr(s, "unknown"); + } else { + char buf[SID_LEN 
+ 1]; + ds_put_cstr(s, raft_get_nickname(raft, sid, buf, sizeof buf)); + } + ds_put_char(s, '\n'); +} + +static void +raft_unixctl_status(struct unixctl_conn *conn, + int argc OVS_UNUSED, const char *argv[], + void *aux OVS_UNUSED) +{ + struct raft *raft = raft_lookup_by_name(argv[1]); + if (!raft) { + unixctl_command_reply_error(conn, "unknown cluster"); + return; + } + + struct ds s = DS_EMPTY_INITIALIZER; + ds_put_format(&s, "%s\n", raft->local_nickname); + ds_put_format(&s, "Name: %s\n", raft->name); + ds_put_format(&s, "Cluster ID: "); + if (!uuid_is_zero(&raft->cid)) { + ds_put_format(&s, UUID_FMT"\n", UUID_ARGS(&raft->cid)); + } else { + ds_put_format(&s, "not yet known\n"); + } + ds_put_format(&s, "Server ID: "SID_FMT" ("UUID_FMT")\n", + SID_ARGS(&raft->sid), UUID_ARGS(&raft->sid)); + ds_put_format(&s, "Address: %s\n", raft->local_address); + ds_put_format(&s, "Status: %s\n", + raft->joining ? "joining cluster" + : raft->leaving ? "leaving cluster" + : raft->left ? "left cluster" + : raft->failed ? "failed" + : "cluster member"); + if (raft->joining) { + ds_put_format(&s, "Remotes for joining:"); + const char *address; + SSET_FOR_EACH (address, &raft->remote_addresses) { + ds_put_format(&s, " %s", address); + } + ds_put_char(&s, '\n'); + } + if (raft->role == RAFT_LEADER) { + struct raft_server *as; + HMAP_FOR_EACH (as, hmap_node, &raft->add_servers) { + ds_put_format(&s, "Adding server %s ("SID_FMT" at %s) (%s)\n", + as->nickname, SID_ARGS(&as->sid), as->address, + raft_server_phase_to_string(as->phase)); + } + + struct raft_server *rs = raft->remove_server; + if (rs) { + ds_put_format(&s, "Removing server %s ("SID_FMT" at %s) (%s)\n", + rs->nickname, SID_ARGS(&rs->sid), rs->address, + raft_server_phase_to_string(rs->phase)); + } + } + + ds_put_format(&s, "Role: %s\n", + raft->role == RAFT_LEADER ? "leader" + : raft->role == RAFT_CANDIDATE ? "candidate" + : raft->role == RAFT_FOLLOWER ? 
"follower" + : ""); + ds_put_format(&s, "Term: %"PRIu64"\n", raft->term); + raft_put_sid("Leader", &raft->leader_sid, raft, &s); + raft_put_sid("Vote", &raft->vote, raft, &s); + ds_put_char(&s, '\n'); + + ds_put_format(&s, "Log: [%"PRIu64", %"PRIu64"]\n", + raft->log_start, raft->log_end); + + uint64_t n_uncommitted = raft->log_end - raft->commit_index - 1; + ds_put_format(&s, "Entries not yet committed: %"PRIu64"\n", n_uncommitted); + + uint64_t n_unapplied = raft->log_end - raft->last_applied - 1; + ds_put_format(&s, "Entries not yet applied: %"PRIu64"\n", n_unapplied); + + const struct raft_conn *c; + ds_put_cstr(&s, "Connections:"); + LIST_FOR_EACH (c, list_node, &raft->conns) { + bool connected = jsonrpc_session_is_connected(c->js); + ds_put_format(&s, " %s%s%s%s", + connected ? "" : "(", + c->incoming ? "<-" : "->", c->nickname, + connected ? "" : ")"); + } + ds_put_char(&s, '\n'); + + ds_put_cstr(&s, "Servers:\n"); + struct raft_server *server; + HMAP_FOR_EACH (server, hmap_node, &raft->servers) { + ds_put_format(&s, " %s ("SID_FMT" at %s)", + server->nickname, + SID_ARGS(&server->sid), server->address); + if (uuid_equals(&server->sid, &raft->sid)) { + ds_put_cstr(&s, " (me)"); + } + if (server->phase != RAFT_PHASE_STABLE) { + ds_put_format (&s, " (%s)", + raft_server_phase_to_string(server->phase)); + } + if (raft->role == RAFT_CANDIDATE) { + if (!uuid_is_zero(&server->vote)) { + char buf[SID_LEN + 1]; + ds_put_format(&s, " (voted for %s)", + raft_get_nickname(raft, &server->vote, + buf, sizeof buf)); + } + } else if (raft->role == RAFT_LEADER) { + ds_put_format(&s, " next_index=%"PRIu64" match_index=%"PRIu64, + server->next_index, server->match_index); + } + ds_put_char(&s, '\n'); + } + + unixctl_command_reply(conn, ds_cstr(&s)); + ds_destroy(&s); +} + +static void +raft_unixctl_leave__(struct unixctl_conn *conn, struct raft *raft) +{ + if (raft_left(raft)) { + unixctl_command_reply(conn, NULL); + } else if (raft_is_leaving(raft)) { + 
unixctl_command_reply_error(conn, + "already in progress leaving cluster"); + } else if (raft_is_joining(raft)) { + unixctl_command_reply_error(conn, + "can't leave while join in progress"); + } else if (raft_failed(raft)) { + unixctl_command_reply_error(conn, + "can't leave after failure"); + } else { + raft_leave(raft); + unixctl_command_reply(conn, NULL); + } +} + +static void +raft_unixctl_leave(struct unixctl_conn *conn, int argc, const char *argv[], + void *aux OVS_UNUSED) +{ + bool force = argc > 2 && !strcmp(argv[1], "--force"); + if (force) { + argc--; + argv++; + } + if (argc != 2) { + unixctl_command_reply_error(conn, "syntax error"); + return; + } + + struct raft *raft = raft_lookup_by_name(argv[1]); + if (!raft) { + unixctl_command_reply_error(conn, "unknown cluster"); + return; + } + + raft_unixctl_leave__(conn, raft); +} + +static struct raft_server * +raft_lookup_server_best_match(struct raft *raft, const char *id) +{ + struct raft_server *best = NULL; + int best_score = -1; + int n_best = 0; + + struct raft_server *s; + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { + int score = (!strcmp(id, s->address) + ? INT_MAX + : uuid_is_partial_match(&s->sid, id)); + if (score > best_score) { + best = s; + best_score = score; + n_best = 1; + } else if (score == best_score) { + n_best++; + } + } + return n_best == 1 ? 
best : NULL; +} + +static void +raft_unixctl_kick(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[], void *aux OVS_UNUSED) +{ + const char *cluster_name = argv[1]; + const char *server_name = argv[2]; + + struct raft *raft = raft_lookup_by_name(cluster_name); + if (!raft) { + unixctl_command_reply_error(conn, "unknown cluster"); + return; + } + + struct raft_server *server = raft_lookup_server_best_match(raft, + server_name); + if (!server) { + unixctl_command_reply_error(conn, "unknown server"); + return; + } + + if (uuid_equals(&server->sid, &raft->sid)) { + raft_unixctl_leave__(conn, raft); + } else if (raft->role == RAFT_LEADER) { + const struct raft_remove_server_request rq = { + .sid = server->sid, + .requester_conn = conn, + }; + raft_handle_remove_server_request(raft, &rq); + } else { + const union raft_rpc rpc = { + .remove_server_request = { + .common = { + .type = RAFT_RPC_REMOVE_SERVER_REQUEST, + .sid = raft->leader_sid, + .comment = "via unixctl" + }, + .sid = server->sid, + } + }; + if (raft_send(raft, &rpc)) { + unixctl_command_reply(conn, "sent removal request to leader"); + } else { + unixctl_command_reply_error(conn, + "failed to send removal request"); + } + } +} + +static void +raft_init(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + if (!ovsthread_once_start(&once)) { + return; + } + unixctl_command_register("cluster/cid", "DB", 1, 1, + raft_unixctl_cid, NULL); + unixctl_command_register("cluster/sid", "DB", 1, 1, + raft_unixctl_sid, NULL); + unixctl_command_register("cluster/status", "DB", 1, 1, + raft_unixctl_status, NULL); + unixctl_command_register("cluster/leave", "[--force] DB", 1, 2, + raft_unixctl_leave, NULL); + unixctl_command_register("cluster/kick", "DB SERVER", 2, 2, + raft_unixctl_kick, NULL); + ovsthread_once_done(&once); +} diff --git a/ovsdb/raft.h b/ovsdb/raft.h new file mode 100644 index 000000000000..cb537bf207a0 --- /dev/null +++ b/ovsdb/raft.h @@ -0,0 +1,181 @@ +/* + * 
Copyright (c) 2014, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RAFT_H +#define RAFT_H 1 + +#include + +/* Implementation of the Raft consensus algorithm. + * + * + * References + * ========== + * + * Based on Diego Ongaro's Ph.D. thesis, "Consensus: Bridging Theory and + * Practice", available at https://ramcloud.stanford.edu/~ongaro/thesis.pdf. + * References to sections, pages, and figures are from this thesis. Quotations + * in comments also come from this work, in accordance with its license notice, + * reproduced below: + * + * Copyright 2014 by Diego Andres Ongaro. All Rights Reserved. + * + * This work is licensed under a Creative Commons Attribution-3.0 United + * States License. http://creativecommons.org/licenses/by/3.0/us/ + * + * + * Concepts + * ======== + * + * Raft allows a cluster of servers to maintain a distributed log. At any + * given time, at most one of N servers is a leader. The leader can propose + * appending a new entry to the log. If ratified by more than N/2 servers + * (including the leader), the new entry becomes permanently part of the log. + * + * This implementation gives each cluster a name, which is the same as the + * database schema's name, and a UUID, called the cluster ID. Each server has + * its own UUID, called the server ID, and a network address (e.g. an IP + * address and a port). + * + * + * Thread-safety + * ============= + * + * The Raft code is not thread-safe.
Even if separate threads access different + * Raft objects, the implementation can still make unsynchronized cross-thread + * accesses (from unixctl handlers). + */ + +#include +#include +#include "compiler.h" +#include "uuid.h" + +struct json; +struct ovsdb_log; +struct raft; +struct sset; + +#define RAFT_MAGIC "CLUSTER" + +/* Setting up a new cluster or adding a new server to a cluster. + * + * These functions just write an on-disk file. They do not do any network + * activity, which means that the actual work of setting up or joining the + * cluster happens later after raft_open(). */ +struct ovsdb_error *raft_create_cluster(const char *file_name, + const char *name, + const char *local_address, + const struct json *snapshot) + OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *raft_join_cluster(const char *file_name, const char *name, + const char *local_address, + const struct sset *remote_addrs, + const struct uuid *cid) + OVS_WARN_UNUSED_RESULT; + +/* Reading metadata from a server log. */ +struct raft_metadata { + struct uuid sid; /* Server ID. */ + struct uuid cid; /* Cluster ID. All-zeros if not yet known. */ + char *name; /* Schema name. */ + char *local; /* Local address. */ +}; +struct ovsdb_error *raft_read_metadata(struct ovsdb_log *, + struct raft_metadata *) + OVS_WARN_UNUSED_RESULT; +void raft_metadata_destroy(struct raft_metadata *); + +/* Starting up or shutting down a server within a cluster. */ +struct ovsdb_error *raft_open(struct ovsdb_log *, struct raft **) + OVS_WARN_UNUSED_RESULT; +void raft_close(struct raft *); + +void raft_run(struct raft *); +void raft_wait(struct raft *); + +/* Information. */ +const char *raft_get_name(const struct raft *); +const struct uuid *raft_get_cid(const struct raft *); +const struct uuid *raft_get_sid(const struct raft *); +bool raft_is_connected(const struct raft *); +bool raft_is_leader(const struct raft *); + +/* Joining a cluster. */ +bool raft_is_joining(const struct raft *); + +/* Leaving a cluster. 
*/ +void raft_leave(struct raft *); +bool raft_is_leaving(const struct raft *); +bool raft_left(const struct raft *); + +/* Failure. */ +bool raft_failed(const struct raft *); + +/* Reading snapshots and log entries. */ +const struct json *raft_next_entry(struct raft *, struct uuid *eid, + bool *is_snapshot); +bool raft_has_next_entry(const struct raft *); + +uint64_t raft_get_applied_index(const struct raft *); +uint64_t raft_get_commit_index(const struct raft *); + +/* Writing log entries (executing commands). */ +enum raft_command_status { + /* In progress, please wait. */ + RAFT_CMD_INCOMPLETE, + + /* Success. */ + RAFT_CMD_SUCCESS, /* Committed. */ + + /* Failure. + * + * A failure status does not always mean that the operation actually + * failed. In corner cases, it means that the log entry was committed but + * the message reporting success was not successfully received. Thus, this + * Raft implementation implements "at-least-once" rather than + * "exactly-once" semantics. */ + RAFT_CMD_NOT_LEADER, /* Failed because we are not the leader. */ + RAFT_CMD_BAD_PREREQ, /* Failed because prerequisite check failed. */ + RAFT_CMD_LOST_LEADERSHIP, /* Leadership lost after command initiation. */ + RAFT_CMD_SHUTDOWN, /* Raft server joining or left or shut down. */ + RAFT_CMD_IO_ERROR, /* I/O error. */ + RAFT_CMD_TIMEOUT, /* Request to remote leader timed out. */ +}; +const char *raft_command_status_to_string(enum raft_command_status); +bool raft_command_status_from_string(const char *, enum raft_command_status *); + +struct raft_command *raft_command_execute(struct raft *, + const struct json *data, + const struct uuid *prereq, + struct uuid *result) + OVS_WARN_UNUSED_RESULT; +enum raft_command_status raft_command_get_status(const struct raft_command *); +uint64_t raft_command_get_commit_index(const struct raft_command *); +void raft_command_unref(struct raft_command *); +void raft_command_wait(const struct raft_command *); + +/* Replacing the local log by a snapshot. 
*/ +bool raft_grew_lots(const struct raft *); +struct ovsdb_error *raft_store_snapshot(struct raft *, + const struct json *new_snapshot) + OVS_WARN_UNUSED_RESULT; + +/* Cluster management. */ +void raft_take_leadership(struct raft *); +void raft_transfer_leadership(struct raft *, const char *reason); + +#endif /* ovsdb/raft.h */ diff --git a/ovsdb/replication.c b/ovsdb/replication.c index bac46c67f409..9d862f7a6ea7 100644 --- a/ovsdb/replication.c +++ b/ovsdb/replication.c @@ -1,5 +1,5 @@ /* - * (c) Copyright 2016 Hewlett Packard Enterprise Development LP + * (c) Copyright 2016, 2017 Hewlett Packard Enterprise Development LP * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -536,7 +536,7 @@ reset_database(struct ovsdb *db) } } - return ovsdb_txn_commit(txn, false); + return ovsdb_txn_propose_commit_block(txn, false); } /* Create a monitor request for 'db'. The monitor request will include @@ -615,7 +615,7 @@ process_notification(struct json *table_updates, struct ovsdb *db) return error; } else { /* Commit transaction. */ - error = ovsdb_txn_commit(txn, false); + error = ovsdb_txn_propose_commit_block(txn, false); } } diff --git a/ovsdb/row.c b/ovsdb/row.c index 9c312947e539..755ab91a8c1c 100644 --- a/ovsdb/row.c +++ b/ovsdb/row.c @@ -44,6 +44,9 @@ allocate_row(const struct ovsdb_table *table) return row; } +/* Creates and returns a new row suitable for insertion into 'table'. Does not + * actually insert the row into 'table' (use ovsdb_txn_row_insert()). The + * caller must assign a UUID to the row.
*/ struct ovsdb_row * ovsdb_row_create(const struct ovsdb_table *table) { diff --git a/ovsdb/server.c b/ovsdb/server.c index 2a775230da6a..e1a497d78897 100644 --- a/ovsdb/server.c +++ b/ovsdb/server.c @@ -131,20 +131,14 @@ ovsdb_server_init(struct ovsdb_server *server) bool ovsdb_server_add_db(struct ovsdb_server *server, struct ovsdb *db) { - return shash_add_once(&server->dbs, db->schema->name, db); + return shash_add_once(&server->dbs, db->name, db); } -/* Removes 'db' from the set of databases served out by 'server'. Returns - * true if successful, false if there is no db associated with - * db->schema->name. */ -bool +/* Removes 'db' from the set of databases served out by 'server'. */ +void ovsdb_server_remove_db(struct ovsdb_server *server, struct ovsdb *db) { - void *data = shash_find_and_delete(&server->dbs, db->schema->name); - if (data) { - return true; - } - return false; + shash_find_and_delete_assert(&server->dbs, db->name); } /* Destroys 'server'. */ diff --git a/ovsdb/server.h b/ovsdb/server.h index 21bf1adde7af..6d997e608e66 100644 --- a/ovsdb/server.h +++ b/ovsdb/server.h @@ -86,7 +86,7 @@ struct ovsdb_server { void ovsdb_server_init(struct ovsdb_server *); bool ovsdb_server_add_db(struct ovsdb_server *, struct ovsdb *); -bool ovsdb_server_remove_db(struct ovsdb_server *, struct ovsdb *); +void ovsdb_server_remove_db(struct ovsdb_server *, struct ovsdb *); void ovsdb_server_destroy(struct ovsdb_server *); struct ovsdb_lock_waiter *ovsdb_server_lock(struct ovsdb_server *, diff --git a/ovsdb/storage.c b/ovsdb/storage.c new file mode 100644 index 000000000000..34590f374188 --- /dev/null +++ b/ovsdb/storage.c @@ -0,0 +1,574 @@ +/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "storage.h" +#include +#include "log.h" +#include "ovsdb-error.h" +#include "openvswitch/json.h" +#include "openvswitch/poll-loop.h" +#include "openvswitch/vlog.h" +#include "ovsdb.h" +#include "raft.h" +#include "random.h" +#include "timeval.h" +#include "util.h" + +VLOG_DEFINE_THIS_MODULE(storage); + +struct ovsdb_storage { + /* There are three kinds of storage: + * + * - Standalone, backed by a disk file. 'log' is nonnull, 'raft' is + * null. + * + * - Clustered, backed by a Raft cluster. 'log' is null, 'raft' is + * nonnull. + * + * - Memory only, unbacked. 'log' and 'raft' are null. */ + struct ovsdb_log *log; + struct raft *raft; + + /* All kinds of storage. */ + struct ovsdb_error *error; /* If nonnull, a permanent error. */ + long long next_snapshot; /* Time at which to take next snapshot. */ + + /* Standalone only. */ + unsigned int n_read; + unsigned int n_written; +}; + +static long long int next_snapshot_time(bool quick); + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_storage_open__(const char *filename, bool rw, bool allow_clustered, + struct ovsdb_storage **storagep) +{ + *storagep = NULL; + + struct ovsdb_log *log; + struct ovsdb_error *error; + error = ovsdb_log_open(filename, OVSDB_MAGIC"|"RAFT_MAGIC, + rw ? 
OVSDB_LOG_READ_WRITE : OVSDB_LOG_READ_ONLY, + -1, &log); + if (error) { + return error; + } + + struct raft *raft = NULL; + if (!strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC)) { + if (!allow_clustered) { + ovsdb_log_close(log); + return ovsdb_error(NULL, "%s: cannot apply this operation to " + "clustered database file", filename); + } + error = raft_open(log, &raft); + log = NULL; + if (error) { + return error; + } + } + + struct ovsdb_storage *storage = xzalloc(sizeof *storage); + storage->log = log; + storage->raft = raft; + storage->next_snapshot = next_snapshot_time(false); + *storagep = storage; + return NULL; +} + +/* Opens 'filename' for use as storage. If 'rw', opens it for read/write + * access, otherwise read-only. If successful, stores the new storage in + * '*storagep' and returns NULL; on failure, stores NULL in '*storagep' and + * returns the error. + * + * The returned storage might be clustered or standalone, depending on what the + * disk file contains. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_storage_open(const char *filename, bool rw, + struct ovsdb_storage **storagep) +{ + return ovsdb_storage_open__(filename, rw, true, storagep); +} + +/* Creates and returns new storage without any backing. Nothing will be read + * from the storage, and writes are discarded. */ +struct ovsdb_storage * +ovsdb_storage_create_unbacked(void) +{ + struct ovsdb_storage *storage = xzalloc(sizeof *storage); + storage->next_snapshot = LLONG_MAX; + return storage; +} + +void +ovsdb_storage_close(struct ovsdb_storage *storage) +{ + if (storage) { + ovsdb_log_close(storage->log); + raft_close(storage->raft); + ovsdb_error_destroy(storage->error); + free(storage); + } +} + +const char * +ovsdb_storage_get_model(const struct ovsdb_storage *storage) +{ + return storage->raft ? 
"clustered" : "standalone"; +} + +bool +ovsdb_storage_is_clustered(const struct ovsdb_storage *storage) +{ + return storage->raft != NULL; +} + +bool +ovsdb_storage_is_connected(const struct ovsdb_storage *storage) +{ + return !storage->raft || raft_is_connected(storage->raft); +} + +bool +ovsdb_storage_is_dead(const struct ovsdb_storage *storage) +{ + return storage->raft && raft_left(storage->raft); +} + +bool +ovsdb_storage_is_leader(const struct ovsdb_storage *storage) +{ + return !storage->raft || raft_is_leader(storage->raft); +} + +const struct uuid * +ovsdb_storage_get_cid(const struct ovsdb_storage *storage) +{ + return storage->raft ? raft_get_cid(storage->raft) : NULL; +} + +const struct uuid * +ovsdb_storage_get_sid(const struct ovsdb_storage *storage) +{ + return storage->raft ? raft_get_sid(storage->raft) : NULL; +} + +uint64_t +ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage) +{ + return storage->raft ? raft_get_applied_index(storage->raft) : 0; +} + +void +ovsdb_storage_run(struct ovsdb_storage *storage) +{ + if (storage->raft) { + raft_run(storage->raft); + } +} + +void +ovsdb_storage_wait(struct ovsdb_storage *storage) +{ + if (storage->raft) { + raft_wait(storage->raft); + } +} + +/* Returns 'storage''s embedded name, if it has one, otherwise null. + * + * Only clustered storage has a built-in name. */ +const char * +ovsdb_storage_get_name(const struct ovsdb_storage *storage) +{ + return storage->raft ? raft_get_name(storage->raft) : NULL; +} + +/* Attempts to read a log record from 'storage'. + * + * If successful, returns NULL and stores the record's schema in '*schemap' + * and its transaction in '*txnp' (each NULL if absent). The caller owns both + * and must free them (with ovsdb_schema_destroy() and json_destroy()). + * + * If a read error occurs, returns the error and stores NULL in both. + * + * If the read reaches end of file, returns NULL and stores NULL in both + * '*schemap' and '*txnp'.
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_storage_read(struct ovsdb_storage *storage, + struct ovsdb_schema **schemap, + struct json **txnp, + struct uuid *txnid) +{ + *schemap = NULL; + *txnp = NULL; + if (txnid) { + *txnid = UUID_ZERO; + } + + struct json *json; + struct json *schema_json = NULL; + struct json *txn_json = NULL; + if (storage->raft) { + bool is_snapshot; + json = json_nullable_clone( + raft_next_entry(storage->raft, txnid, &is_snapshot)); + if (!json) { + return NULL; + } else if (json->type != JSON_ARRAY || json->u.array.n != 2) { + json_destroy(json); + return ovsdb_error(NULL, "invalid commit format"); + } + + struct json **e = json->u.array.elems; + schema_json = e[0]->type != JSON_NULL ? e[0] : NULL; + txn_json = e[1]->type != JSON_NULL ? e[1] : NULL; + } else if (storage->log) { + struct ovsdb_error *error = ovsdb_log_read(storage->log, &json); + if (error || !json) { + return error; + } + + unsigned int n = storage->n_read++; + struct json **jsonp = !n ? &schema_json : &txn_json; + *jsonp = json; + if (n == 1) { + ovsdb_log_mark_base(storage->log); + } + } else { + /* Unbacked. Nothing to do. */ + return NULL; + } + + /* If we got this far then we must have at least a schema or a + * transaction. 
*/ + ovs_assert(schema_json || txn_json); + + if (schema_json) { + struct ovsdb_schema *schema; + struct ovsdb_error *error = ovsdb_schema_from_json(schema_json, + &schema); + if (error) { + json_destroy(json); + return error; + } + + const char *storage_name = ovsdb_storage_get_name(storage); + const char *schema_name = schema->name; + if (storage_name && strcmp(storage_name, schema_name)) { + error = ovsdb_error(NULL, "name %s in header does not match " + "name %s in schema", + storage_name, schema_name); + json_destroy(json); + ovsdb_schema_destroy(schema); + return error; + } + + *schemap = schema; + } + + if (txn_json) { + *txnp = json_clone(txn_json); + } + + json_destroy(json); + return NULL; +} + +bool +ovsdb_storage_read_wait(struct ovsdb_storage *storage) +{ + if (storage->raft) { + return raft_has_next_entry(storage->raft); + } else { + /* XXX */ + return false; + } +} + +void +ovsdb_storage_unread(struct ovsdb_storage *storage) +{ + if (storage->error) { + return; + } + + if (storage->raft) { + if (!storage->error) { + storage->error = ovsdb_error(NULL, "inconsistent data"); + } + } else if (storage->log) { + ovsdb_log_unread(storage->log); + } +} + +struct ovsdb_write { + struct ovsdb_error *error; + struct raft_command *command; +}; + +/* Not suitable for writing transactions that change the schema. 
*/ +struct ovsdb_write * OVS_WARN_UNUSED_RESULT +ovsdb_storage_write(struct ovsdb_storage *storage, const struct json *data, + const struct uuid *prereq, struct uuid *resultp, + bool durable) +{ + struct ovsdb_write *w = xzalloc(sizeof *w); + struct uuid result = UUID_ZERO; + if (storage->error) { + w->error = ovsdb_error_clone(storage->error); + } else if (storage->raft) { + struct json *txn_json = json_array_create_2(json_null_create(), + json_clone(data)); + w->command = raft_command_execute(storage->raft, txn_json, + prereq, &result); + json_destroy(txn_json); + } else if (storage->log) { + w->error = ovsdb_log_write(storage->log, data); + if (!w->error) { + storage->n_written++; + if (durable) { + w->error = ovsdb_log_commit_block(storage->log); + } + } + } else { + /* When 'error' and 'command' are both null, it indicates that the + * command is complete. This is fine since this unbacked storage drops + * writes. */ + } + if (resultp) { + *resultp = result; + } + return w; +} + +/* Not suitable for writing transactions that change the schema. 
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_storage_write_block(struct ovsdb_storage *storage, + const struct json *data, const struct uuid *prereq, + struct uuid *resultp, bool durable) +{ + struct ovsdb_write *w = ovsdb_storage_write(storage, data, + prereq, resultp, durable); + while (!ovsdb_write_is_complete(w)) { + if (storage->raft) { + raft_run(storage->raft); + } + + ovsdb_write_wait(w); + if (storage->raft) { + raft_wait(storage->raft); + } + poll_block(); + } + + struct ovsdb_error *error = ovsdb_error_clone(ovsdb_write_get_error(w)); + ovsdb_write_destroy(w); + return error; +} + +bool +ovsdb_write_is_complete(const struct ovsdb_write *w) +{ + return (w->error + || !w->command + || raft_command_get_status(w->command) != RAFT_CMD_INCOMPLETE); +} + +const struct ovsdb_error * +ovsdb_write_get_error(const struct ovsdb_write *w_) +{ + struct ovsdb_write *w = CONST_CAST(struct ovsdb_write *, w_); + ovs_assert(ovsdb_write_is_complete(w)); + + if (w->command && !w->error) { + enum raft_command_status status = raft_command_get_status(w->command); + if (status != RAFT_CMD_SUCCESS) { + w->error = ovsdb_error("cluster error", "%s", + raft_command_status_to_string(status)); + } + } + + return w->error; +} + +uint64_t +ovsdb_write_get_commit_index(const struct ovsdb_write *w) +{ + ovs_assert(ovsdb_write_is_complete(w)); + return (w->command && !w->error + ? 
raft_command_get_commit_index(w->command) + : 0); +} + +void +ovsdb_write_wait(const struct ovsdb_write *w) +{ + if (ovsdb_write_is_complete(w)) { + poll_immediate_wake(); + } +} + +void +ovsdb_write_destroy(struct ovsdb_write *w) +{ + if (w) { + raft_command_unref(w->command); + ovsdb_error_destroy(w->error); + free(w); + } +} + +static long long int +next_snapshot_time(bool quick) +{ + unsigned int base = 10 * 60 * 1000; /* 10 minutes */ + unsigned int range = 10 * 60 * 1000; /* 10 minutes */ + if (quick) { + base /= 10; + range /= 10; + } + + return time_msec() + base + random_range(range); +} + +bool +ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage) +{ + if (time_msec() < storage->next_snapshot) { + return false; + } + + if (storage->raft) { + return raft_grew_lots(storage->raft); + } else if (storage->log) { + return (storage->n_read + storage->n_written >= 100 + && ovsdb_log_grew_lots(storage->log)); + } + + return false; +} + +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_storage_store_snapshot__(struct ovsdb_storage *storage, + const struct json *schema, + const struct json *data) +{ + if (storage->raft) { + struct json *entries = json_array_create_empty(); + if (schema) { + json_array_add(entries, json_clone(schema)); + } + if (data) { + json_array_add(entries, json_clone(data)); + } + struct ovsdb_error *error = raft_store_snapshot(storage->raft, + entries); + json_destroy(entries); + return error; + } else if (storage->log) { + struct json *entries[2]; + size_t n = 0; + if (schema) { + entries[n++] = CONST_CAST(struct json *, schema); + } + if (data) { + entries[n++] = CONST_CAST(struct json *, data); + } + return ovsdb_log_replace(storage->log, entries, n); + } else { + return NULL; + } +} + +/* 'schema' and 'data' should faithfully represent the current schema and data, + * otherwise the two storing backing formats will yield divergent results. Use + * ovsdb_storage_write_schema_change() to change the schema. 
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_storage_store_snapshot(struct ovsdb_storage *storage, + const struct json *schema, + const struct json *data) +{ + struct ovsdb_error *error = ovsdb_storage_store_snapshot__(storage, + schema, data); + bool retry_quickly = error != NULL; + storage->next_snapshot = next_snapshot_time(retry_quickly); + return error; +} + +struct ovsdb_write * OVS_WARN_UNUSED_RESULT +ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, + const struct json *schema, + const struct json *data, + const struct uuid *prereq, + struct uuid *resultp) +{ + struct ovsdb_write *w = xzalloc(sizeof *w); + struct uuid result = UUID_ZERO; + if (storage->error) { + w->error = ovsdb_error_clone(storage->error); + } else if (storage->raft) { + struct json *txn_json = json_array_create_2(json_clone(schema), + json_clone(data)); + w->command = raft_command_execute(storage->raft, txn_json, + prereq, &result); + json_destroy(txn_json); + } else if (storage->log) { + w->error = ovsdb_storage_store_snapshot__(storage, schema, data); + } else { + /* When 'error' and 'command' are both null, it indicates that the + * command is complete. This is fine since this unbacked storage drops + * writes. 
*/ + } + if (resultp) { + *resultp = result; + } + return w; +} + +struct ovsdb_storage * +ovsdb_storage_open_standalone(const char *filename, bool rw) +{ + struct ovsdb_storage *storage; + struct ovsdb_error *error = ovsdb_storage_open__(filename, rw, false, + &storage); + if (error) { + ovs_fatal(0, "%s", ovsdb_error_to_string_free(error)); + } + return storage; +} + +struct ovsdb_schema * +ovsdb_storage_read_schema(struct ovsdb_storage *storage) +{ + ovs_assert(storage->log); + + struct json *txn_json; + struct ovsdb_schema *schema; + struct ovsdb_error *error = ovsdb_storage_read(storage, &schema, + &txn_json, NULL); + if (error) { + ovs_fatal(0, "%s", ovsdb_error_to_string_free(error)); + } + if (!schema && !txn_json) { + ovs_fatal(0, "unexpected end of file reading schema"); + } + ovs_assert(schema && !txn_json); + + return schema; +} diff --git a/ovsdb/storage.h b/ovsdb/storage.h new file mode 100644 index 000000000000..cb5bcb656080 --- /dev/null +++ b/ovsdb/storage.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef OVSDB_STORAGE_H +#define OVSDB_STORAGE_H 1 + +#include +#include +#include "compiler.h" + +struct json; +struct ovsdb_schema; +struct ovsdb_storage; +struct ovsdb_completion; +struct uuid; + +struct ovsdb_error *ovsdb_storage_open(const char *filename, bool rw, + struct ovsdb_storage **) + OVS_WARN_UNUSED_RESULT; +struct ovsdb_storage *ovsdb_storage_create_unbacked(void); +void ovsdb_storage_close(struct ovsdb_storage *); + +const char *ovsdb_storage_get_model(const struct ovsdb_storage *); +bool ovsdb_storage_is_clustered(const struct ovsdb_storage *); +bool ovsdb_storage_is_connected(const struct ovsdb_storage *); +bool ovsdb_storage_is_dead(const struct ovsdb_storage *); +bool ovsdb_storage_is_leader(const struct ovsdb_storage *); +const struct uuid *ovsdb_storage_get_cid(const struct ovsdb_storage *); +const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *); +uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *); + +void ovsdb_storage_run(struct ovsdb_storage *); +void ovsdb_storage_wait(struct ovsdb_storage *); + +const char *ovsdb_storage_get_name(const struct ovsdb_storage *); + +struct ovsdb_error *ovsdb_storage_read(struct ovsdb_storage *, + struct ovsdb_schema **schemap, + struct json **txnp, + struct uuid *txnid) + OVS_WARN_UNUSED_RESULT; +bool ovsdb_storage_read_wait(struct ovsdb_storage *); + +void ovsdb_storage_unread(struct ovsdb_storage *); + +struct ovsdb_write *ovsdb_storage_write(struct ovsdb_storage *, + const struct json *, + const struct uuid *prereq, + struct uuid *result, + bool durable) + OVS_WARN_UNUSED_RESULT; +struct ovsdb_error *ovsdb_storage_write_block(struct ovsdb_storage *, + const struct json *, + const struct uuid *prereq, + struct uuid *result, + bool durable); + +bool ovsdb_write_is_complete(const struct ovsdb_write *); +const struct ovsdb_error *ovsdb_write_get_error(const struct ovsdb_write *); +uint64_t ovsdb_write_get_commit_index(const struct ovsdb_write *); +void 
ovsdb_write_wait(const struct ovsdb_write *); +void ovsdb_write_destroy(struct ovsdb_write *); + +bool ovsdb_storage_should_snapshot(const struct ovsdb_storage *); +struct ovsdb_error *ovsdb_storage_store_snapshot(struct ovsdb_storage *storage, + const struct json *schema, + const struct json *snapshot) + OVS_WARN_UNUSED_RESULT; + +struct ovsdb_write *ovsdb_storage_write_schema_change( + struct ovsdb_storage *, + const struct json *schema, const struct json *data, + const struct uuid *prereq, struct uuid *result) + OVS_WARN_UNUSED_RESULT; + +/* Convenience functions for ovsdb-tool and other command-line utilities, + * for use with standalone database files only, which terminate the process + * on error. */ +struct ovsdb_storage *ovsdb_storage_open_standalone(const char *filename, + bool rw); +struct ovsdb_schema *ovsdb_storage_read_schema(struct ovsdb_storage *); + +#endif /* ovsdb/storage.h */ diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 893ea1152c5a..de3cb5995af1 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -25,13 +25,17 @@ #include "openvswitch/hmap.h" #include "openvswitch/json.h" #include "openvswitch/list.h" +#include "openvswitch/poll-loop.h" +#include "openvswitch/vlog.h" #include "ovsdb-error.h" #include "ovsdb.h" #include "row.h" +#include "storage.h" #include "table.h" -#include "perf-counter.h" #include "uuid.h" +VLOG_DEFINE_THIS_MODULE(transaction); + struct ovsdb_txn { struct ovsdb *db; struct ovs_list txn_tables; /* Contains "struct ovsdb_txn_table"s. 
*/ @@ -812,8 +816,8 @@ ovsdb_txn_is_empty(const struct ovsdb_txn *txn) return ovs_list_is_empty(&txn->txn_tables); } -struct ovsdb_error * OVS_WARN_UNUSED_RESULT -ovsdb_txn_start_commit(struct ovsdb_txn *txn) +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_txn_precommit(struct ovsdb_txn *txn) { struct ovsdb_error *error; @@ -824,7 +828,7 @@ ovsdb_txn_start_commit(struct ovsdb_txn *txn) ovsdb_txn_abort(txn); return OVSDB_WRAP_BUG("can't happen", error); } - if (ovsdb_txn_is_empty(txn)) { + if (ovs_list_is_empty(&txn->txn_tables)) { return NULL; } @@ -865,41 +869,193 @@ ovsdb_txn_start_commit(struct ovsdb_txn *txn) return OVSDB_WRAP_BUG("can't happen", error); } - return NULL; + return error; } -struct ovsdb_error * -ovsdb_txn_finish_commit(struct ovsdb_txn *txn, bool durable) +/* Finalize commit. */ +void +ovsdb_txn_complete(struct ovsdb_txn *txn) { - /* Send the commit to each replica. */ - if (txn->db->file) { - struct ovsdb_error *error = ovsdb_file_commit(txn->db->file, txn, - durable); - if (error) { - ovsdb_txn_abort(txn); + if (!ovsdb_txn_is_empty(txn)) { + txn->db->run_triggers = true; + ovsdb_monitors_commit(txn->db, txn); + ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs)); + ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_row_commit)); + } + ovsdb_txn_free(txn); +} + +/* Applies 'txn' to the internal representation of the database. This is for + * transactions that don't need to be written to storage; probably, they came + * from storage. These transactions shouldn't ordinarily fail because storage + * should contain only consistent transactions. (One exception is for database + * conversion in ovsdb_convert().) 
*/ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_txn_replay_commit(struct ovsdb_txn *txn) +{ + struct ovsdb_error *error = ovsdb_txn_precommit(txn); + if (error) { + ovsdb_txn_abort(txn); + } else { + ovsdb_txn_complete(txn); + } + return error; +} + +/* If 'error' is nonnull, the transaction is complete, with the given error as + * the result. + * + * Otherwise, if 'write' is nonnull, then the transaction is waiting for + * 'write' to complete. + * + * Otherwise, if 'commit_index' is nonzero, then the transaction is waiting for + * 'commit_index' to be applied to the storage. + * + * Otherwise, the transaction is complete and successful. */ +struct ovsdb_txn_progress { + struct ovsdb_error *error; + struct ovsdb_write *write; + uint64_t commit_index; + + struct ovsdb_storage *storage; +}; + +struct ovsdb_txn_progress * +ovsdb_txn_propose_schema_change(struct ovsdb *db, + const struct json *schema, + const struct json *data) +{ + struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress); + progress->storage = db->storage; + + struct uuid next; + struct ovsdb_write *write = ovsdb_storage_write_schema_change( + db->storage, schema, data, &db->prereq, &next); + if (!ovsdb_write_is_complete(write)) { + progress->write = write; + } else { + progress->error = ovsdb_error_clone(ovsdb_write_get_error(write)); + ovsdb_write_destroy(write); + } + return progress; +} + +struct ovsdb_txn_progress * +ovsdb_txn_propose_commit(struct ovsdb_txn *txn, bool durable) +{ + struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress); + progress->storage = txn->db->storage; + progress->error = ovsdb_txn_precommit(txn); + if (progress->error) { + return progress; + } + + /* Turn the commit into the format used for the storage logs.. */ + struct json *txn_json = ovsdb_file_txn_to_json(txn); + if (!txn_json) { + /* Nothing to do, so success. 
*/ + return progress; + } + txn_json = ovsdb_file_txn_annotate(txn_json, ovsdb_txn_get_comment(txn)); + + struct uuid next; + struct ovsdb_write *write = ovsdb_storage_write( + txn->db->storage, txn_json, &txn->db->prereq, &next, durable); + json_destroy(txn_json); + if (!ovsdb_write_is_complete(write)) { + progress->write = write; + } else { + progress->error = ovsdb_error_clone(ovsdb_write_get_error(write)); + ovsdb_write_destroy(write); + } + return progress; +} + +/* Proposes 'txn' for commitment and then waits for the commit to succeed or + * fail. Returns null if successful, otherwise the error. + * + * **In addition**, this function also completes or aborts the transaction if + * the transaction succeeded or failed, respectively. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_txn_propose_commit_block(struct ovsdb_txn *txn, bool durable) +{ + struct ovsdb_txn_progress *p = ovsdb_txn_propose_commit(txn, durable); + for (;;) { + ovsdb_storage_run(p->storage); + if (ovsdb_txn_progress_is_complete(p)) { + struct ovsdb_error *error + = ovsdb_error_clone(ovsdb_txn_progress_get_error(p)); + ovsdb_txn_progress_destroy(p); + + if (error) { + ovsdb_txn_abort(txn); + } else { + ovsdb_txn_complete(txn); + } + return error; } + ovsdb_storage_wait(p->storage); + poll_block(); } - ovsdb_monitors_commit(txn->db, txn); +} - /* Finalize commit.
*/ - txn->db->run_triggers = true; - ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs)); - ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_row_commit)); - ovsdb_txn_free(txn); +static void +ovsdb_txn_progress_run(struct ovsdb_txn_progress *p) +{ + if (p->error) { + return; + } - return NULL; + if (p->write) { + if (!ovsdb_write_is_complete(p->write)) { + return; + } + p->error = ovsdb_error_clone(ovsdb_write_get_error(p->write)); + p->commit_index = ovsdb_write_get_commit_index(p->write); + ovsdb_write_destroy(p->write); + p->write = NULL; + + if (p->error) { + return; + } + } + + if (p->commit_index) { + if (ovsdb_storage_get_applied_index(p->storage) >= p->commit_index) { + p->commit_index = 0; + } + } } -struct ovsdb_error * -ovsdb_txn_commit(struct ovsdb_txn *txn, bool durable) +static bool +ovsdb_txn_progress_is_complete__(const struct ovsdb_txn_progress *p) { - struct ovsdb_error *error = ovsdb_txn_start_commit(txn); - if (error || ovsdb_txn_is_empty(txn)) { - ovsdb_txn_abort(txn); - return error; + return p->error || (!p->write && !p->commit_index); +} + +bool +ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *p) +{ + ovsdb_txn_progress_run(CONST_CAST(struct ovsdb_txn_progress *, p)); + return ovsdb_txn_progress_is_complete__(p); +} + +const struct ovsdb_error * +ovsdb_txn_progress_get_error(const struct ovsdb_txn_progress *p) +{ + ovs_assert(ovsdb_txn_progress_is_complete__(p)); + return p->error; +} + +void +ovsdb_txn_progress_destroy(struct ovsdb_txn_progress *p) +{ + if (p) { + ovsdb_error_destroy(p->error); + ovsdb_write_destroy(p->write); + free(p); } - return ovsdb_txn_finish_commit(txn, durable); } void diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h index f9b886411bf4..32384fcd3502 100644 --- a/ovsdb/transaction.h +++ b/ovsdb/transaction.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2010 Nicira, Inc. +/* Copyright (c) 2009, 2010, 2017 Nicira, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,12 +27,23 @@ struct uuid; struct ovsdb_txn *ovsdb_txn_create(struct ovsdb *); void ovsdb_txn_abort(struct ovsdb_txn *); -struct ovsdb_error *ovsdb_txn_start_commit(struct ovsdb_txn *) +struct ovsdb_error *ovsdb_txn_replay_commit(struct ovsdb_txn *) OVS_WARN_UNUSED_RESULT; -struct ovsdb_error *ovsdb_txn_finish_commit(struct ovsdb_txn *, bool durable) +struct ovsdb_txn_progress *ovsdb_txn_propose_commit(struct ovsdb_txn *, + bool durable) OVS_WARN_UNUSED_RESULT; -struct ovsdb_error *ovsdb_txn_commit(struct ovsdb_txn *, bool durable) +struct ovsdb_error *ovsdb_txn_propose_commit_block(struct ovsdb_txn *, + bool durable) OVS_WARN_UNUSED_RESULT; +void ovsdb_txn_complete(struct ovsdb_txn *); + +struct ovsdb_txn_progress *ovsdb_txn_propose_schema_change( + struct ovsdb *, const struct json *schema, const struct json *data); + +bool ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *); +const struct ovsdb_error *ovsdb_txn_progress_get_error( + const struct ovsdb_txn_progress *); +void ovsdb_txn_progress_destroy(struct ovsdb_txn_progress *); struct ovsdb_row *ovsdb_txn_row_modify(struct ovsdb_txn *, const struct ovsdb_row *); diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 346db7b5fb28..543b31d68dd9 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -18,20 +18,25 @@ #include "trigger.h" #include +#include #include "file.h" -#include "log.h" #include "openvswitch/json.h" #include "jsonrpc.h" #include "ovsdb.h" #include "ovsdb-error.h" #include "openvswitch/poll-loop.h" #include "server.h" +#include "transaction.h" +#include "openvswitch/vlog.h" #include "util.h" +VLOG_DEFINE_THIS_MODULE(trigger); static bool ovsdb_trigger_try(struct ovsdb_trigger *, long long int now); -static void trigger_error(struct ovsdb_trigger *, struct ovsdb_error *); +static void ovsdb_trigger_complete(struct ovsdb_trigger *); +static void 
trigger_convert_error(struct ovsdb_trigger *, + struct ovsdb_error *); static void trigger_success(struct ovsdb_trigger *, struct json *result); bool @@ -47,6 +52,7 @@ ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db, ovs_list_push_back(&trigger->db->triggers, &trigger->node); trigger->request = request; trigger->reply = NULL; + trigger->progress = NULL; trigger->created = now; trigger->timeout_msec = LLONG_MAX; trigger->read_only = read_only; @@ -58,6 +64,7 @@ ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db, void ovsdb_trigger_destroy(struct ovsdb_trigger *trigger) { + ovsdb_txn_progress_destroy(trigger->progress); ovs_list_remove(&trigger->node); jsonrpc_msg_destroy(trigger->request); jsonrpc_msg_destroy(trigger->reply); @@ -68,7 +75,7 @@ ovsdb_trigger_destroy(struct ovsdb_trigger *trigger) bool ovsdb_trigger_is_complete(const struct ovsdb_trigger *trigger) { - return trigger->reply != NULL; + return trigger->reply && !trigger->progress; } struct jsonrpc_msg * @@ -79,21 +86,50 @@ ovsdb_trigger_steal_reply(struct ovsdb_trigger *trigger) return reply; } +/* Cancels 'trigger', making its reply a JSON-RPC error that may be sent to a + * client to indicate that 'trigger' was canceled. */ void -ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger) +ovsdb_trigger_cancel(struct ovsdb_trigger *trigger, const char *reason) { + if (trigger->progress) { + /* XXX The transaction still might complete asynchronously. */ + ovsdb_txn_progress_destroy(trigger->progress); + trigger->progress = NULL; + } + + jsonrpc_msg_destroy(trigger->reply); + trigger->reply = NULL; + if (!strcmp(trigger->request->method, "transact")) { - trigger_error(trigger, ovsdb_error("canceled", NULL)); + /* There's no place to stick 'reason' into the error reply because RFC + * 7047 prescribes a fixed form for these messages, see section 4.1.4. 
*/ + trigger->reply = jsonrpc_create_error(json_string_create("canceled"), + trigger->request->id); } else if (!strcmp(trigger->request->method, "convert")) { - /* We don't cancel "convert" requests when a database is being replaced - * for two reasons. First, we expect the administrator to do some kind - * of sensible synchronization on conversion requests, that is, it only - * really makes sense for the admin to do a single conversion at a time - * at a scheduled point. Second, if we did then every "convert" - * request would end up getting canceled since "convert" itself causes - * the database to be replaced. */ - } else { - OVS_NOT_REACHED(); + trigger_convert_error( + trigger, + ovsdb_error("canceled", "database conversion canceled because %s", + reason)); + } +} + +void +ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger) +{ + if (!ovsdb_trigger_is_complete(trigger)) { + if (!strcmp(trigger->request->method, "transact")) { + ovsdb_trigger_cancel(trigger, "database schema is changing"); + } else if (!strcmp(trigger->request->method, "convert")) { + /* We don't cancel "convert" requests when a database is being + * replaced for two reasons. First, we expect the administrator to + * do some kind of sensible synchronization on conversion requests, + * that is, it only really makes sense for the admin to do a single + * conversion at a time at a scheduled point. Second, if we did + * then every "convert" request would end up getting canceled since + * "convert" itself causes the database to be replaced. 
*/ + } else { + OVS_NOT_REACHED(); + } } } @@ -108,7 +144,9 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now) bool disconnect_all = false; LIST_FOR_EACH_SAFE (t, next, node, &db->triggers) { - if (run_triggers || now - t->created >= t->timeout_msec) { + if (run_triggers + || now - t->created >= t->timeout_msec + || t->progress) { if (ovsdb_trigger_try(t, now)) { disconnect_all = true; } @@ -147,81 +185,190 @@ ovsdb_trigger_wait(struct ovsdb *db, long long int now) static bool ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) { - if (!strcmp(t->request->method, "transact")) { - struct json *result = ovsdb_execute(t->db, t->session, - t->request->params, t->read_only, - t->role, t->id, now - t->created, - &t->timeout_msec); - if (result) { - trigger_success(t, result); - } - return false; - } else if (!strcmp(t->request->method, "convert")) { - /* Permission check. */ - if (t->role && *t->role) { - trigger_error(t, ovsdb_perm_error( - "RBAC rules for client \"%s\" role \"%s\" " - "prohibit \"convert\" of database %s " - "(only the root role may convert databases)", - t->id, t->role, t->db->schema->name)); - return false; + /* Handle "initialized" state. */ + if (!t->reply) { + ovs_assert(!t->progress); + + struct ovsdb_txn *txn = NULL; + struct ovsdb *newdb = NULL; + if (!strcmp(t->request->method, "transact")) { + bool durable; + + struct json *result; + txn = ovsdb_execute_compose( + t->db, t->session, t->request->params, t->read_only, + t->role, t->id, now - t->created, &t->timeout_msec, + &durable, &result); + if (!txn) { + if (result) { + /* Complete. There was an error but we still represent it + * in JSON-RPC as a successful result. */ + trigger_success(t, result); + } else { + /* Unsatisfied "wait" condition. Take no action now, retry + * later. */ + } + return false; + } + + /* Transition to "committing" state. 
*/ + t->reply = jsonrpc_create_reply(result, t->request->id); + t->progress = ovsdb_txn_propose_commit(txn, durable); + } else if (!strcmp(t->request->method, "convert")) { + /* Permission check. */ + if (t->role && *t->role) { + trigger_convert_error( + t, ovsdb_perm_error( + "RBAC rules for client \"%s\" role \"%s\" prohibit " + "\"convert\" of database %s " + "(only the root role may convert databases)", + t->id, t->role, t->db->schema->name)); + return false; + } + + /* Validate parameters. */ + const struct json *params = t->request->params; + if (params->type != JSON_ARRAY || params->u.array.n != 2) { + trigger_convert_error(t, ovsdb_syntax_error(params, NULL, + "array expected")); + return false; + } + + /* Parse new schema and make a converted copy. */ + const struct json *new_schema_json = params->u.array.elems[1]; + struct ovsdb_schema *new_schema; + struct ovsdb_error *error + = ovsdb_schema_from_json(new_schema_json, &new_schema); + if (!error && strcmp(new_schema->name, t->db->schema->name)) { + error = ovsdb_error("invalid parameters", + "new schema name (%s) does not match " + "database name (%s)", + new_schema->name, t->db->schema->name); + } + if (!error) { + error = ovsdb_convert(t->db, new_schema, &newdb); + } + if (error) { + ovsdb_schema_destroy(new_schema); + trigger_convert_error(t, error); + return false; + } + + /* Make the new copy into a transaction log record. */ + struct json *txn_json = ovsdb_to_txn_json( + newdb, "converted by ovsdb-server"); + + /* Propose the change. */ + t->progress = ovsdb_txn_propose_schema_change( + t->db, new_schema_json, txn_json); + json_destroy(txn_json); + t->reply = jsonrpc_create_reply(json_object_create(), + t->request->id); + } else { + OVS_NOT_REACHED(); } - /* Validate parameters. 
*/ - const struct json *params = t->request->params; - if (params->type != JSON_ARRAY || params->u.array.n != 2) { - trigger_error(t, ovsdb_syntax_error(params, NULL, - "array expected")); + /* If the transaction committed synchronously, complete it and + * transition to "complete". This is more than an optimization because + * the file-based storage isn't implemented to read back the + * transactions that we write (which is an ugly broken abstraction but + * it's what we have). */ + if (ovsdb_txn_progress_is_complete(t->progress) + && !ovsdb_txn_progress_get_error(t->progress)) { + if (txn) { + ovsdb_txn_complete(txn); + } + ovsdb_txn_progress_destroy(t->progress); + t->progress = NULL; + ovsdb_trigger_complete(t); + if (newdb) { + ovsdb_replace(t->db, newdb); + return true; + } return false; } + ovsdb_destroy(newdb); - /* Parse new schema and make a converted copy. */ - const struct json *new_schema_json = params->u.array.elems[1]; - struct ovsdb_schema *new_schema; - struct ovsdb_error *error = ovsdb_schema_from_json(new_schema_json, - &new_schema); - if (!error && strcmp(new_schema->name, t->db->schema->name)) { - error = ovsdb_error( - "invalid parameters", - "new schema name (%s) does not match database name (%s)", - new_schema->name, t->db->schema->name); + /* Fall through to the general handling for the "committing" state. We + * abort the transaction--if and when it eventually commits, we'll read + * it back from storage and replay it locally. */ + if (txn) { + ovsdb_txn_abort(txn); } - if (!error) { - error = ovsdb_file_convert(t->db->file, new_schema); + } + + /* Handle "committing" state. */ + if (t->progress) { + if (!ovsdb_txn_progress_is_complete(t->progress)) { + return false; } - ovsdb_schema_destroy(new_schema); + + /* Transition to "complete". 
*/ + struct ovsdb_error *error + = ovsdb_error_clone(ovsdb_txn_progress_get_error(t->progress)); + ovsdb_txn_progress_destroy(t->progress); + t->progress = NULL; + if (error) { - trigger_error(t, error); - return false; + if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) { + /* Temporary error. Transition back to "initialized" state to + * try again. */ + jsonrpc_msg_destroy(t->reply); + t->reply = NULL; + t->db->run_triggers = true; /* XXX? */ + ovsdb_error_destroy(error); + } else { + /* Permanent error. Transition to "completed" state to report + * it. */ + if (!strcmp(t->request->method, "transact")) { + json_array_add(t->reply->result, + ovsdb_error_to_json_free(error)); + ovsdb_trigger_complete(t); + } else if (!strcmp(t->request->method, "convert")) { + jsonrpc_msg_destroy(t->reply); + t->reply = NULL; + trigger_convert_error(t, error); + } + } + } else { + /* Success. */ + ovsdb_trigger_complete(t); } - trigger_success(t, json_object_create()); - return true; - } else { - OVS_NOT_REACHED(); + return false; } + + OVS_NOT_REACHED(); } static void -ovsdb_trigger_complete(struct ovsdb_trigger *t, struct jsonrpc_msg *reply) +ovsdb_trigger_complete(struct ovsdb_trigger *t) { - ovs_assert(reply && !t->reply); - t->reply = reply; + ovs_assert(t->reply); ovs_list_remove(&t->node); ovs_list_push_back(&t->session->completions, &t->node); } +/* Makes a "convert" request into an error. + * + * This is not suitable for "transact" requests because their replies should + * never be bare ovsdb_errors: RFC 7047 says that their replies must either be + * a JSON-RPC reply that contains an array of operation replies (which can be + * errors), or a JSON-RPC error whose "error" member is simply "canceled". 
*/ static void -trigger_error(struct ovsdb_trigger *t, struct ovsdb_error *error) +trigger_convert_error(struct ovsdb_trigger *t, struct ovsdb_error *error) { - struct jsonrpc_msg *reply = jsonrpc_create_error( + ovs_assert(!strcmp(t->request->method, "convert")); + ovs_assert(error && !t->reply); + t->reply = jsonrpc_create_error( ovsdb_error_to_json_free(error), t->request->id); - ovsdb_trigger_complete(t, reply); + ovsdb_trigger_complete(t); } static void trigger_success(struct ovsdb_trigger *t, struct json *result) { - struct jsonrpc_msg *reply = jsonrpc_create_reply(result, t->request->id); - ovsdb_trigger_complete(t, reply); + ovs_assert(result && !t->reply); + t->reply = jsonrpc_create_reply(result, t->request->id); + ovsdb_trigger_complete(t); } diff --git a/ovsdb/trigger.h b/ovsdb/trigger.h index d9df97f31222..74636baba8b2 100644 --- a/ovsdb/trigger.h +++ b/ovsdb/trigger.h @@ -20,13 +20,35 @@ struct ovsdb; +/* Triggers have the following states: + * + * - Initialized (reply == NULL, progress == NULL): Executing the trigger + * can keep it in the initialized state, if it has a "wait" condition that + * isn't met. Executing the trigger can also yield an error, in which + * case it transitions to "complete". Otherwise, execution yields a + * transaction, which the database attempts to commit. If the transaction + * completes immediately and synchronously, then the trigger transitions + * to the "complete" state. If the transaction requires some time to + * complete, it transitions to the "committing" state. + * + * - Committing (reply != NULL, progress != NULL): The transaction is + * committing. If it succeeds, or if it fails permanently, then the + * trigger transitions to "complete". If it fails temporarily + * (e.g. because someone else committed to cluster-based storage before we + * did), then we transition back to "initialized" to try again. + * + * - Complete (reply != NULL, progress == NULL): The transaction is done + * and either succeeded or failed. 
+ */ struct ovsdb_trigger { + /* In "initialized" or "committing" state, in db->triggers. + * In "complete", in session->completions. */ + struct ovs_list node; struct ovsdb_session *session; /* Session that owns this trigger. */ struct ovsdb *db; /* Database on which trigger acts. */ - struct ovs_list node; /* !result: in db->triggers; - * result: in session->completions. */ struct jsonrpc_msg *request; /* Database request. */ struct jsonrpc_msg *reply; /* Result (null if none yet).. */ + struct ovsdb_txn_progress *progress; long long int created; /* Time created. */ long long int timeout_msec; /* Max wait duration. */ bool read_only; /* Database is in read only mode. */ @@ -42,6 +64,7 @@ void ovsdb_trigger_destroy(struct ovsdb_trigger *); bool ovsdb_trigger_is_complete(const struct ovsdb_trigger *); struct jsonrpc_msg *ovsdb_trigger_steal_reply(struct ovsdb_trigger *); +void ovsdb_trigger_cancel(struct ovsdb_trigger *, const char *reason); void ovsdb_trigger_prereplace_db(struct ovsdb_trigger *); diff --git a/tests/.gitignore b/tests/.gitignore index 294e6fb6dafa..3e2ddf2e9e5d 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -34,6 +34,7 @@ /test-ofpbuf /test-ovsdb /test-packets +/test-raft /test-random /test-reconnect /test-rstp diff --git a/tests/automake.mk b/tests/automake.mk index 8157641d94f9..26269a6b1928 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -86,6 +86,7 @@ TESTSUITE_AT = \ tests/ovsdb-idl.at \ tests/ovsdb-lock.at \ tests/ovsdb-rbac.at \ + tests/ovsdb-cluster.at \ tests/ovs-vsctl.at \ tests/ovs-xapi-sync.at \ tests/stp.at \ diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at index 82df193871b3..67a879b6745b 100644 --- a/tests/ovs-macros.at +++ b/tests/ovs-macros.at @@ -9,6 +9,16 @@ m4_rename([AT_SETUP], [OVS_AT_SETUP]) m4_define([AT_SETUP], [OVS_AT_SETUP($@) ovs_init ]) + +m4_define([OVS_DEFINE_SHELL_HELPERS], + [m4_ifdef([AT_ingroup], [m4_fatal([$0: AT_SETUP and OVS_DEFINE_SHELL_HELPERS may not nest])]) + 
m4_define([AT_ingroup]) + m4_divert_push([PREPARE_TESTS]) + $1 + m4_divert_pop([PREPARE_TESTS]) + m4_undefine([AT_ingroup])]) + + m4_divert_push([PREPARE_TESTS]) [ # Set ovs_base to the base directory in which the test is running and @@ -204,6 +214,25 @@ wc () { uuidfilt () { $PYTHON "$top_srcdir"/tests/uuidfilt.py "$@" } + +# run_as PROGRAM_NAME COMMAND [ARG...] +# +# Runs a command with argv[0] set to PROGRAM_NAME, if possible, in a +# subshell. Most utilities print argv[0] as part of their messages, +# so this makes it easier to figure out which particular utility +# prints a message if a bunch of identical processes are running. +# +# Not all shells support "exec -a NAME", so test for it. +if (exec -a myname true); then + run_as () { + (exec -a "$@") + } +else + run_as () { + shift + (exec "$@") + } +fi ] m4_divert_pop([PREPARE_TESTS]) diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at new file mode 100644 index 000000000000..3366536043e6 --- /dev/null +++ b/tests/ovsdb-cluster.at @@ -0,0 +1,281 @@ +OVS_DEFINE_SHELL_HELPERS([ +# ovsdb_check_cluster N_SERVERS SCHEMA_FUNC OUTPUT TRANSACTION... 
+ovsdb_check_cluster () { + local n=$1 schema_func=$2 output=$3 + shift; shift; shift + + $schema_func > schema + schema=`ovsdb-tool schema-name schema` + AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db schema unix:s1.raft], [0], [], [stderr]) + AT_CHECK([grep -v 'from ephemeral to persistent' stderr], [1]) + cid=`ovsdb-tool db-cid s1.db` + for i in `seq 2 $n`; do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema unix:s$i.raft unix:s1.raft]) + done + + on_exit 'kill `cat *.pid`' + for i in `seq $n`; do + AT_CHECK([ovsdb-server -vraft -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) + done + for i in `seq $n`; do + AT_CHECK([ovsdb-client --timeout=30 wait unix:s$i.ovsdb $schema connected]) + done + + for txn + do + AT_CHECK([ovsdb-client --timeout=30 -vjsonrpc -vconsole:off -vsyslog:off -vvlog:off --log-file transact unix:s1.ovsdb,unix:s2.ovsdb,unix:s3.ovsdb "$txn"], [0], [stdout]) + cat stdout >> output + done + AT_CHECK_UNQUOTED([uuidfilt output], [0], [$output]) + for i in `seq $n`; do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) + done + + AT_CHECK([ovsdb-tool check-cluster s*.db]) +} +]) + +# Test a 1-server cluster. +AT_BANNER([OVSDB - clustered transactions (1 server)]) +m4_define([OVSDB_CHECK_EXECUTION], + [AT_SETUP([$1 - cluster of 1]) + AT_KEYWORDS([ovsdb server positive unix cluster cluster1 $5]) + ovsdb_check_cluster 1 "$2" '$4' m4_foreach([txn], [$3], ['txn' ]) + AT_CLEANUP]) +EXECUTION_EXAMPLES + +# Test a 3-server cluster. +AT_BANNER([OVSDB - clustered transactions (3 servers)]) +m4_define([OVSDB_CHECK_EXECUTION], + [AT_SETUP([$1 - cluster of 3]) + AT_KEYWORDS([ovsdb server positive unix cluster cluster3 $5]) + ovsdb_check_cluster 3 "$2" '$4' m4_foreach([txn], [$3], ['txn' ]) + AT_CLEANUP]) +EXECUTION_EXAMPLES + +# Test a 5-server cluster. 
+AT_BANNER([OVSDB - clustered transactions (5 servers)]) +m4_define([OVSDB_CHECK_EXECUTION], + [AT_SETUP([$1 - cluster of 5]) + AT_KEYWORDS([ovsdb server positive unix cluster cluster5 $5]) + ovsdb_check_cluster 5 "$2" '$4' m4_foreach([txn], [$3], ['txn' ]) + AT_CLEANUP]) +EXECUTION_EXAMPLES + +AT_BANNER([OVSDB - cluster tests]) + +# Torture test. +OVS_DEFINE_SHELL_HELPERS([ +ovsdb_torture_test () { + local n=$1 # Number of cluster members + local victim=$2 # Cluster member to kill or remove + local variant=$3 # 'kill' and restart or 'remove' and add + cp $top_srcdir/ovn/ovn-sb.ovsschema schema + schema=`ovsdb-tool schema-name schema` + AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db schema unix:s1.raft], [0], [], [dnl +ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral to persistent, including 'status' column in 'Connection' table, because clusters do not support ephemeral columns +]) + + join_cluster() { + local i=$1 + others= + for j in `seq 1 $n`; do + if test $i != $j; then + others="$others unix:s$j.raft" + fi + done + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema unix:s$i.raft $others]) + } + + start_server() { + local i=$1 + printf "\ns$i: starting\n" + AT_CHECK([ovsdb-server -vjsonrpc -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) + } + stop_server() { + local i=$1 + printf "\ns$i: stopping\n" + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) + } + connect_server() { + local i=$1 + printf "\ns$i: waiting to connect to storage\n" + AT_CHECK([ovsdb-client --timeout=30 -vfile -vsyslog:off -vvlog:off --log-file=connect$i.log wait unix:s$i.ovsdb $schema connected]) + } + remove_server() { + local i=$1 + printf "\ns$i: removing from cluster\n" + AT_CHECK([ovs-appctl --timeout=30 -t "`pwd`"/s$i cluster/leave OVN_Southbound]) + printf "\ns$i: waiting for removal to complete\n" + AT_CHECK([ovsdb-client --timeout=30 
-vfile -vsyslog:off -vvlog:off --log-file=remove$i.log wait unix:s$i.ovsdb $schema removed]) + stop_server $victim + } + add_server() { + local i=$1 + rm s$i.db + join_cluster $i + start_server $i + connect_server $i + } + + cid=`ovsdb-tool db-cid s1.db` + for i in `seq 2 $n`; do join_cluster $i; done + + on_exit 'kill `cat *.pid`' + for i in `seq $n`; do start_server $i; done + for i in `seq $n`; do connect_server $i; done + + OVN_SB_DB=unix:s1.ovsdb + for i in `seq 2 $n`; do + OVN_SB_DB=$OVN_SB_DB,unix:s$i.ovsdb + done + export OVN_SB_DB + + n1=10 n2=5 + echo "starting $n1*$n2 ovn-sbctl processes..." + for i in $(seq 0 $(expr $n1 - 1) ); do + (for j in $(seq $n2); do + : > $i-$j.running + run_as "ovn-sbctl($i-$j)" ovn-sbctl "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 --no-leader-only add SB_Global . external_ids $i-$j=$i-$j + status=$? + if test $status != 0; then + echo "$i-$j exited with status $status" > $i-$j:$status + fi + rm $i-$j.running + done + : > $i.done)& + done + echo "...done" + sleep 2 + + echo "waiting for ovn-sbctl processes to exit..." 
+ i=0 + phase=0 + while :; do + printf "t=%2d s:" $i + done=0 + for j in $(seq 0 $(expr $n1 - 1)); do + if test -f $j.done; then + printf " $j" + done=$(expr $done + 1) + fi + done + printf '\n' + if test $done = $n1; then + break + fi + + case $phase in # ( + 0) + if test $done -ge $(expr $n1 / 4); then + if test $variant = kill; then + stop_server $victim + else + remove_server $victim + fi + phase=1 + next=$(expr $i + 2) + fi + ;; # ( + 1) + if test $i -ge $next; then + if test $variant = kill; then + start_server $victim + connect_server $victim + else + add_server $victim + fi + phase=2 + fi + ;; + esac + + sleep 1 + i=$(expr $i + 1) + done + echo "...done" + AT_CHECK([if test $phase != 2; then exit 77; fi]) + + for i in `seq 0 9`; do + for j in `seq 5`; do + echo "$i-$j=$i-$j" + done + done > expout + AT_CHECK([ovn-sbctl --timeout=30 --log-file=finalize.log -vtimeval:off -vfile -vsyslog:off --bare get SB_Global . external-ids | sed 's/, /\n/g; s/[[{}""]]//g;'], [0], [expout]) + + for i in `seq $n`; do + if test $i != $victim || test $phase != 1; then + stop_server $i + fi + done + + # We ignore stdout because non-fatal warnings get printed there. 
+ AT_CHECK([ovsdb-tool check-cluster s*.db], [0], [ignore]) +} +]) + +AT_SETUP([OVSDB 3-server torture test - kill/restart leader]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster3]) +ovsdb_torture_test 3 1 kill +AT_CLEANUP +AT_SETUP([OVSDB 3-server torture test - kill/restart follower 1]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster3]) +ovsdb_torture_test 3 2 kill +AT_CLEANUP +AT_SETUP([OVSDB 3-server torture test - kill/restart follower 2]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster3]) +ovsdb_torture_test 3 3 kill +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - kill/restart leader]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 1 kill +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - kill/restart follower 1]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 2 kill +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - kill/restart follower 2]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 3 kill +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - kill/restart follower 3]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 4 kill +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - kill/restart follower 4]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 5 kill +AT_CLEANUP + +AT_SETUP([OVSDB 3-server torture test - remove/re-add leader]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster3]) +ovsdb_torture_test 3 1 remove +AT_CLEANUP +AT_SETUP([OVSDB 3-server torture test - remove/re-add follower 1]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster3]) +ovsdb_torture_test 3 2 remove +AT_CLEANUP +AT_SETUP([OVSDB 3-server torture test - remove/re-add follower 2]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster3]) +ovsdb_torture_test 3 3 remove +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - remove/re-add leader]) +AT_KEYWORDS([ovsdb 
server positive unix cluster cluster5]) +ovsdb_torture_test 5 1 remove +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - remove/re-add follower 1]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 2 remove +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - remove/re-add follower 2]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 3 remove +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - remove/re-add follower 3]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 4 remove +AT_CLEANUP +AT_SETUP([OVSDB 5-server torture test - remove/re-add follower 4]) +AT_KEYWORDS([ovsdb server positive unix cluster cluster5]) +ovsdb_torture_test 5 5 remove +AT_CLEANUP diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 59b2c1991bde..64559026929c 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -792,7 +792,7 @@ test-ovsdb|ovsdb_idl|link1 table in idltest database lacks l2 column (database n # Check that ovsdb-idl sent on "monitor" request and that it didn't # mention that table or column, and (for paranoia) that it did mention another # table and column. -AT_CHECK([grep -c '"monitor\|monitor_cond"' stderr], [0], [1 +AT_CHECK([grep -c '"monitor\|monitor_cond"' stderr], [0], [2 ]) AT_CHECK([grep '"monitor\|monitor_cond"' stderr | grep link2], [1]) AT_CHECK([grep '"monitor\|monitor_cond"' stderr | grep l2], [1]) diff --git a/tests/ovsdb-monitor.at b/tests/ovsdb-monitor.at index 917a5cc09ace..a3ef485c6026 100644 --- a/tests/ovsdb-monitor.at +++ b/tests/ovsdb-monitor.at @@ -1,5 +1,39 @@ AT_BANNER([OVSDB -- ovsdb-server monitors]) +OVS_DEFINE_SHELL_HELPERS([ +# ovsdb_check_monitor SCHEMA_FUNC DB TABLE OUTPUT COLUMNS +# PRE-MONITOR-TXN... -- TRANSACTION... 
+ovsdb_check_monitor () { + local schema_func=$1 db=$2 table=$3 output=$4 columns=$5 + shift; shift; shift; shift; shift + $schema_func > schema + AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) + while test "$1" != "--"; do + AT_CHECK([ovsdb-tool transact db "$1"], [0], [ignore], [ignore]) + shift + done + shift + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db > ovsdb-server.stdout 2> ovsdb-server.stderr], + [0], [], []) + on_exit 'kill `cat ovsdb-server.pid`' + if test "$IS_WIN32" = "yes"; then + AT_CHECK([ovsdb-client -vjsonrpc --pidfile --log-file -d json monitor --format=csv unix:socket $db $table $columns > output 2> ovsdb-client.stderr &], + [0], [ignore], [ignore]) + sleep 1 + else + AT_CHECK([ovsdb-client -vjsonrpc --detach --pidfile --log-file -d json monitor --format=csv unix:socket $db $table $columns > output 2> ovsdb-client.stderr], + [0], [ignore], [ignore]) + fi + on_exit 'kill `cat ovsdb-client.pid`' + for txn in ${1+"$@"} '[["'$db'"]]'; do + AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) + done + OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid]) + OVS_WAIT_UNTIL([test ! -e ovsdb-client.pid]) + AT_CHECK_UNQUOTED([$PYTHON $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [$output], [ignore]) +} +]) + # OVSDB_CHECK_MONITOR(TITLE, SCHEMA, [PRE-MONITOR-TXN], DB, TABLE, # TRANSACTIONS, OUTPUT, [COLUMNS], [KEYWORDS]) # @@ -16,35 +50,17 @@ AT_BANNER([OVSDB -- ovsdb-server monitors]) # same marker. # # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
-m4_define([OVSDB_CHECK_MONITOR], +m4_define([OVSDB_CHECK_MONITOR], [AT_SETUP([$1]) AT_KEYWORDS([ovsdb server monitor positive $9]) - $2 > schema - AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) - m4_foreach([txn], [$3], - [AT_CHECK([ovsdb-tool transact db 'txn'], [0], [ignore], [ignore])]) - AT_CAPTURE_FILE([ovsdb-server-log]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1], - [0], [], []) - on_exit 'kill `cat ovsdb-server.pid`' - AT_CAPTURE_FILE([ovsdb-client-log]) - if test "$IS_WIN32" = "yes"; then - AT_CHECK([ovsdb-client -vjsonrpc --detach --pidfile --log-file="`pwd`"/ovsdb-client-log -d json monitor --format=csv unix:socket $4 $5 $8 > output], - [0], [ignore], [ignore]) - sleep 1 - else - AT_CHECK([ovsdb-client -vjsonrpc --detach --no-chdir --pidfile --log-file="`pwd`"/ovsdb-client-log -d json monitor --format=csv unix:socket $4 $5 $8 > output 2>/dev/null], - [0], [ignore], [ignore]) - fi - on_exit 'kill `cat ovsdb-client.pid`' - m4_foreach([txn], [$6], - [AT_CHECK([ovsdb-client transact unix:socket 'txn'], [0], - [ignore], [ignore])]) - AT_CHECK([ovsdb-client transact unix:socket '[["$4"]]'], [0], - [ignore], [ignore]) - OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid]) - OVS_WAIT_UNTIL([test ! 
-e ovsdb-client.pid]) - AT_CHECK([$PYTHON $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [$7], [ignore]) + AT_CAPTURE_FILE([ovsdb-server.log]) + AT_CAPTURE_FILE([ovsdb-server.stdout]) + AT_CAPTURE_FILE([ovsdb-server.stderr]) + AT_CAPTURE_FILE([ovsdb-client.log]) + AT_CAPTURE_FILE([ovsdb-client.stderr]) + ovsdb_check_monitor '$2' '$4' '$5' '$7' '$8' \ + m4_foreach([txn], [$3], ['txn' ]) -- \ + m4_foreach([txn], [$6], ['txn' ]) AT_CLEANUP]) # OVSDB_CHECK_MONITOR_COND(TITLE, SCHEMA, [PRE-MONITOR-TXN], DB, TABLE, @@ -69,19 +85,22 @@ m4_define([OVSDB_CHECK_MONITOR_COND], AT_KEYWORDS([ovsdb server monitor monitor-cond positive $10]) $2 > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) - m4_foreach([txn], [$3], - [AT_CHECK([ovsdb-tool transact db 'txn'], [0], [ignore], [ignore])]) + for txn in m4_foreach([txn], [$3], ['txn' ]); do + AT_CHECK([ovsdb-tool transact db "$txn"], [0], [ignore], [ignore]) + done AT_CAPTURE_FILE([ovsdb-server-log]) AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) on_exit 'kill `cat ovsdb-server.pid`' AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket $4 '[$8]' $5 $9 > output], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' - m4_foreach([txn], [$6], - [AT_CHECK([ovsdb-client transact unix:socket 'txn'], [0], - [ignore], [ignore], [kill `cat server-pid client-pid`])]) - m4_foreach([cond], [$10], - [AT_CHECK([ovs-appctl -t ovsdb-client ovsdb-client/cond_change $5 'cond'], [0], [ignore], [ignore])]) + for txn in m4_foreach([txn], [$6], ['txn' ]); do + AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], + [ignore], [ignore], [kill `cat server-pid client-pid`]) + done + for cond in m4_foreach([cond], [$10], ['cond' ]); do + AT_CHECK([ovs-appctl -t ovsdb-client ovsdb-client/cond_change $5 "$cond"], [0], [ignore], [ignore]) + done AT_CHECK([ovsdb-client 
transact unix:socket '[["$4"]]'], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 54ff04ef3146..0fcb4013edec 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -215,7 +215,7 @@ ovs-appctl: ovsdb-server: server returned an error ]) else AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-db db2], 2, [], - [db2: already open + [ovsdb error: db2: already open ovs-appctl: ovsdb-server: server returned an error ]) fi @@ -289,7 +289,7 @@ AT_SKIP_IF([test "$IS_WIN32" = "yes"]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) # Add the second database. constraint_schema > schema2 @@ -322,7 +322,7 @@ AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) constraint_schema > schema2 AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2]) # Remove the second database. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db constraints]) @@ -466,7 +466,7 @@ AT_SKIP_IF([test "$IS_WIN32" = "yes"]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) # Add a remote. AT_CHECK([test ! 
-e socket1]) @@ -497,7 +497,7 @@ AT_SKIP_IF([test "$IS_WIN32" = "yes"]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -v -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) # Add a remote. AT_CHECK([test ! -e socket1]) @@ -644,53 +644,64 @@ AT_CHECK_UNQUOTED( [ignore], [test ! -e pid || kill `cat pid`]) OVSDB_SERVER_SHUTDOWN AT_CLEANUP - -AT_SETUP([compacting online]) -AT_KEYWORDS([ovsdb server compact]) -ordinal_schema > schema -dnl Make sure that "ovsdb-tool create" works with a dangling symlink for -dnl the database and the lockfile, creating the target of each symlink rather -dnl than replacing the symlinks with regular files. -mkdir dir -if test "$IS_WIN32" = "no"; then - ln -s dir/db db - ln -s dir/.db.~lock~ .db.~lock~ - AT_SKIP_IF([test ! -h db || test ! -h .db.~lock~]) -fi -AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) -dnl Start ovsdb-server. -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server.log db], [0], [ignore], [ignore]) -AT_CAPTURE_FILE([ovsdb-server.log]) -dnl Do a bunch of random transactions that put crap in the database log. 
-AT_CHECK( - [[for pair in 'zero 0' 'one 1' 'two 2' 'three 3' 'four 4' 'five 5'; do - set -- $pair - ovsdb-client transact unix:socket ' - ["ordinals", - {"op": "insert", - "table": "ordinals", - "row": {"name": "'$1'", "number": '$2'}}, - {"op": "comment", - "comment": "add row for '"$pair"'"}]' - ovsdb-client transact unix:socket ' - ["ordinals", - {"op": "delete", - "table": "ordinals", - "where": [["number", "==", '$2']]}, - {"op": "comment", - "comment": "delete row for '"$2"'"}]' - ovsdb-client transact unix:socket ' - ["ordinals", - {"op": "insert", - "table": "ordinals", - "row": {"name": "'$1'", "number": '$2'}}, - {"op": "comment", - "comment": "add back row for '"$pair"'"}]' - done]], - [0], [stdout], [ignore], [test ! -e pid || kill `cat pid`]) -dnl Check that all the crap is in fact in the database log. -AT_CHECK([[uuidfilt db | grep -v ^OVSDB | sed 's/"_date":[0-9]*/"_date":0/' | ovstest test-json --multiple -]], [0], - [[{"cksum":"12345678 9","name":"ordinals","tables":{"ordinals":{"columns":{"name":{"type":"string"},"number":{"type":"integer"}},"indexes":[["number"]]}},"version":"5.1.3"} + +OVS_DEFINE_SHELL_HELPERS([ +# ovsdb_check_online_compaction MODEL +# +# where MODEL is "standalone" or "cluster" +ovsdb_check_online_compaction() { + local model=$1 + + ordinal_schema > schema + dnl Make sure that "ovsdb-tool create" works with a dangling symlink for + dnl the database and the lockfile, creating the target of each symlink rather + dnl than replacing the symlinks with regular files. + mkdir dir + if test "$IS_WIN32" = "no"; then + ln -s dir/db db + ln -s dir/.db.~lock~ .db.~lock~ + AT_SKIP_IF([test ! -h db || test ! -h .db.~lock~]) + fi + AT_CHECK([if test $model = standalone; then + ovsdb-tool create db schema + else + ovsdb-tool create-cluster db schema unix:s1.raft + fi]) + dnl Start ovsdb-server. 
+ AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0]) + AT_CHECK([ovsdb-client wait unix:socket ordinals connected]) + AT_CAPTURE_FILE([ovsdb-server.log]) + dnl Do a bunch of random transactions that put crap in the database log. + AT_CHECK( + [[for pair in 'zero 0' 'one 1' 'two 2' 'three 3' 'four 4' 'five 5'; do + set -- $pair + ovsdb-client transact unix:socket ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"name": "'$1'", "number": '$2'}}, + {"op": "comment", + "comment": "add row for '"$pair"'"}]' + ovsdb-client transact unix:socket ' + ["ordinals", + {"op": "delete", + "table": "ordinals", + "where": [["number", "==", '$2']]}, + {"op": "comment", + "comment": "delete row for '"$2"'"}]' + ovsdb-client transact unix:socket ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"name": "'$1'", "number": '$2'}}, + {"op": "comment", + "comment": "add back row for '"$pair"'"}]' + done]], + [0], [stdout]) + if test $model = standalone; then + dnl Check that all the crap is in fact in the database log. 
+ AT_CHECK([[uuidfilt db | grep -v ^OVSDB | sed 's/"_date":[0-9]*/"_date":0/' | ovstest test-json --multiple -]], [0], +[[{"cksum":"12345678 9","name":"ordinals","tables":{"ordinals":{"columns":{"name":{"type":"string"},"number":{"type":"integer"}},"indexes":[["number"]]}},"version":"5.1.3"} {"_comment":"add row for zero 0","_date":0,"ordinals":{"<0>":{"name":"zero"}}} {"_comment":"delete row for 0","_date":0,"ordinals":{"<0>":null}} {"_comment":"add back row for zero 0","_date":0,"ordinals":{"<1>":{"name":"zero"}}} @@ -709,11 +720,14 @@ AT_CHECK([[uuidfilt db | grep -v ^OVSDB | sed 's/"_date":[0-9]*/"_date":0/' | ov {"_comment":"add row for five 5","_date":0,"ordinals":{"<10>":{"name":"five","number":5}}} {"_comment":"delete row for 5","_date":0,"ordinals":{"<10>":null}} {"_comment":"add back row for five 5","_date":0,"ordinals":{"<11>":{"name":"five","number":5}}} -]], [], [test ! -e pid || kill `cat pid`]) -dnl Dump out and check the actual database contents. -AT_CHECK([[ovsdb-client dump unix:socket ordinals]], - [0], [stdout], [ignore]) -AT_CHECK([uuidfilt stdout], [0], [dnl +]]) + else + dnl Check that the database log at least contains a lot of transactions. + AT_CHECK([test `wc -l < db` -gt 50]) + fi + dnl Dump out and check the actual database contents. + AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout]) + AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table _uuid name number ------------------------------------ ----- ------ @@ -723,33 +737,38 @@ _uuid name number <3> three 3 <4> two 2 <5> zero 0 -], [], [test ! -e pid || kill `cat pid`]) -dnl Now compact the database in-place. -AT_CHECK([[ovs-appctl -t ovsdb-server ovsdb-server/compact]], - [0], [], [ignore], [test ! -e pid || kill `cat pid`]) -dnl Negative test. -AT_CHECK([[ovs-appctl -t ovsdb-server ovsdb-server/compact _Server]], - [2], [], [cannot compact built-in databases +]) + cp db db.pre-compaction + dnl Now compact the database in-place. 
+ AT_CHECK([[ovs-appctl -t ovsdb-server ovsdb-server/compact]], + [0], [], [ignore]) + dnl Negative test. + AT_CHECK([[ovs-appctl -t ovsdb-server ovsdb-server/compact _Server]], + [2], [], [cannot compact built-in databases ovs-appctl: ovsdb-server: server returned an error ]) -dnl Make sure that "db" is still a symlink to dir/db instead of getting -dnl replaced by a regular file, ditto for .db.~lock~. -if test "$IS_WIN32" = "no"; then - AT_CHECK([test -h db]) - AT_CHECK([test -h .db.~lock~]) - AT_CHECK([test -f dir/db]) - AT_CHECK([test -f dir/.db.~lock~]) -fi -dnl We can't fully re-check the contents of the database log, because the -dnl order of the records is not predictable, but there should only be 4 lines -dnl in it now. -AT_CAPTURE_FILE([db]) -AT_CHECK([test `wc -l < db` -eq 4], [0], [], [], - [test ! -e pid || kill `cat pid`]) -dnl And check that the dumped data is the same too: -AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore], - [test ! -e pid || kill `cat pid`]) -AT_CHECK([uuidfilt stdout], [0], [dnl + dnl Make sure that "db" is still a symlink to dir/db instead of getting + dnl replaced by a regular file, ditto for .db.~lock~. 
+ if test "$IS_WIN32" = "no"; then + AT_CHECK([test -h db]) + AT_CHECK([test -h .db.~lock~]) + AT_CHECK([test -f dir/db]) + AT_CHECK([test -f dir/.db.~lock~]) + fi + + # We can't fully re-check the contents of the database log, because the + # order of the records is not predictable, but in the standalone case there + # should be only 4 lines in it now. + AT_CAPTURE_FILE([db]) + compacted_lines=`wc -l < db` + echo compacted_lines=$compacted_lines + if test $model = standalone; then + AT_CHECK([test $compacted_lines -eq 4]) + fi + + dnl And check that the dumped data is the same too: + AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout]) + AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table _uuid name number ------------------------------------ ----- ------ @@ -759,27 +778,34 @@ _uuid name number <3> three 3 <4> two 2 <5> zero 0 -], [], [test ! -e pid || kill `cat pid`]) -dnl Now do some more transactions. -AT_CHECK( - [[ovsdb-client transact unix:socket ' - ["ordinals", - {"op": "delete", - "table": "ordinals", - "where": [["number", "<", 3]]}]']], - [0], [[[{"count":3}] -]], [ignore], [test ! -e pid || kill `cat pid`]) -dnl There should be 6 lines in the log now. -AT_CHECK([test `wc -l < db` -eq 6], [0], [], [], - [test ! -e pid || kill `cat pid`]) -dnl Then check that the dumped data is correct. This time first kill -dnl and restart the database server to ensure that the data is correct on -dnl disk as well as in memory. -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server.log db], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore], - [test ! -e pid || kill `cat pid`]) -AT_CHECK([uuidfilt stdout], [0], [dnl +]) + dnl Now do some more transactions. 
+ AT_CHECK( + [[ovsdb-client transact unix:socket ' + ["ordinals", + {"op": "delete", + "table": "ordinals", + "where": [["number", "<", 3]]}]']], + [0], [[[{"count":3}] +]], [ignore]) + + dnl There should be 6 lines in the log now, for the standalone case, + dnl and for the clustered case the file should at least have grown. + updated_lines=`wc -l < db` + echo compacted_lines=$compacted_lines updated_lines=$updated_lines + if test $model = standalone; then + AT_CHECK([test $updated_lines -eq 6]) + else + AT_CHECK([test $updated_lines -gt $compacted_lines]) + fi + + dnl Then check that the dumped data is correct. This time first kill + dnl and restart the database server to ensure that the data is correct on + dnl disk as well as in memory. + OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db]) + AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout]) + AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table _uuid name number ------------------------------------ ----- ------ @@ -787,43 +813,67 @@ _uuid name number <1> four 4 <2> three 3 ], [], [test ! 
-e pid || kill `cat pid`]) -OVSDB_SERVER_SHUTDOWN + OVSDB_SERVER_SHUTDOWN +} +]) + +AT_SETUP([compacting online - standalone]) +AT_KEYWORDS([ovsdb server compact]) +ovsdb_check_online_compaction standalone AT_CLEANUP -AT_SETUP([schema conversion online]) -AT_KEYWORDS([ovsdb server convert needs-conversion]) -on_exit 'kill `cat *.pid`' -ordinal_schema > schema -AT_DATA([new-schema], - [[{"name": "ordinals", +AT_SETUP([compacting online - cluster]) +AT_KEYWORDS([ovsdb server compact]) +ovsdb_check_online_compaction cluster +AT_CLEANUP + +OVS_DEFINE_SHELL_HELPERS([ +# ovsdb_check_online_conversion MODEL +# +# where MODEL is "standalone" or "cluster" +ovsdb_check_online_conversion() { + local model=$1 + on_exit 'kill `cat *.pid`' + ordinal_schema > schema + AT_DATA([new-schema], + [[{"name": "ordinals", "tables": { "ordinals": { - "columns": { - "number": {"type": "integer"}}}}} + "columns": { + "number": {"type": "integer"}}}}} ]]) -dnl Make sure that "ovsdb-tool create" works with a dangling symlink for -dnl the database and the lockfile, creating the target of each symlink rather -dnl than replacing the symlinks with regular files. -mkdir dir -if test "$IS_WIN32" = "no"; then - ln -s dir/db db - ln -s dir/.db.~lock~ .db.~lock~ - AT_SKIP_IF([test ! -h db || test ! -h .db.~lock~]) -fi -AT_CHECK([ovsdb-tool create db schema]) -dnl Put some data in the database. -AT_CHECK( - [[for pair in 'zero 0' 'one 1' 'two 2' 'three 3' 'four 4' 'five 5'; do - set -- $pair - ovsdb-tool transact db ' - ["ordinals", - {"op": "insert", - "table": "ordinals", - "row": {"name": "'$1'", "number": '$2'}}, - {"op": "comment", - "comment": "add row for '"$pair"'"}]' - done | uuidfilt]], [0], -[[[{"uuid":["uuid","<0>"]},{}] + dnl Make sure that "ovsdb-tool create" works with a dangling symlink for + dnl the database and the lockfile, creating the target of each symlink + dnl rather than replacing the symlinks with regular files. 
+ mkdir dir + if test "$IS_WIN32" = "no"; then + ln -s dir/db db + ln -s dir/.db.~lock~ .db.~lock~ + AT_SKIP_IF([test ! -h db || test ! -h .db.~lock~]) + fi + AT_CHECK([if test $model = standalone; then + ovsdb-tool create db schema + else + ovsdb-tool create-cluster db schema unix:s1.raft + fi]) + + dnl Start the database server. + AT_CHECK([ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0]) + AT_CAPTURE_FILE([ovsdb-server.log]) + + dnl Put some data in the database. + AT_CHECK( + [[for pair in 'zero 0' 'one 1' 'two 2' 'three 3' 'four 4' 'five 5'; do + set -- $pair + ovsdb-client transact ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"name": "'$1'", "number": '$2'}}, + {"op": "comment", + "comment": "add row for '"$pair"'"}]' + done | uuidfilt]], [0], + [[[{"uuid":["uuid","<0>"]},{}] [{"uuid":["uuid","<1>"]},{}] [{"uuid":["uuid","<2>"]},{}] [{"uuid":["uuid","<3>"]},{}] @@ -831,81 +881,77 @@ AT_CHECK( [{"uuid":["uuid","<5>"]},{}] ]], [ignore]) -dnl Start the database server. -AT_CHECK([ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0]) -AT_CAPTURE_FILE([ovsdb-server.log]) - -dnl Try "needs-conversion". -AT_CHECK([ovsdb-client needs-conversion schema], [0], [no + dnl Try "needs-conversion". + AT_CHECK([ovsdb-client needs-conversion schema], [0], [no ]) -AT_CHECK([ovsdb-client needs-conversion new-schema], [0], [yes + AT_CHECK([ovsdb-client needs-conversion new-schema], [0], [yes ]) -dnl Start two monitors on the 'ordinals' db, one that is database -dnl change aware and one that is not. 
-AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-ordinals-aware.pid --log-file=monitor-ordinals-aware.log --db-change-aware --no-headings monitor ordinals ordinals number name > monitor-ordinals-aware.stdout 2> monitor-ordinals-aware.stderr]) -AT_CAPTURE_FILE([monitor-ordinals-aware.stdout]) -AT_CAPTURE_FILE([monitor-ordinals-aware.log]) -AT_CAPTURE_FILE([monitor-ordinals-aware.stderr]) - -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-ordinals-unaware.pid --log-file=monitor-ordinals-unaware.log --no-db-change-aware --no-headings monitor ordinals ordinals number name > monitor-ordinals-unaware.stdout 2> monitor-ordinals-unaware.stderr]) -AT_CAPTURE_FILE([monitor-ordinals-unaware.stdout]) -AT_CAPTURE_FILE([monitor-ordinals-unaware.log]) -AT_CAPTURE_FILE([monitor-ordinals-unaware.stderr]) - -dnl Start two monitors on the '_Server' db, one that is database -dnl change aware and one that is not. -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-server-aware.pid --log-file=monitor-server-aware.log --db-change-aware --no-headings monitor _Server Database name > monitor-server-aware.stdout 2> monitor-server-aware.stderr]) -AT_CAPTURE_FILE([monitor-server-aware.stdout]) -AT_CAPTURE_FILE([monitor-server-aware.log]) -AT_CAPTURE_FILE([monitor-server-aware.stderr]) - -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-server-unaware.pid --log-file=monitor-server-unaware.log --no-db-change-aware --no-headings monitor _Server Database name > monitor-server-unaware.stdout 2> monitor-server-unaware.stderr]) -AT_CAPTURE_FILE([monitor-server-unaware.stdout]) -AT_CAPTURE_FILE([monitor-server-unaware.log]) -AT_CAPTURE_FILE([monitor-server-unaware.stderr]) - -dnl Start two long-running transactions (triggers) on the 'ordinals' db, -dnl one that is database change aware and one that is not. 
-ordinals_txn='[["ordinals", - {"op": "wait", - "table": "ordinals", - "where": [["name", "==", "seven"]], - "columns": ["name", "number"], - "rows": [], - "until": "!="}]]' -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-ordinals-aware.pid --log-file=trigger-ordinals-aware.log --db-change-aware transact "$ordinals_txn" > trigger-ordinals-aware.stdout 2> trigger-ordinals-aware.stderr]) -AT_CAPTURE_FILE([trigger-ordinals-aware.stdout]) -AT_CAPTURE_FILE([trigger-ordinals-aware.log]) -AT_CAPTURE_FILE([trigger-ordinals-aware.stderr]) - -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-ordinals-unaware.pid --log-file=trigger-ordinals-unaware.log --no-db-change-aware transact "$ordinals_txn" > trigger-ordinals-unaware.stdout 2> trigger-ordinals-unaware.stderr]) -AT_CAPTURE_FILE([trigger-ordinals-unaware.stdout]) -AT_CAPTURE_FILE([trigger-ordinals-unaware.log]) -AT_CAPTURE_FILE([trigger-ordinals-unaware.stderr]) - -dnl Start two long-running transactions (triggers) on the _Server db, -dnl one that is database change aware and one that is not. 
-server_txn='[["_Server", - {"op": "wait", - "table": "Database", - "where": [["name", "==", "xyzzy"]], - "columns": ["name"], - "rows": [], - "until": "!="}]]' -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-server-aware.pid --log-file=trigger-server-aware.log --db-change-aware transact "$server_txn" > trigger-server-aware.stdout 2> trigger-server-aware.stderr]) -AT_CAPTURE_FILE([trigger-server-aware.stdout]) -AT_CAPTURE_FILE([trigger-server-aware.log]) -AT_CAPTURE_FILE([trigger-server-aware.stderr]) - -AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-server-unaware.pid --log-file=trigger-server-unaware.log --no-db-change-aware transact "$server_txn" > trigger-server-unaware.stdout 2> trigger-server-unaware.stderr]) -AT_CAPTURE_FILE([trigger-server-unaware.stdout]) -AT_CAPTURE_FILE([trigger-server-unaware.log]) -AT_CAPTURE_FILE([trigger-server-unaware.stderr]) - -dnl Dump out and check the actual database contents. -AT_CHECK([ovsdb-client dump unix:db.sock ordinals], [0], [stdout]) -AT_CHECK([uuidfilt stdout], [0], [dnl + dnl Start two monitors on the 'ordinals' db, one that is database + dnl change aware and one that is not. 
+ AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-ordinals-aware.pid --log-file=monitor-ordinals-aware.log --db-change-aware --no-headings monitor ordinals ordinals number name > monitor-ordinals-aware.stdout 2> monitor-ordinals-aware.stderr]) + AT_CAPTURE_FILE([monitor-ordinals-aware.stdout]) + AT_CAPTURE_FILE([monitor-ordinals-aware.log]) + AT_CAPTURE_FILE([monitor-ordinals-aware.stderr]) + + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-ordinals-unaware.pid --log-file=monitor-ordinals-unaware.log --no-db-change-aware --no-headings monitor ordinals ordinals number name > monitor-ordinals-unaware.stdout 2> monitor-ordinals-unaware.stderr]) + AT_CAPTURE_FILE([monitor-ordinals-unaware.stdout]) + AT_CAPTURE_FILE([monitor-ordinals-unaware.log]) + AT_CAPTURE_FILE([monitor-ordinals-unaware.stderr]) + + dnl Start two monitors on the '_Server' db, one that is database + dnl change aware and one that is not. + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-server-aware.pid --log-file=monitor-server-aware.log --db-change-aware --no-headings monitor _Server Database name > monitor-server-aware.stdout 2> monitor-server-aware.stderr]) + AT_CAPTURE_FILE([monitor-server-aware.stdout]) + AT_CAPTURE_FILE([monitor-server-aware.log]) + AT_CAPTURE_FILE([monitor-server-aware.stderr]) + + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=monitor-server-unaware.pid --log-file=monitor-server-unaware.log --no-db-change-aware --no-headings monitor _Server Database name > monitor-server-unaware.stdout 2> monitor-server-unaware.stderr]) + AT_CAPTURE_FILE([monitor-server-unaware.stdout]) + AT_CAPTURE_FILE([monitor-server-unaware.log]) + AT_CAPTURE_FILE([monitor-server-unaware.stderr]) + + dnl Start two long-running transactions (triggers) on the 'ordinals' db, + dnl one that is database change aware and one that is not. 
+ ordinals_txn='[["ordinals", + {"op": "wait", + "table": "ordinals", + "where": [["name", "==", "seven"]], + "columns": ["name", "number"], + "rows": [], + "until": "!="}]]' + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-ordinals-aware.pid --log-file=trigger-ordinals-aware.log --db-change-aware transact "$ordinals_txn" > trigger-ordinals-aware.stdout 2> trigger-ordinals-aware.stderr]) + AT_CAPTURE_FILE([trigger-ordinals-aware.stdout]) + AT_CAPTURE_FILE([trigger-ordinals-aware.log]) + AT_CAPTURE_FILE([trigger-ordinals-aware.stderr]) + + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-ordinals-unaware.pid --log-file=trigger-ordinals-unaware.log --no-db-change-aware transact "$ordinals_txn" > trigger-ordinals-unaware.stdout 2> trigger-ordinals-unaware.stderr]) + AT_CAPTURE_FILE([trigger-ordinals-unaware.stdout]) + AT_CAPTURE_FILE([trigger-ordinals-unaware.log]) + AT_CAPTURE_FILE([trigger-ordinals-unaware.stderr]) + + dnl Start two long-running transactions (triggers) on the _Server db, + dnl one that is database change aware and one that is not. 
+ server_txn='[["_Server", + {"op": "wait", + "table": "Database", + "where": [["name", "==", "xyzzy"]], + "columns": ["name"], + "rows": [], + "until": "!="}]]' + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-server-aware.pid --log-file=trigger-server-aware.log --db-change-aware transact "$server_txn" > trigger-server-aware.stdout 2> trigger-server-aware.stderr]) + AT_CAPTURE_FILE([trigger-server-aware.stdout]) + AT_CAPTURE_FILE([trigger-server-aware.log]) + AT_CAPTURE_FILE([trigger-server-aware.stderr]) + + AT_CHECK([ovsdb-client -vfile -vvlog:off --detach --pidfile=trigger-server-unaware.pid --log-file=trigger-server-unaware.log --no-db-change-aware transact "$server_txn" > trigger-server-unaware.stdout 2> trigger-server-unaware.stderr]) + AT_CAPTURE_FILE([trigger-server-unaware.stdout]) + AT_CAPTURE_FILE([trigger-server-unaware.log]) + AT_CAPTURE_FILE([trigger-server-unaware.stderr]) + + dnl Dump out and check the actual database contents. + AT_CHECK([ovsdb-client dump unix:db.sock ordinals], [0], [stdout]) + AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table _uuid name number ------------------------------------ ----- ------ @@ -917,76 +963,77 @@ _uuid name number <5> zero 0 ]) -dnl Convert the database. -AT_CHECK([ovsdb-client convert new-schema]) + dnl Convert the database. + AT_CHECK([ovsdb-client convert new-schema]) -dnl Try "needs-conversion". -AT_CHECK([ovsdb-client needs-conversion schema], [0], [yes + dnl Try "needs-conversion". + AT_CHECK([ovsdb-client needs-conversion schema], [0], [yes ]) -AT_CHECK([ovsdb-client needs-conversion new-schema], [0], [no + AT_CHECK([ovsdb-client needs-conversion new-schema], [0], [no ]) -dnl Verify that the "ordinals" monitors behaved as they should have. -dnl Both should have exited, for different reasons. -dnl The db-aware _Server monitor should still be running, but not the unaware -dnl one. 
-for x in unaware aware; do - OVS_WAIT_WHILE([test -e monitor-ordinals-$x.pid]) - AT_CHECK([sort -k 3 monitor-ordinals-$x.stdout | uuidfilt], [0], -[<0> initial 0 zero + dnl Verify that the "ordinals" monitors behaved as they should have. + dnl Both should have exited, for different reasons. + for x in aware unaware; do + echo $x + OVS_WAIT_WHILE([test -e monitor-ordinals-$x.pid]) + AT_CHECK([sort -k 3 monitor-ordinals-$x.stdout | uuidfilt], [0], + [<0> initial 0 zero <1> initial 1 one <2> initial 2 two <3> initial 3 three <4> initial 4 four <5> initial 5 five ]) -done -AT_CHECK([sed 's/.*: //' monitor-ordinals-unaware.stderr], [0], [receive failed (End of file) + done + AT_CHECK([sed 's/.*: //' monitor-ordinals-unaware.stderr], [0], [receive failed (End of file) ]) -AT_CHECK([sed 's/.*: //' monitor-ordinals-aware.stderr], [0], [ordinals database was removed + AT_CHECK([sed 's/.*: //' monitor-ordinals-aware.stderr], [0], [ordinals database was removed ]) -dnl Verify that the _Server monitors behaved as they should have. -dnl The db-aware monitor should still be running, but not the unaware one. -for x in aware unaware; do - AT_CHECK([sort -k 3 monitor-server-$x.stdout | uuidfilt], [0], -[<0> initial _Server + dnl Verify that the _Server monitors behaved as they should have. + dnl The db-aware monitor should still be running, but not the unaware one. 
+ for x in aware unaware; do + AT_CHECK([sort -k 3 monitor-server-$x.stdout | uuidfilt], [0], + [<0> initial _Server <1> initial ordinals ]) -done -OVS_WAIT_WHILE([test -e monitor-server-unaware.pid]) -AT_CHECK([sed 's/.*: //' monitor-ordinals-unaware.stderr], [0], [receive failed (End of file) + done + OVS_WAIT_WHILE([test -e monitor-server-unaware.pid]) + AT_CHECK([sed 's/.*: //' monitor-ordinals-unaware.stderr], [0], [receive failed (End of file) ]) -AT_CHECK([test -e monitor-server-aware.pid]) + AT_CHECK([test -e monitor-server-aware.pid]) -dnl Verify that the "ordinals" triggers behaved as they should have: -dnl Both should have exited, for different reasons. -for x in unaware aware; do - OVS_WAIT_WHILE([test -e trigger-ordinals-$x.pid]) - AT_CHECK([cat trigger-ordinals-$x.stdout]) -done -AT_CHECK([cat trigger-ordinals-unaware.stderr], [0], [ovsdb-client: transaction failed (End of file) + dnl Verify that the "ordinals" triggers behaved as they should have: + dnl Both should have exited, for different reasons. + for x in unaware aware; do + OVS_WAIT_WHILE([test -e trigger-ordinals-$x.pid]) + AT_CHECK([cat trigger-ordinals-$x.stdout]) + done + AT_CHECK([cat trigger-ordinals-unaware.stderr], [0], [ovsdb-client: transaction failed (End of file) ]) -AT_CHECK([cat trigger-ordinals-aware.stderr], [0], [ovsdb-client: transaction returned error: {"error":"canceled"} + AT_CHECK([cat trigger-ordinals-aware.stderr], [0], [ovsdb-client: transaction returned error: {"details":"transaction canceled due to database schema change","error":"canceled"} ]) -dnl Verify that the _Server triggers behaved as they should have: -dnl The db-aware trigger should still be waiting, but not the unaware one. 
-for x in aware unaware; do - AT_CHECK([cat trigger-server-$x.stdout]) -done -OVS_WAIT_WHILE([test -e trigger-server-unaware.pid]) -AT_CHECK([sed 's/.*: //' trigger-ordinals-unaware.stderr], [0], [transaction failed (End of file) + dnl Verify that the _Server triggers behaved as they should have: + dnl The db-aware trigger should still be waiting, but not the unaware one. + for x in aware unaware; do + AT_CHECK([cat trigger-server-$x.stdout]) + done + OVS_WAIT_WHILE([test -e trigger-server-unaware.pid]) + AT_CHECK([sed 's/.*: //' trigger-ordinals-unaware.stderr], [0], [transaction failed (End of file) ]) -AT_CHECK([test -e trigger-server-aware.pid]) - -dnl We can't fully re-check the contents of the database log, because the -dnl order of the records is not predictable, but there should only be 4 lines -dnl in it now. -AT_CAPTURE_FILE([db]) -AT_CHECK([test `wc -l < db` -eq 4]) -dnl And check that the dumped data is the same except for the removed column: -AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl + AT_CHECK([test -e trigger-server-aware.pid]) + + AT_CAPTURE_FILE([db]) + if test $model = standalone; then + dnl We can't fully re-check the contents of the database log, because the + dnl order of the records is not predictable, but there should only be 4 lines + dnl in it now. + AT_CHECK([test `wc -l < db` -eq 4]) + fi + dnl Check that the dumped data is the same except for the removed column: + AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl ordinals table _uuid number ------------------------------------ ------ @@ -997,21 +1044,23 @@ _uuid number <4> 4 <5> 5 ]) -dnl Now check that the converted database is still online and can be modified, -dnl then check that the database log has one more record and that the data -dnl is as expected. 
-AT_CHECK( - [[ovsdb-client transact ' - ["ordinals", - {"op": "insert", - "table": "ordinals", - "row": {"number": 6}}, - {"op": "comment", - "comment": "add row for 6"}]' | uuidfilt]], [0], - [[[{"uuid":["uuid","<0>"]},{}] + dnl Now check that the converted database is still online and can be modified, + dnl then check that the database log has one more record and that the data + dnl is as expected. + AT_CHECK( + [[ovsdb-client transact ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 6}}, + {"op": "comment", + "comment": "add row for 6"}]' | uuidfilt]], [0], + [[[{"uuid":["uuid","<0>"]},{}] ]]) -AT_CHECK([test `wc -l < db` -eq 6]) -AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl + if test $model = standalone; then + AT_CHECK([test `wc -l < db` -eq 6]) + fi + AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl ordinals table _uuid number ------------------------------------ ------ @@ -1023,12 +1072,12 @@ _uuid number <5> 5 <6> 6 ]) -dnl Now kill and restart the database server to ensure that the data is -dnl correct on disk as well as in memory. -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) -AT_CHECK([[ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db]], - [0]) -AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl + dnl Now kill and restart the database server to ensure that the data is + dnl correct on disk as well as in memory. + OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + AT_CHECK([[ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db]], + [0]) + AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl ordinals table _uuid number ------------------------------------ ------ @@ -1041,18 +1090,29 @@ _uuid number <6> 6 ]) -dnl Make sure that "db" is still a symlink to dir/db instead of getting -dnl replaced by a regular file, ditto for .db.~lock~. 
-if test "$IS_WIN32" = "no"; then - AT_CHECK([test -h db]) - AT_CHECK([test -h .db.~lock~]) - AT_CHECK([test -f dir/db]) - AT_CHECK([test -f dir/.db.~lock~]) -fi + dnl Make sure that "db" is still a symlink to dir/db instead of getting + dnl replaced by a regular file, ditto for .db.~lock~. + if test "$IS_WIN32" = "no"; then + AT_CHECK([test -h db]) + AT_CHECK([test -h .db.~lock~]) + AT_CHECK([test -f dir/db]) + AT_CHECK([test -f dir/.db.~lock~]) + fi -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +} +]) + +AT_SETUP([schema conversion online - standalone]) +AT_KEYWORDS([ovsdb server convert needs-conversion standalone]) +ovsdb_check_online_conversion standalone AT_CLEANUP +AT_SETUP([schema conversion online - clustered]) +AT_KEYWORDS([ovsdb server convert needs-conversion cluster]) +ovsdb_check_online_conversion cluster +AT_CLEANUP + AT_SETUP([ovsdb-server combines updates on backlogged connections]) on_exit 'kill `cat *.pid`' diff --git a/tests/ovsdb-tool.at b/tests/ovsdb-tool.at index 1409a80c4b1f..ab766be248ea 100644 --- a/tests/ovsdb-tool.at +++ b/tests/ovsdb-tool.at @@ -302,6 +302,22 @@ _uuid name number OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP +AT_SETUP([ovsdb-tool unsupported cluster operations]) +AT_KEYWORDS([ovsdb file negative compact query transact convert]) +ordinal_schema > schema +AT_CHECK([ovsdb-tool create-cluster db schema unix:s1.raft]) +AT_CHECK([ovsdb-tool compact db], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CHECK([ovsdb-tool convert db schema], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CHECK([ovsdb-tool needs-conversion db schema], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CHECK([ovsdb-tool query db '[[]]'], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CHECK([ovsdb-tool 
transact db '[[]]'], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CLEANUP + AT_SETUP([ovsdb-tool schema-version, schema-cksum, schema-name]) AT_KEYWORDS([ovsdb file positive schema-version schema-cksum]) ordinal_schema > schema @@ -313,8 +329,8 @@ AT_CHECK([ovsdb-tool schema-name schema], [0], [ordinals ]) AT_CLEANUP -AT_SETUP([ovsdb-tool db-version, db-cksum, db-name]) -AT_KEYWORDS([ovsdb file positive db-version db-cksum]) +AT_SETUP([ovsdb-tool database inspection commands - standalone]) +AT_KEYWORDS([ovsdb file positive db-version db-cksum db-name db-cid db-sid db-local-address]) ordinal_schema > schema touch .db.~lock~ AT_CHECK([ovsdb-tool create db schema], [0], [], [ignore]) @@ -324,6 +340,61 @@ AT_CHECK([ovsdb-tool db-cksum db], [0], [12345678 9 ]) AT_CHECK([ovsdb-tool db-name db], [0], [ordinals ]) +AT_CHECK([ovsdb-tool db-cid db], [1], [], [ovsdb-tool: db: not a clustered database +]) +AT_CHECK([ovsdb-tool db-sid db], [1], [], [ovsdb-tool: db: not a clustered database +]) +AT_CHECK([ovsdb-tool db-local-address db], [1], [], [ovsdb-tool: db: not a clustered database +]) +AT_CLEANUP + +AT_SETUP([ovsdb-tool database inspection commands - clustered]) +AT_KEYWORDS([ovsdb file negative db-version db-cksum db-name db-cid db-sid db-local-address cluster]) +ordinal_schema > schema +touch .db.~lock~ +AT_CHECK([ovsdb-tool create-cluster db schema tcp:1.2.3.4:1234]) +AT_CHECK([ovsdb-tool db-version db], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CHECK([ovsdb-tool db-cksum db], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) +AT_CHECK([ovsdb-tool db-name db], [0], [ordinals +]) +AT_CHECK([(ovsdb-tool db-cid db; ovsdb-tool db-sid db) | uuidfilt], [0], [<0> +<1> +]) +AT_CHECK([ovsdb-tool db-local-address db], [0], [tcp:1.2.3.4:1234 +]) +AT_CLEANUP + +AT_SETUP([ovsdb-tool database inspection commands - 
joining a cluster]) +AT_KEYWORDS([ovsdb file positive db-version db-cksum db-name db-cid db-sid db-local-address cluster join joining]) +ordinal_schema > schema +touch .db.~lock~ +for cid in '' 520cf525-3772-43cc-8268-23bf5b548cf4; do + if test -z "$cid"; then + cid_option= + else + cid_option=--cid=$cid + fi + AT_CHECK([rm -f db && ovsdb-tool $cid_option join-cluster db ordinals tcp:1.2.3.4:1234 tcp:2.3.4.5:1234], [0], [], [ignore]) + AT_CHECK([ovsdb-tool db-version db], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) + AT_CHECK([ovsdb-tool db-cksum db], [1], [], [ovsdb-tool: ovsdb error: db: cannot apply this operation to clustered database file +]) + AT_CHECK([ovsdb-tool db-name db], [0], [ordinals +]) + if test -z "$cid"; then + AT_CHECK([ovsdb-tool db-cid db], [2], [], [db: cluster ID not yet known +]) + else + AT_CHECK_UNQUOTED([ovsdb-tool db-cid db], [0], [$cid +]) + fi + AT_CHECK([ovsdb-tool db-sid db | uuidfilt], [0], [<0> +]) + AT_CHECK([ovsdb-tool db-local-address db], [0], [tcp:1.2.3.4:1234 +]) +done AT_CLEANUP AT_SETUP([ovsdb-tool needs-conversion (no conversion needed)]) @@ -345,3 +416,46 @@ AT_CHECK([diff schema schema2], [1], [ignore]) AT_CHECK([ovsdb-tool needs-conversion db schema2], [0], [yes ]) AT_CLEANUP + +AT_SETUP([ovsdb-tool create-cluster with initial data]) +AT_KEYWORDS([ovsdb file positive]) + +# Create a standalone database and put some data in it. 
+ordinal_schema > schema +ovsdb-tool create db1 schema +AT_CHECK( + [[for pair in 'zero 0' 'one 1' 'two 2' 'three 3' 'four 4' 'five 5'; do + set -- $pair + ovsdb-tool transact db1 ' + ["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"name": "'$1'", "number": '$2'}}, + {"op": "comment", + "comment": "add row for '"$pair"'"}]' + done | uuidfilt]], [0], +[[[{"uuid":["uuid","<0>"]},{}] +[{"uuid":["uuid","<1>"]},{}] +[{"uuid":["uuid","<2>"]},{}] +[{"uuid":["uuid","<3>"]},{}] +[{"uuid":["uuid","<4>"]},{}] +[{"uuid":["uuid","<5>"]},{}] +]], [ignore]) + +# Dump the data. +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) +AT_CHECK([ovsdb-client dump > expout]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + +# Create a clustered database from the standalone one. +ovsdb-tool create-cluster db2 db1 unix:s1.raft + +# Dump the data. +AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db2]) +AT_CHECK([ovsdb-client wait ordinals connected]) +AT_CHECK([ovsdb-client dump > dump2]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + +# Make sure that the clustered data matched the standalone data. 
+AT_CHECK([cat dump2], [0], [expout]) +AT_CLEANUP diff --git a/tests/ovsdb.at b/tests/ovsdb.at index a38abd858272..f109b79b60a1 100644 --- a/tests/ovsdb.at +++ b/tests/ovsdb.at @@ -150,3 +150,4 @@ m4_include([tests/ovsdb-monitor.at]) m4_include([tests/ovsdb-idl.at]) m4_include([tests/ovsdb-lock.at]) m4_include([tests/ovsdb-rbac.at]) +m4_include([tests/ovsdb-cluster.at]) diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index 8502ad73ff69..05e97cb5132c 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -40,6 +40,7 @@ #include "ovsdb/query.h" #include "ovsdb/row.h" #include "ovsdb/server.h" +#include "ovsdb/storage.h" #include "ovsdb/table.h" #include "ovsdb/transaction.h" #include "ovsdb/trigger.h" @@ -1484,7 +1485,7 @@ do_execute__(struct ovs_cmdl_context *ctx, bool ro) json = parse_json(ctx->argv[1]); check_ovsdb_error(ovsdb_schema_from_json(json, &schema)); json_destroy(json); - db = ovsdb_create(schema); + db = ovsdb_create(schema, ovsdb_storage_create_unbacked()); for (i = 2; i < ctx->argc; i++) { struct json *params, *result; @@ -1550,7 +1551,7 @@ do_trigger(struct ovs_cmdl_context *ctx) json = parse_json(ctx->argv[1]); check_ovsdb_error(ovsdb_schema_from_json(json, &schema)); json_destroy(json); - db = ovsdb_create(schema); + db = ovsdb_create(schema, ovsdb_storage_create_unbacked()); ovsdb_server_init(&server); ovsdb_server_add_db(&server, db); @@ -1613,7 +1614,7 @@ static struct ovsdb_table *do_transact_table; static void do_transact_commit(struct ovs_cmdl_context *ctx OVS_UNUSED) { - ovsdb_error_destroy(ovsdb_txn_commit(do_transact_txn, false)); + ovsdb_error_destroy(ovsdb_txn_replay_commit(do_transact_txn)); do_transact_txn = NULL; } @@ -1780,7 +1781,7 @@ do_transact(struct ovs_cmdl_context *ctx) " \"j\": {\"type\": \"integer\"}}}}}"); check_ovsdb_error(ovsdb_schema_from_json(json, &schema)); json_destroy(json); - do_transact_db = ovsdb_create(schema); + do_transact_db = ovsdb_create(schema, ovsdb_storage_create_unbacked()); do_transact_table 
= ovsdb_get_table(do_transact_db, "mytable"); ovs_assert(do_transact_table != NULL); diff --git a/tutorial/ovs-sandbox b/tutorial/ovs-sandbox index 1632ad15da5a..f3aeafd55ec7 100755 --- a/tutorial/ovs-sandbox +++ b/tutorial/ovs-sandbox @@ -17,6 +17,7 @@ set -e run() { + echo "$@" (cd "$sandbox" && "$@") || exit 1 } @@ -70,6 +71,10 @@ ovn=false ovnsb_schema= ovnnb_schema= ovn_rbac=true +nbdb_model=standalone +nbdb_servers=3 +sbdb_model=backup +sbdb_servers=3 dummy=override for option; do @@ -109,6 +114,8 @@ These options force ovs-sandbox to use a particular OVS build: -s, --srcdir=DIR specify Open vSwitch source directory These options force ovs-sandbox to use an installed Open vSwitch: -i, --installed use installed Open vSwitch + +General options: -g, --gdb-vswitchd run ovs-vswitchd under gdb -d, --gdb-ovsdb run ovsdb-server under gdb --gdb-ovn-northd run ovn-northd under gdb @@ -118,8 +125,14 @@ These options force ovs-sandbox to use an installed Open vSwitch: -R, --gdb-run automatically start running the daemon in gdb for any daemon set to run under gdb -S, --schema=FILE use FILE as vswitch.ovsschema + +OVN options: -o, --ovn enable OVN --no-ovn-rbac disable role-based access control for OVN + --nbdb-model=standalone|backup|clustered northbound database model + --nbdb-servers=N number of servers in nbdb cluster (default: 3) + --sbdb-model=standalone|backup|clustered southbound database model + --sbdb-servers=N number of servers in sbdb cluster (default: 3) Other options: -h, --help Print this usage message. 
@@ -191,6 +204,34 @@ EOF --no-ovn-rbac) ovn_rbac=false ;; + --nbdb-s*=*) + nbdb_servers=$optarg + nbdb_model=clustered + ;; + --nbdb-s*) + prev=nbdb_servers + nbdb_model=clustered + ;; + --nbdb-m*=*) + nbdb_model=$optarg + ;; + --nbdb-m*) + prev=nbdb_model + ;; + --sbdb-s*=*) + sbdb_servers=$optarg + sbdb_model=clustered + ;; + --sbdb-s*) + prev=sbdb_servers + sbdb_model=clustered + ;; + --sbdb-m*=*) + sbdb_model=$optarg + ;; + --sbdb-m*) + prev=sbdb_model + ;; -R|--gdb-run) gdb_vswitchd_ex=true gdb_ovsdb_ex=true @@ -326,15 +367,10 @@ touch "$sandbox"/.conf.db.~lock~ run ovsdb-tool create conf.db "$schema" ovsdb_server_args= if $ovn; then - touch "$sandbox"/.ovnsb.db.~lock~ touch "$sandbox"/.ovnnb.db.~lock~ - run ovsdb-tool create ovnsb.db "$ovnsb_schema" - run ovsdb-tool create ovnsb2.db "$ovnsb_schema" run ovsdb-tool create ovnnb.db "$ovnnb_schema" run ovsdb-tool create vtep.db "$vtep_schema" ovsdb_server_args="vtep.db conf.db" - ovsdb_sb_server_args="ovnsb.db" - ovsdb_sb_backup_server_args="ovnsb2.db" ovsdb_nb_server_args="ovnnb.db" if [ "$HAVE_OPENSSL" = yes ]; then @@ -348,36 +384,85 @@ fi rungdb $gdb_ovsdb $gdb_ovsdb_ex ovsdb-server --detach --no-chdir --pidfile -vconsole:off --log-file \ --remote=punix:"$sandbox"/db.sock $ovsdb_server_args if $ovn; then - rungdb $gdb_ovsdb $gdb_ovsdb_ex ovsdb-server --detach --no-chdir \ - --pidfile="$sandbox"/ovnnb_db.pid -vconsole:off \ - --log-file="$sandbox"/ovnnb_db.log \ - --remote=db:OVN_Northbound,NB_Global,connections \ - --private-key=db:OVN_Northbound,SSL,private_key \ - --certificate=db:OVN_Northbound,SSL,certificate \ - --ca-cert=db:OVN_Northbound,SSL,ca_cert \ - --ssl-protocols=db:OVN_Northbound,SSL,ssl_protocols \ - --ssl-ciphers=db:OVN_Northbound,SSL,ssl_ciphers \ - --remote=punix:"$sandbox"/ovnnb_db.sock $ovsdb_nb_server_args - rungdb $gdb_ovsdb $gdb_ovsdb_ex ovsdb-server --detach --no-chdir \ - --pidfile="$sandbox"/ovnsb_db.pid -vconsole:off \ - --log-file="$sandbox"/ovnsb_db.log \ - 
--remote=db:OVN_Southbound,SB_Global,connections \ - --private-key=db:OVN_Southbound,SSL,private_key \ - --certificate=db:OVN_Southbound,SSL,certificate \ - --ca-cert=db:OVN_Southbound,SSL,ca_cert \ - --ssl-protocols=db:OVN_Southbound,SSL,ssl_protocols \ - --ssl-ciphers=db:OVN_Southbound,SSL,ssl_ciphers \ - --remote=punix:"$sandbox"/ovnsb_db.sock $ovsdb_sb_server_args - # Start SB back up server - rungdb $gdb_ovsdb $gdb_ovsdb_ex ovsdb-server --detach --no-chdir \ - --pidfile="$sandbox"/ovnsb_db2.pid -vconsole:off \ - --log-file="$sandbox"/ovnsb_db2.log \ - --private-key=db:OVN_Southbound,SSL,private_key \ - --certificate=db:OVN_Southbound,SSL,certificate \ - --ca-cert=db:OVN_Southbound,SSL,ca_cert \ - --remote=punix:"$sandbox"/ovnsb_db2.sock \ - --unixctl="$sandbox"/sb_backup_unixctl \ - --sync-from=unix:"$sandbox"/ovnsb_db.sock $ovsdb_sb_backup_server_args + ovn_start_db() { + local db=$1 model=$2 servers=$3 schema=$4 + local DB=$(echo $db | tr a-z A-Z) + local schema_name=$(ovsdb-tool schema-name $schema) + + case $model in + standalone | backup) ;; + clustered) + case $servers in + [1-9] | [1-9][0-9]) ;; + *) echo "${db}db servers must be between 1 and 99" >&2 + exit 1 + ;; + esac + ;; + *) + echo "unknown ${db}db model \"$model\"" >&2 + exit 1 + ;; + esac + + ovn_start_ovsdb_server() { + local i=$1; shift + rungdb $gdb_ovsdb $gdb_ovsdb_ex ovsdb-server --detach --no-chdir \ + --pidfile=$db$i.pid -vconsole:off --log-file=$db$i.log \ + --remote=db:$schema_name,${DB}_Global,connections \ + --private-key=db:$schema_name,SSL,private_key \ + --certificate=db:$schema_name,SSL,certificate \ + --ca-cert=db:$schema_name,SSL,ca_cert \ + --ssl-protocols=db:$schema_name,SSL,ssl_protocols \ + --ssl-ciphers=db:$schema_name,SSL,ssl_ciphers \ + --unixctl=${db}$i --remote=punix:$db$i.ovsdb ${db}$i.db "$@" + } + + case $model in + standalone) + run ovsdb-tool create ${db}1.db "$schema" + ovn_start_ovsdb_server 1 + remote=unix:${db}1.ovsdb + ;; + backup) + for i in 1 2; do + run 
ovsdb-tool create $db$i.db "$schema" + done + ovn_start_ovsdb_server 1 ${db}1.db + ovn_start_ovsdb_server 2 --sync-from=unix:${db}1.ovsdb + remote=unix:${db}1.ovsdb + backup_note="$backup_note +The backup server of OVN $DB can be accessed by: +* ovn-${db}ctl --db=unix:`pwd`/sandbox/${db}2.ovsdb +* ovs-appctl -t `pwd`/sandbox/${db}2 +The backup database file is sandbox/${db}2.db +" + ;; + clustered) + for i in $(seq $servers); do + if test $i = 1; then + run ovsdb-tool create-cluster ${db}1.db "$schema" unix:${db}1.raft; + else + run ovsdb-tool join-cluster $db$i.db $schema_name unix:$db$i.raft unix:${db}1.raft + fi + ovn_start_ovsdb_server $i + done + remote=unix:${db}1.ovsdb + for i in `seq 2 $servers`; do + remote=$remote,unix:$db$i.ovsdb + done + for i in $(seq $servers); do + run ovsdb-client wait unix:$db$i.ovsdb $schema_name connected + done + ;; + esac + eval OVN_${DB}_DB=\$remote + eval export OVN_${DB}_DB + } + + backup_note= + ovn_start_db nb "$nbdb_model" "$nbdb_servers" "$ovnnb_schema" + ovn_start_db sb "$sbdb_model" "$sbdb_servers" "$ovnsb_schema" fi #Add a small delay to allow ovsdb-server to launch. @@ -420,7 +505,7 @@ if $ovn; then ovs-vsctl set open . external-ids:ovn-remote=ssl:127.0.0.1:6642 OVN_CTRLR_PKI="-p $sandbox/chassis-1-privkey.pem -c $sandbox/chassis-1-cert.pem -C $sandbox/pki/switchca/cacert.pem" else - ovs-vsctl set open . external-ids:ovn-remote=unix:"$sandbox"/ovnsb_db.sock + ovs-vsctl set open . external-ids:ovn-remote=$OVN_SB_DB OVN_CTRLR_PKI="" fi rungdb $gdb_ovn_northd $gdb_ovn_northd_ex ovn-northd --detach \ @@ -447,13 +532,7 @@ EOF if $ovn; then cat << EOF This environment also has the OVN daemons and databases enabled. You can use ovn-nbctl and ovn-sbctl to interact with the OVN databases. - -The backup server of OVN SB can be accessed by: -* ovn-sbctl --db=unix:`pwd`/sandbox/ovnsb_db2.sock -* ovs-appctl -t `pwd`/sandbox/sb_backup_unixctl -The backup database file is "sandbox"/ovnsb2.db - - +$backup_note EOF fi cat <