[ovs-dev,v2,3/4] Add file descriptor persistence where possible
diff mbox series

Message ID 20200214175429.26111-3-anton.ivanov@cambridgegreys.com
State New
Headers show
Series
  • [ovs-dev,v2,1/4] Replace direct use of POLLXXX macros with OVS_POLLXXX
Related show

Commit Message

Anton Ivanov Feb. 14, 2020, 5:54 p.m. UTC
From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

1. Adds "persistent" behaviour where feasible (streams and signals).
These are waited upon in the same thread where they are created. This
allows them to be registered persistently with the OS (if possible)
as well as the OS to provide hints - is the FD ready, is it closed,
etc.

2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
which is not ready if that fd has been registered as "private" to the
thread which waits upon it.

3. No longer breaks other parts of OVS which create the fd in one
thread and waits upon it in others.

4. Adds support for EPOLL on Linux and can be expanded to cover similar
poll++ frameworks in other OSes.

5. Sets up the necessary infrastructure to make IO/SSL multi-threaded
using a "central (e)poll dispatcher + IO threads" pattern

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 include/openvswitch/poll-loop.h |  56 +++++-
 lib/dpif-netlink.c              |   6 +-
 lib/fatal-signal.c              |   7 +-
 lib/latch-unix.c                |   3 +-
 lib/netdev-afxdp.c              |   2 +-
 lib/poll-loop.c                 | 320 ++++++++++++++++++++++++--------
 lib/route-table-bsd.c           |   1 +
 lib/stream-fd.c                 |  62 ++++++-
 lib/stream-ssl.c                |  50 ++++-
 lib/timeval.c                   |  83 +++++++++
 lib/timeval.h                   |   7 +
 11 files changed, 508 insertions(+), 89 deletions(-)

Comments

Anton Ivanov Feb. 14, 2020, 5:55 p.m. UTC | #1
On 14/02/2020 17:54, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> 1. Adds "persistent" behaviour where feasible (streams and signals).
> These are waited upon in the same thread where they are created. This
> allows them to be registered persistently with the OS (if possible)
> as well as the OS to provide hints - is the FD ready, is it closed,
> etc.

I will amend the commit message in the next version - it is a bit misleading. Signals use a global pipe, which allows them to be persistent too.

> 
> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
> which is not ready if that fd has been registered as "private" to the
> thread which waits upon it.
> 
> 3. No longer breaks other parts of OVS which create the fd in one
> thread and waits upon it in others.
> 
> 4. Adds support for EPOLL on Linux and can be expanded to cover similar
> poll++ frameworks in other OSes.
> 
> 5. Sets up the necessary infrastructure to make IO/SSL multi-threaded
> using a "central (e)poll dispatcher + IO threads" pattern
> 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>   include/openvswitch/poll-loop.h |  56 +++++-
>   lib/dpif-netlink.c              |   6 +-
>   lib/fatal-signal.c              |   7 +-
>   lib/latch-unix.c                |   3 +-
>   lib/netdev-afxdp.c              |   2 +-
>   lib/poll-loop.c                 | 320 ++++++++++++++++++++++++--------
>   lib/route-table-bsd.c           |   1 +
>   lib/stream-fd.c                 |  62 ++++++-
>   lib/stream-ssl.c                |  50 ++++-
>   lib/timeval.c                   |  83 +++++++++
>   lib/timeval.h                   |   7 +
>   11 files changed, 508 insertions(+), 89 deletions(-)
> 
> diff --git a/include/openvswitch/poll-loop.h b/include/openvswitch/poll-loop.h
> index 532d9caa6..6d0331f6d 100644
> --- a/include/openvswitch/poll-loop.h
> +++ b/include/openvswitch/poll-loop.h
> @@ -41,11 +41,30 @@
>   #include <windows.h>
>   #endif
>   
> +#ifdef __linux__
> +#define OVS_USE_EPOLL
> +#endif
> +
> +#ifdef OVS_USE_EPOLL
> +#include <sys/epoll.h>
> +
> +#define OVS_POLLIN EPOLLIN
> +#define OVS_POLLOUT EPOLLOUT
> +#define OVS_POLLERR EPOLLERR
> +#define OVS_POLLHUP EPOLLHUP
> +#define OVS_ONESHOT EPOLLONESHOT
> +#define OVS_POLLNVAL 0
> +
> +#else
> +
>   #define OVS_POLLIN POLLIN
>   #define OVS_POLLOUT POLLOUT
>   #define OVS_POLLERR POLLERR
>   #define OVS_POLLNVAL POLLNVAL
>   #define OVS_POLLHUP POLLHUP
> +#define OVS_ONESHOT (1U << 30)
> +
> +#endif
>   
>   #ifdef  __cplusplus
>   extern "C" {
> @@ -60,10 +79,43 @@ extern "C" {
>    * the source code location of the caller.  The function version allows the
>    * caller to supply a location explicitly, which is useful if the caller's own
>    * caller would be more useful in log output.  See timer_wait_at() for an
> - * example. */
> -void poll_fd_wait_at(int fd, short int events, const char *where);
> + * example.
> + * Note - using on fds registered using poll_fd_register() will generate a
> + * warning as this is not an intended use.
> + */
> +void poll_fd_wait_at(int fd, int events, const char *where);
>   #define poll_fd_wait(fd, events) poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
>   
> +/* Register a fd with a persistence framework if available so it can be served
> + * "faster" and the caller can be provided with "hints" on what caused the IO
> + * event.
> + * If the "hint" argument is supplied it is set to point to the pollfd structure
> + * containing the events passed by the OS in .revents.
> + * Note - as the frameworks are OS dependent, the events are limited to what
> + * can be passed in a .revents which is a short int.
> + * Limitations - MUST BE registered from the same thread as the one where
> + * it will be waited upon.
> + */
> +
> +void poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where);
> +#define poll_fd_register(fd, events, hint) poll_fd_register_at(fd, events, hint, OVS_SOURCE_LOCATOR)
> +
> +/* De-register a fd which was registered as "private" with the persistence
> + * framework
> + */
> +
> +void poll_fd_deregister_at(int fd, const char *where);
> +#define poll_fd_deregister(fd) poll_fd_deregister_at(fd, OVS_SOURCE_LOCATOR)
> +
> +/* Schedule events to wake up the following poll_block() - "private fds"
> + * Same as poll_fd_wait, but for fds which have been registered and are
> + * expected to persist. If a "fast" OS fd notification framework is used
> + * this version of wait may be a NOOP (e.g. for (E)POLLIN events).
> + */
> +void private_poll_fd_wait_at(int fd, int events, const char *where);
> +#define private_poll_fd_wait(fd, events) private_poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
> +
> +
>   #ifdef _WIN32
>   void poll_wevent_wait_at(HANDLE wevent, const char *where);
>   #define poll_wevent_wait(wevent) poll_wevent_wait_at(wevent, OVS_SOURCE_LOCATOR)
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 5b5c96d72..ad5db9452 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -1289,7 +1289,7 @@ dpif_netlink_port_poll_wait(const struct dpif *dpif_)
>       const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>   
>       if (dpif->port_notifier) {
> -        nl_sock_wait(dpif->port_notifier, POLLIN);
> +        nl_sock_wait(dpif->port_notifier, OVS_POLLIN);
>       } else {
>           poll_immediate_wake();
>       }
> @@ -2756,13 +2756,13 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
>       }
>   
>       for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
> -        nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
> +        nl_sock_wait(sock_pool[i].nl_sock, OVS_POLLIN);
>       }
>   #else
>       if (dpif->handlers && handler_id < dpif->n_handlers) {
>           struct dpif_handler *handler = &dpif->handlers[handler_id];
>   
> -        poll_fd_wait(handler->epoll_fd, POLLIN);
> +        poll_fd_wait(handler->epoll_fd, OVS_POLLIN);
>       }
>   #endif
>   }
> diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
> index 97d8d1dab..424636e07 100644
> --- a/lib/fatal-signal.c
> +++ b/lib/fatal-signal.c
> @@ -96,6 +96,7 @@ fatal_signal_init(void)
>           ovs_mutex_init_recursive(&mutex);
>   #ifndef _WIN32
>           xpipe_nonblocking(signal_fds);
> +        poll_fd_register(signal_fds[0], OVS_POLLIN, NULL);
>   #else
>           wevent = CreateEvent(NULL, TRUE, FALSE, NULL);
>           if (!wevent) {
> @@ -236,9 +237,12 @@ void
>   fatal_signal_run(void)
>   {
>       sig_atomic_t sig_nr;
> +    char sigbuffer[_POSIX_PIPE_BUF];
>   
>       fatal_signal_init();
>   
> +    read(signal_fds[0], sigbuffer, sizeof(sigbuffer));
> +
>       sig_nr = stored_sig_nr;
>       if (sig_nr != SIG_ATOMIC_MAX) {
>           char namebuf[SIGNAL_NAME_BUFSIZE];
> @@ -271,7 +275,8 @@ fatal_signal_wait(void)
>   #ifdef _WIN32
>       poll_wevent_wait(wevent);
>   #else
> -    poll_fd_wait(signal_fds[0], OVS_POLLIN);
> +    /* a noop - schedule for removal */
> +    private_poll_fd_wait(signal_fds[0], OVS_POLLIN);
>   #endif
>   }
>   
> diff --git a/lib/latch-unix.c b/lib/latch-unix.c
> index fea61ab28..5f15b59fe 100644
> --- a/lib/latch-unix.c
> +++ b/lib/latch-unix.c
> @@ -83,5 +83,6 @@ latch_is_set(const struct latch *latch)
>   void
>   latch_wait_at(const struct latch *latch, const char *where)
>   {
> -    poll_fd_wait_at(latch->fds[0], OVS_POLLIN, where);
> +    /* Ask for wait and make it one-shot if persistence is in play */
> +    poll_fd_wait_at(latch->fds[0], OVS_POLLIN | OVS_ONESHOT, where);
>   }
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index ef367e5ea..482400d8d 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -184,7 +184,7 @@ xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
>   
>       if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
>           pfd.fd = fd;
> -        pfd.events = OVS_POLLIN;
> +        pfd.events = POLLIN;
>   
>           ret = poll(&pfd, 1, 0);
>           if (OVS_UNLIKELY(ret < 0)) {
> diff --git a/lib/poll-loop.c b/lib/poll-loop.c
> index 3902d6c1f..10a5b0c01 100644
> --- a/lib/poll-loop.c
> +++ b/lib/poll-loop.c
> @@ -18,6 +18,12 @@
>   #include "openvswitch/poll-loop.h"
>   #include <errno.h>
>   #include <inttypes.h>
> +#ifdef OVS_USE_EPOLL
> +#include <sys/epoll.h>
> +#endif
> +#ifndef _WIN32
> +#include <unistd.h>
> +#endif
>   #include <poll.h>
>   #include <stdlib.h>
>   #include <string.h>
> @@ -31,7 +37,9 @@
>   #include "timeval.h"
>   #include "openvswitch/vlog.h"
>   #include "openvswitch/hmap.h"
> +#include "openvswitch/list.h"
>   #include "hash.h"
> +#include "ovs-atomic.h"
>   
>   VLOG_DEFINE_THIS_MODULE(poll_loop);
>   
> @@ -43,21 +51,32 @@ struct poll_node {
>       struct pollfd pollfd;       /* Events to pass to time_poll(). */
>       HANDLE wevent;              /* Events for WaitForMultipleObjects(). */
>       const char *where;          /* Where poll_node was created. */
> +    bool valid;                 /* Can it be used? */
> +    bool private;               /* Can we assume that it is only in this thread poll loop? */
>   };
>   
> +#define MAX_EPOLL_EVENTS 64
> +
>   struct poll_loop {
> -    /* All active poll waiters. */
> +    /* List of all poll loops in the system */
> +    struct ovs_mutex loop_mutex;
> +    /* All poll waiters for this poll loop */
>       struct hmap poll_nodes;
>   
>       /* Time at which to wake up the next call to poll_block(), LLONG_MIN to
>        * wake up immediately, or LLONG_MAX to wait forever. */
>       long long int timeout_when; /* In msecs as returned by time_msec(). */
>       const char *timeout_where;  /* Where 'timeout_when' was set. */
> +#ifdef OVS_USE_EPOLL
> +    int epoll_fd;
> +    struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
> +#endif
>   };
>   
> +
>   static struct poll_loop *poll_loop(void);
>   
> -/* Look up the node with same fd or wevent. */
> +/* Look up the node with same fd or wevent - should be accessed under &loop->mutex. */
>   static struct poll_node *
>   find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>   {
> @@ -76,79 +95,142 @@ find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>       }
>       return NULL;
>   }
> -
> -/* On Unix based systems:
> - *
> - *     Registers 'fd' as waiting for the specified 'events' (which should be
> - *     OVS_POLLIN or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to
> - *     poll_block() will wake up when 'fd' becomes ready for one or more of the
> - *     requested events. The 'fd's are given to poll() function later.
> - *
> - * On Windows system:
> +/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
> + * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
> + * wake up when 'fd' becomes ready for one or more of the requested events.
>    *
> - *     If 'fd' is specified, create a new 'wevent'. Association of 'fd' and
> - *     'wevent' for 'events' happens in poll_block(). If 'wevent' is specified,
> - *     it is assumed that it is unrelated to any sockets and poll_block()
> - *     will wake up on any event on that 'wevent'. It is an error to pass
> - *     both 'wevent' and 'fd'.
> + * The event registration is PERSISTENT. This is intended for OSes which have a persistent
> + * event framework. For now it is implemented only for epoll and Linux, other
> + * implementations such as BSD kqueue and Solaris /dev/poll may follow.
>    *
> - * The event registration is one-shot: only the following call to
> - * poll_block() is affected.  The event will need to be re-registered after
> - * poll_block() is called if it is to persist.
> + * If the OS has no persistent event framework, this does nothing.
>    *
>    * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
>    * automatically provide the caller's source file and line number for
>    * 'where'.) */
> +
>   static void
> -poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
> +poll_fd_subscribe_at(int fd, HANDLE wevent, int events, struct pollfd **hint, const char *where, bool private)
>   {
>       struct poll_loop *loop = poll_loop();
>       struct poll_node *node;
> +#ifdef OVS_USE_EPOLL
> +    struct epoll_event event;
> +#endif
>   
> -    COVERAGE_INC(poll_create_node);
> -
> -    /* Both 'fd' and 'wevent' cannot be set. */
>       ovs_assert(!fd != !wevent);
>   
> +    /* This is mostly uncontended, so the thread should grab it straight away.
> +     * We will reuse it later to introduce threading for IO and SSL
> +     */
> +    ovs_mutex_lock(&loop->loop_mutex);
> +
>       /* Check for duplicate.  If found, "or" the events. */
>       node = find_poll_node(loop, fd, wevent);
> -    if (node) {
> -        node->pollfd.events |= events;
> -    } else {
> -        node = xzalloc(sizeof *node);
> -        hmap_insert(&loop->poll_nodes, &node->hmap_node,
> -                    hash_2words(fd, (uint32_t)wevent));
> -        node->pollfd.fd = fd;
> -        node->pollfd.events = events;
> -#ifdef _WIN32
> -        if (!wevent) {
> -            wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
> +
> +    if (node && node->valid) {
> +#ifdef OVS_USE_EPOLL
> +        int old_event_mask = node->pollfd.events;
> +#endif
> +        /* If there is an existing event mask we do not need to inc - this will be waited upon */
> +        node->pollfd.events |= (events & 0x0000FFFF); /* or without epoll specific bits */
> +
> +#ifdef OVS_USE_EPOLL
> +        /* modify existing epoll entry if there is an epoll specific ask or if the
> +         * mask has changed
> +         */
> +        if ((events & 0xFFFF0000) || (old_event_mask != node->pollfd.events)) {
> +            event.events = node->pollfd.events | events | EPOLLHUP | EPOLLRDHUP;
> +            event.data.ptr = node;
> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
>           }
>   #endif
> +    } else {
> +        if (!node) {
> +            node = xzalloc(sizeof *node);
> +            hmap_insert(&loop->poll_nodes, &node->hmap_node,
> +                        hash_2words(fd, 0));
> +        } else {
> +            /* node marked for reaping, OS has reused the fd number, valid is set to false */
> +#ifdef OVS_USE_EPOLL
> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, fd, NULL);
> +#endif
> +        }
> +        node->pollfd.fd = fd;
> +        node->pollfd.events = (events & 0x0000FFFF);
>           node->wevent = wevent;
>           node->where = where;
> +        node->valid = true;
> +        node->private = private;
> +#ifdef OVS_USE_EPOLL
> +        event.events = node->pollfd.events | EPOLLHUP | EPOLLRDHUP; /* we always listen for fd close */
> +        event.data.ptr = node;
> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
> +#endif
> +    }
> +    if (hint) {
> +        *hint = &node->pollfd;
>       }
> +    ovs_mutex_unlock(&loop->loop_mutex);
> +}
> +
> +void
> +poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where) {
> +    poll_fd_subscribe_at(fd, 0, events, hint, where , true);
> +}
> +
> +/* Deregisters a fd. Note - this looks like a memory leak (deallocating only private fds)
> + * but it is not.
> + * In order to be compatible with existing calling conventions while using fd persistence
> + * where supported we have to keep "legacy" fds around for the duration of the life of
> + * the thread because we have no idea if they have been reaped properly or not.
> + * The reason for this is that for some of them the close() is in a thread different from the
> + * poll loop.
> + * Thus, the only thing we can do in this case is mark them "invalid". Once the OS reuses the
> + * same fd number, we will reuse the existing hash entry.
> + */
> +
> +void
> +poll_fd_deregister_at(int fd, const char *where) {
> +    struct poll_loop *loop = poll_loop();
> +
> +    VLOG(VLL_DBG, "Deregister %d from %s", fd, where);
> +    struct poll_node *node;
> +
> +    ovs_mutex_lock(&loop->loop_mutex);
> +    node = find_poll_node(loop, fd, 0);
> +    if (node) {
> +        if (node->private) {
> +#ifdef OVS_USE_EPOLL
> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
> +#endif
> +            hmap_remove(&loop->poll_nodes, &node->hmap_node);
> +        } else {
> +            VLOG(VLL_WARN, "Trying to deregister a non-private %d from %s", fd, where);
> +            node->valid = false;
> +        }
> +    }
> +    ovs_mutex_unlock(&loop->loop_mutex);
> +}
> +
> +void
> +poll_fd_wait_at(int fd, int events, const char *where)
> +{
> +    poll_fd_subscribe_at(fd, 0, events, NULL, where, false);
>   }
>   
> -/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
> - * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
> - * wake up when 'fd' becomes ready for one or more of the requested events.
> - *
> - * On Windows, 'fd' must be a socket.
> - *
> - * The event registration is one-shot: only the following call to poll_block()
> - * is affected.  The event will need to be re-registered after poll_block() is
> - * called if it is to persist.
> - *
> - * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
> - * automatically provide the caller's source file and line number for
> - * 'where'.) */
>   void
> -poll_fd_wait_at(int fd, short int events, const char *where)
> +private_poll_fd_wait_at(int fd, int events, const char *where)
>   {
> -    poll_create_node(fd, 0, events, where);
> +    /* POLLIN persists on "private" fds - either emulated or at epoll
> +     * or other persistence framework level
> +     */
> +    if (events & (~OVS_POLLIN)) {
> +        poll_fd_subscribe_at(fd, 0, events, NULL, where, true);
> +    }
>   }
>   
> +
>   #ifdef _WIN32
>   /* Registers for the next call to poll_block() to wake up when 'wevent' is
>    * signaled.
> @@ -163,7 +245,7 @@ poll_fd_wait_at(int fd, short int events, const char *where)
>   void
>   poll_wevent_wait_at(HANDLE wevent, const char *where)
>   {
> -    poll_create_node(0, wevent, 0, where);
> +    poll_fd_subscribe_at(0, wevent, 0, NULL, where, false);
>   }
>   #endif /* _WIN32 */
>   
> @@ -277,9 +359,12 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>           if (pollfd->revents & OVS_POLLHUP) {
>               ds_put_cstr(&s, "[OVS_POLLHUP]");
>           }
> +#ifndef OVS_USE_EPOLL
> +        /* epoll does not have NVAL - it uses RDHUP and HUP which we cannot actually get to here*/
>           if (pollfd->revents & OVS_POLLNVAL) {
>               ds_put_cstr(&s, "[OVS_POLLNVAL]");
>           }
> +#endif
>           ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
>           free(description);
>       } else {
> @@ -295,12 +380,17 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>       ds_destroy(&s);
>   }
>   
> +
>   static void
>   free_poll_nodes(struct poll_loop *loop)
>   {
>       struct poll_node *node, *next;
>   
> +    ovs_mutex_lock(&loop->loop_mutex);
>       HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
> +#ifdef OVS_USE_EPOLL
> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
> +#endif
>           hmap_remove(&loop->poll_nodes, &node->hmap_node);
>   #ifdef _WIN32
>           if (node->wevent && node->pollfd.fd) {
> @@ -310,6 +400,7 @@ free_poll_nodes(struct poll_loop *loop)
>   #endif
>           free(node);
>       }
> +    ovs_mutex_unlock(&loop->loop_mutex);
>   }
>   
>   /* Blocks until one or more of the events registered with poll_fd_wait()
> @@ -320,8 +411,13 @@ poll_block(void)
>   {
>       struct poll_loop *loop = poll_loop();
>       struct poll_node *node;
> +#ifndef OVS_USE_EPOLL
>       struct pollfd *pollfds;
> +#endif
> +#ifndef OVS_USE_EPOLL
>       HANDLE *wevents = NULL;
> +    int counter;
> +#endif
>       int elapsed;
>       int retval;
>       int i;
> @@ -335,54 +431,126 @@ poll_block(void)
>       }
>   
>       timewarp_run();
> -    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>   
> +#ifdef OVS_USE_EPOLL
> +    retval = time_epoll_wait(loop->epoll_fd,
> +        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);
> +    if (retval < 0) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_ERR_RL(&rl, "epoll: %s", ovs_strerror(-retval));
> +    } else if (!retval) {
> +        log_wakeup(loop->timeout_where, NULL, elapsed);
> +    } else {
> +        ovs_mutex_lock(&loop->loop_mutex);
> +        if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
> +            for (i = 0; i < retval; i++) {
> +                node = (struct poll_node *) loop->epoll_events[i].data.ptr;
> +                if (loop->epoll_events[i].events) {
> +                    node->pollfd.revents = loop->epoll_events[i].events;
> +                    log_wakeup(node->where, &node->pollfd, 0);
> +                }
> +            }
> +        }
> +        for (i = 0; i < retval; i++) {
> +            node = (struct poll_node *) loop->epoll_events[i].data.ptr;
> +            if (loop->epoll_events[i].events & EPOLLHUP) {
> +                /* File descriptor closed already elsewhere
> +                 * We have to make the assumption that whoever closed it has
> +                 * ensured that anything which refers to IO event hints will not run
> +                 * on this fd after we free it.
> +                 */
> +                node->valid = false;
> +            }
> +            if (loop->epoll_events[i].events) {
> +                node->pollfd.revents |= (loop->epoll_events[i].events & 0x0000FFFF);
> +            }
> +            if (loop->epoll_events[i].events & OVS_POLLOUT) {
> +                struct epoll_event event;
> +                node->pollfd.events = OVS_POLLIN; /* reset back to defaults - write needs one shot */
> +                event.events = node->pollfd.events;
> +                event.data.ptr = node;
> +                epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, node->pollfd.fd, &event);
> +            }
> +        }
> +        ovs_mutex_unlock(&loop->loop_mutex);
> +    }
> +#else
> +    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>   #ifdef _WIN32
>       wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
>   #endif
>   
> +
>       /* Populate with all the fds and events. */
> -    i = 0;
> +    counter = 0;
>       HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
> -        pollfds[i] = node->pollfd;
> +        if ((node->valid) && (node->pollfd.events)) {
> +            pollfds[counter] = node->pollfd;
>   #ifdef _WIN32
> -        wevents[i] = node->wevent;
> -        if (node->pollfd.fd && node->wevent) {
> -            short int wsa_events = 0;
> -            if (node->pollfd.events & OVS_POLLIN) {
> -                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
> +            wevents[counter] = node->wevent;
> +            if (node->pollfd.fd && node->wevent) {
> +                short int wsa_events = 0;
> +                if (node->pollfd.events & OVS_POLLIN) {
> +                    wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
> +                }
> +                if (node->pollfd.events & OVS_POLLOUT) {
> +                    wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
> +                }
> +                WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
>               }
> -            if (node->pollfd.events & OVS_POLLOUT) {
> -                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
> -            }
> -            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
> -        }
>   #endif
> -        i++;
> +            counter++;
> +        }
>       }
>   
> -    retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
> +    retval = time_poll(pollfds, counter, wevents,
>                          loop->timeout_when, &elapsed);
>       if (retval < 0) {
>           static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>           VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
> -    } else if (!retval) {
> +    } else if (retval == 0) {
>           log_wakeup(loop->timeout_where, NULL, elapsed);
> -    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
> -        i = 0;
> -        HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
> +    } else {
> +        for (i = 0; i < counter; i++) {
>               if (pollfds[i].revents) {
> -                log_wakeup(node->where, &pollfds[i], 0);
> +
> +                node = find_poll_node(loop, pollfds[i].fd, 0);
> +
> +                if (!node) {
> +                    VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);
> +                }
> +                if (pollfds[i].revents & (OVS_POLLHUP | OVS_POLLNVAL)) {
> +                    node->valid = false;
> +                }
> +
> +                if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
> +                    log_wakeup(node->where, &pollfds[i], 0);
> +                }
> +                /* update "requested" events.
> +                 * Note - "private" fds always want POLLIN - that emulates EPOLL, /dev/poll, etc
> +                 * behaviour which they should be using in real life instead of using poll()
> +                 */
> +                if (node->private) {
> +                    node->pollfd.events &= ~(pollfds[i].revents & (~OVS_POLLIN));
> +                } else {
> +                    node->pollfd.events &= ~pollfds[i].revents;
> +                }
> +                /* update "occurred" events for use by streams and handlers. In case there
> +                 * is an existing (but not consumed yet) event, we OR the events in the
> +                 * stored record with the new ones - it is the job of the stream to clear
> +                 * that.
> +                 */
> +                node->pollfd.revents |= pollfds[i].revents;
>               }
> -            i++;
>           }
>       }
>   
> -    free_poll_nodes(loop);
> +    free(pollfds);
> +    if (wevents)
> +        free(wevents);
> +#endif
>       loop->timeout_when = LLONG_MAX;
>       loop->timeout_where = NULL;
> -    free(pollfds);
> -    free(wevents);
>   
>       /* Handle any pending signals before doing anything else. */
>       fatal_signal_run();
> @@ -416,8 +584,12 @@ poll_loop(void)
>       if (!loop) {
>           loop = xzalloc(sizeof *loop);
>           loop->timeout_when = LLONG_MAX;
> +        ovs_mutex_init(&loop->loop_mutex);
>           hmap_init(&loop->poll_nodes);
>           xpthread_setspecific(key, loop);
> +#ifdef OVS_USE_EPOLL
> +        loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS);
> +#endif
>       }
>       return loop;
>   }
> diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
> index 3dfa80c7f..16d155989 100644
> --- a/lib/route-table-bsd.c
> +++ b/lib/route-table-bsd.c
> @@ -34,6 +34,7 @@
>   #include "ovs-router.h"
>   #include "packets.h"
>   #include "openvswitch/vlog.h"
> +#include "openvswitch/poll-loop.h"
>   #include "util.h"
>   
>   VLOG_DEFINE_THIS_MODULE(route_table_bsd);
> diff --git a/lib/stream-fd.c b/lib/stream-fd.c
> index 62f768d45..6a80d6e05 100644
> --- a/lib/stream-fd.c
> +++ b/lib/stream-fd.c
> @@ -40,6 +40,8 @@ struct stream_fd
>       struct stream stream;
>       int fd;
>       int fd_type;
> +    bool rx_ready, tx_ready;
> +    struct pollfd *hint;
>   };
>   
>   static const struct stream_class stream_fd_class;
> @@ -67,7 +69,14 @@ new_fd_stream(char *name, int fd, int connect_status, int fd_type,
>       stream_init(&s->stream, &stream_fd_class, connect_status, name);
>       s->fd = fd;
>       s->fd_type = fd_type;
> +    s->rx_ready = true;
> +    s->tx_ready = true;
> +    s->hint = NULL;
>       *streamp = &s->stream;
> +    /* Persistent registration - we always get POLLINs from now on,
> +     * POLLOUTs when we ask for them
> +     */
> +    poll_fd_register(s->fd, OVS_POLLIN, &s->hint);
>       return 0;
>   }
>   
> @@ -82,6 +91,8 @@ static void
>   fd_close(struct stream *stream)
>   {
>       struct stream_fd *s = stream_fd_cast(stream);
> +    /* Deregister the FD from any persistent registrations if supported */
> +    poll_fd_deregister(s->fd);
>       closesocket(s->fd);
>       free(s);
>   }
> @@ -104,6 +115,24 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>       ssize_t retval;
>       int error;
>   
> +    if (s->hint) {
> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
> +         * to the read which should return 0 if the HUP is a real one, if not we clear it
> +         * for all other cases we believe what (e)poll has fed us.
> +         */
> +        if ((!(s->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (!s->rx_ready)) {
> +            if (!(s->hint->revents & OVS_POLLIN)) {
> +                return -EAGAIN;
> +            } else {
> +                /* POLLIN event from poll loop, mark us as ready */
> +                s->rx_ready = true;
> +                s->hint->revents &= ~OVS_POLLIN;
> +            }
> +        } else {
> +            s->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
> +        }
> +    }
> +
>       retval = recv(s->fd, buffer, n, 0);
>       if (retval < 0) {
>           error = sock_errno();
> @@ -114,6 +143,8 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>   #endif
>           if (error != EAGAIN) {
>               VLOG_DBG_RL(&rl, "recv: %s", sock_strerror(error));
> +        } else {
> +            s->rx_ready = false;
>           }
>           return -error;
>       }
> @@ -127,9 +158,29 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>       ssize_t retval;
>       int error;
>   
> +    if (s->hint) {
> +        /* poll-loop is providing us with hints for IO */
> +        if (!s->tx_ready) {
> +            if (!(s->hint->revents & OVS_POLLOUT)) {
> +                return -EAGAIN;
> +            } else {
> +                /* POLLOUT event from poll loop, mark us as ready */
> +                s->tx_ready = true;
> +                s->hint->revents &= ~OVS_POLLOUT;
> +            }
> +        }
> +    }
>       retval = send(s->fd, buffer, n, 0);
>       if (retval < 0) {
>           error = sock_errno();
> +#ifdef __linux__
> +        /* Linux will sometimes return ENOBUFS on sockets instead of EAGAIN. Usually seen
> +         *  on unix domain sockets
> +         */
> +        if (error == ENOBUFS) {
> +           error = EAGAIN;
> +        }
> +#endif
>   #ifdef _WIN32
>           if (error == WSAEWOULDBLOCK) {
>              error = EAGAIN;
> @@ -137,6 +188,8 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>   #endif
>           if (error != EAGAIN) {
>               VLOG_DBG_RL(&rl, "send: %s", sock_strerror(error));
> +        } else {
> +            s->tx_ready = false;
>           }
>           return -error;
>       }
> @@ -150,11 +203,11 @@ fd_wait(struct stream *stream, enum stream_wait_type wait)
>       switch (wait) {
>       case STREAM_CONNECT:
>       case STREAM_SEND:
> -        poll_fd_wait(s->fd, OVS_POLLOUT);
> +        private_poll_fd_wait(s->fd, OVS_POLLOUT);
>           break;
>   
>       case STREAM_RECV:
> -        poll_fd_wait(s->fd, OVS_POLLIN);
> +        private_poll_fd_wait(s->fd, OVS_POLLIN);
>           break;
>   
>       default:
> @@ -223,6 +276,8 @@ new_fd_pstream(char *name, int fd,
>       ps->accept_cb = accept_cb;
>       ps->unlink_path = unlink_path;
>       *pstreamp = &ps->pstream;
> +    /* persistent registration */
> +    poll_fd_register(ps->fd, OVS_POLLIN, NULL);
>       return 0;
>   }
>   
> @@ -230,6 +285,7 @@ static void
>   pfd_close(struct pstream *pstream)
>   {
>       struct fd_pstream *ps = fd_pstream_cast(pstream);
> +    poll_fd_deregister(ps->fd);
>       closesocket(ps->fd);
>       maybe_unlink_and_free(ps->unlink_path);
>       free(ps);
> @@ -271,7 +327,7 @@ static void
>   pfd_wait(struct pstream *pstream)
>   {
>       struct fd_pstream *ps = fd_pstream_cast(pstream);
> -    poll_fd_wait(ps->fd, OVS_POLLIN);
> +    private_poll_fd_wait(ps->fd, OVS_POLLIN);
>   }
>   
>   static const struct pstream_class fd_pstream_class = {
> diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c
> index 3b7f9865e..53ae51c1b 100644
> --- a/lib/stream-ssl.c
> +++ b/lib/stream-ssl.c
> @@ -147,6 +147,7 @@ struct ssl_stream
>       /* A few bytes of header data in case SSL negotiation fails. */
>       uint8_t head[2];
>       short int n_head;
> +    struct pollfd *hint;
>   };
>   
>   /* SSL context created by ssl_init(). */
> @@ -310,6 +311,8 @@ new_ssl_stream(char *name, char *server_name, int fd, enum session_type type,
>           SSL_set_msg_callback_arg(ssl, sslv);
>       }
>   
> +
> +    poll_fd_register(sslv->fd, OVS_POLLIN, &sslv->hint);
>       *streamp = &sslv->stream;
>       free(server_name);
>       return 0;
> @@ -604,6 +607,7 @@ ssl_close(struct stream *stream)
>       ERR_clear_error();
>   
>       SSL_free(sslv->ssl);
> +    poll_fd_deregister(sslv->fd);
>       closesocket(sslv->fd);
>       free(sslv);
>   }
> @@ -697,6 +701,27 @@ ssl_recv(struct stream *stream, void *buffer, size_t n)
>       /* Behavior of zero-byte SSL_read is poorly defined. */
>       ovs_assert(n > 0);
>   
> +     if (sslv->hint) {
> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
> +         * to the read which should return 0 if the HUP is a real one, if not we clear it
> +         * for all other cases we believe what (e)poll has fed us.
> +         */
> +        if ((!(sslv->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (sslv->rx_want == SSL_READING)) {
> +            if (!(sslv->hint->revents & OVS_POLLIN)) {
> +                return -EAGAIN;
> +            } else {
> +                /* POLLIN event from poll loop, mark us as ready
> +                 * rx_want is cleared further down by reading ssl fsm
> +                 */
> +                sslv->hint->revents &= ~OVS_POLLIN;
> +            }
> +        } else {
> +            sslv->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
> +        }
> +    }
> +
> +
> +
>       old_state = SSL_get_state(sslv->ssl);
>       ret = SSL_read(sslv->ssl, buffer, n);
>       if (old_state != SSL_get_state(sslv->ssl)) {
> @@ -729,6 +754,19 @@ ssl_do_tx(struct stream *stream)
>   {
>       struct ssl_stream *sslv = ssl_stream_cast(stream);
>   
> +     if (sslv->hint) {
> +        /* poll-loop is providing us with hints for IO */
> +        if (sslv->tx_want == SSL_WRITING) {
> +            if (!(sslv->hint->revents & OVS_POLLOUT)) {
> +                return EAGAIN;
> +            } else {
> +                /* POLLOUT event from poll loop, mark us as ready
> +                 * tx_want is cleared further down by the writing ssl fsm
> +                 */
> +                sslv->hint->revents &= ~OVS_POLLOUT;
> +            }
> +        }
> +    }
>       for (;;) {
>           int old_state = SSL_get_state(sslv->ssl);
>           int ret = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size);
> @@ -771,6 +809,8 @@ ssl_send(struct stream *stream, const void *buffer, size_t n)
>               ssl_clear_txbuf(sslv);
>               return n;
>           case EAGAIN:
> +            /* we want to know when this fd will become available again */
> +            stream_send_wait(stream);
>               return n;
>           default:
>               ssl_clear_txbuf(sslv);
> @@ -795,7 +835,7 @@ ssl_run_wait(struct stream *stream)
>       struct ssl_stream *sslv = ssl_stream_cast(stream);
>   
>       if (sslv->tx_want != SSL_NOTHING) {
> -        poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
> +        private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
>       }
>   }
>   
> @@ -811,13 +851,13 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>           } else {
>               switch (sslv->state) {
>               case STATE_TCP_CONNECTING:
> -                poll_fd_wait(sslv->fd, OVS_POLLOUT);
> +                private_poll_fd_wait(sslv->fd, OVS_POLLOUT);
>                   break;
>   
>               case STATE_SSL_CONNECTING:
>                   /* ssl_connect() called SSL_accept() or SSL_connect(), which
>                    * set up the status that we test here. */
> -                poll_fd_wait(sslv->fd,
> +                private_poll_fd_wait(sslv->fd,
>                                  want_to_poll_events(SSL_want(sslv->ssl)));
>                   break;
>   
> @@ -829,7 +869,7 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>   
>       case STREAM_RECV:
>           if (sslv->rx_want != SSL_NOTHING) {
> -            poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
> +            private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
>           } else {
>               poll_immediate_wake();
>           }
> @@ -911,6 +951,7 @@ pssl_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
>                    ds_steal_cstr(&bound_name));
>       pstream_set_bound_port(&pssl->pstream, htons(port));
>       pssl->fd = fd;
> +    poll_fd_register(fd, OVS_POLLIN, NULL);
>       *pstreamp = &pssl->pstream;
>   
>       return 0;
> @@ -920,6 +961,7 @@ static void
>   pssl_close(struct pstream *pstream)
>   {
>       struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
> +    poll_fd_deregister(pssl->fd);
>       closesocket(pssl->fd);
>       free(pssl);
>   }
> diff --git a/lib/timeval.c b/lib/timeval.c
> index 193c7bab1..59a12414f 100644
> --- a/lib/timeval.c
> +++ b/lib/timeval.c
> @@ -38,6 +38,7 @@
>   #include "unixctl.h"
>   #include "util.h"
>   #include "openvswitch/vlog.h"
> +#include "openvswitch/poll-loop.h"
>   
>   VLOG_DEFINE_THIS_MODULE(timeval);
>   
> @@ -369,6 +370,88 @@ time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED,
>       return retval;
>   }
>   
> +#ifdef OVS_USE_EPOLL
> +
> +/* Like epoll_wait(), except:
> + *
> + *      - The timeout is specified as an absolute time, as defined by
> + *        time_msec(), instead of a duration.
> + *
> + *      - On error, returns a negative error code (instead of setting errno).
> + *
> + *      - If interrupted by a signal, retries automatically until the original
> + *        timeout is reached.  (Because of this property, this function will
> + *        never return -EINTR.)
> + *
> + * Stores the number of milliseconds elapsed during poll in '*elapsed'. */
> +int
> +time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
> +          long long int timeout_when, int *elapsed)
> +{
> +    long long int *last_wakeup = last_wakeup_get();
> +    long long int start;
> +    bool quiescent;
> +    int retval = 0;
> +
> +    time_init();
> +    coverage_clear();
> +    coverage_run();
> +    if (*last_wakeup && !thread_is_pmd()) {
> +        log_poll_interval(*last_wakeup);
> +    }
> +    start = time_msec();
> +
> +    timeout_when = MIN(timeout_when, deadline);
> +    quiescent = ovsrcu_is_quiescent();
> +
> +    for (;;) {
> +        long long int now = time_msec();
> +        int time_left;
> +
> +        if (now >= timeout_when) {
> +            time_left = 0;
> +        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
> +            time_left = INT_MAX;
> +        } else {
> +            time_left = timeout_when - now;
> +        }
> +
> +        if (!quiescent) {
> +            if (!time_left) {
> +                ovsrcu_quiesce();
> +            } else {
> +                ovsrcu_quiesce_start();
> +            }
> +        }
> +
> +        retval = epoll_wait(epoll_fd, events, max, time_left);
> +        if (retval < 0) {
> +            retval = -errno;
> +        }
> +
> +        if (!quiescent && time_left) {
> +            ovsrcu_quiesce_end();
> +        }
> +
> +        if (deadline <= time_msec()) {
> +            fatal_signal_handler(SIGALRM);
> +            if (retval < 0) {
> +                retval = 0;
> +            }
> +            break;
> +        }
> +
> +        if (retval != -EINTR) {
> +            break;
> +        }
> +    }
> +    *last_wakeup = time_msec();
> +    refresh_rusage();
> +    *elapsed = *last_wakeup - start;
> +    return retval;
> +}
> +#endif
> +
>   long long int
>   timespec_to_msec(const struct timespec *ts)
>   {
> diff --git a/lib/timeval.h b/lib/timeval.h
> index 502f703d4..347a09d63 100644
> --- a/lib/timeval.h
> +++ b/lib/timeval.h
> @@ -20,6 +20,9 @@
>   #include <time.h>
>   #include "openvswitch/type-props.h"
>   #include "util.h"
> +#ifdef __linux__
> +#include <sys/epoll.h>
> +#endif
>   
>   #ifdef  __cplusplus
>   extern "C" {
> @@ -61,6 +64,10 @@ void time_wall_timespec(struct timespec *);
>   void time_alarm(unsigned int secs);
>   int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
>                 long long int timeout_when, int *elapsed);
> +#ifdef __linux__
> +int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
> +          long long int timeout_when, int *elapsed);
> +#endif
>   
>   long long int timespec_to_msec(const struct timespec *);
>   long long int timespec_to_usec(const struct timespec *);
>
0-day Robot Feb. 14, 2020, 7:06 p.m. UTC | #2
Bleep bloop.  Greetings Anton Ivanov, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line has trailing whitespace
#71 FILE: include/openvswitch/poll-loop.h:67:
#endif 

WARNING: Line has trailing whitespace
#92 FILE: include/openvswitch/poll-loop.h:93:
 * containing the events passed by the OS in .revents. 

WARNING: Line has trailing whitespace
#95 FILE: include/openvswitch/poll-loop.h:96:
 * Limitations - MUST BE registered from the same thread as the one where 

WARNING: Line is 86 characters long (recommended limit is 79)
#99 FILE: include/openvswitch/poll-loop.h:100:
void poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where);

WARNING: Line is 100 characters long (recommended limit is 79)
#100 FILE: include/openvswitch/poll-loop.h:101:
#define poll_fd_register(fd, events, hint) poll_fd_register_at(fd, events, hint, OVS_SOURCE_LOCATOR)

WARNING: Line is 96 characters long (recommended limit is 79)
#115 FILE: include/openvswitch/poll-loop.h:116:
#define private_poll_fd_wait(fd, events) private_poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)

WARNING: Line is 93 characters long (recommended limit is 79)
#242 FILE: lib/poll-loop.c:55:
    bool private;               /* Can we assume that it is only in this thread poll loop? */

WARNING: Line is 86 characters long (recommended limit is 79)
#268 FILE: lib/poll-loop.c:79:
/* Look up the node with same fd or wevent - should be accessed under &loop->mutex. */

WARNING: Line is 83 characters long (recommended limit is 79)
#285 FILE: lib/poll-loop.c:99:
/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN

WARNING: Line is 88 characters long (recommended limit is 79)
#286 FILE: lib/poll-loop.c:100:
 * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will

WARNING: Line is 90 characters long (recommended limit is 79)
#294 FILE: lib/poll-loop.c:103:
 * The event registration is PERSISTENT. This is intended for OSes which have a persistent

WARNING: Line is 110 characters long (recommended limit is 79)
#309 FILE: lib/poll-loop.c:114:
poll_fd_subscribe_at(int fd, HANDLE wevent, int events, struct pollfd **hint, const char *where, bool private)

WARNING: Line is 97 characters long (recommended limit is 79)
#345 FILE: lib/poll-loop.c:137:
        /* If there is an existing event mask we do not need to inc - this will be waited upon */

WARNING: Line is 90 characters long (recommended limit is 79)
#346 FILE: lib/poll-loop.c:138:
        node->pollfd.events |= (events & 0x0000FFFF); /* or without epoll specific bits */

WARNING: Line is 82 characters long (recommended limit is 79)
#349 FILE: lib/poll-loop.c:141:
        /* modify existing epoll entry if there is an epoll specific ask or if the

WARNING: Line is 80 characters long (recommended limit is 79)
#353 FILE: lib/poll-loop.c:145:
            event.events = node->pollfd.events | events | EPOLLHUP | EPOLLRDHUP;

WARNING: Line is 93 characters long (recommended limit is 79)
#364 FILE: lib/poll-loop.c:156:
            /* node marked for reaping, OS has reused the fd number, valid is set to false */

WARNING: Line is 103 characters long (recommended limit is 79)
#376 FILE: lib/poll-loop.c:168:
        event.events = node->pollfd.events | EPOLLHUP | EPOLLRDHUP; /* we always listen for fd close */

WARNING: Line is 82 characters long (recommended limit is 79)
#388 FILE: lib/poll-loop.c:180:
poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where) {

WARNING: Line is 89 characters long (recommended limit is 79)
#392 FILE: lib/poll-loop.c:184:
/* Deregisters a fd. Note - this looks like a memory leak (deallocating only private fds)

WARNING: Line is 89 characters long (recommended limit is 79)
#394 FILE: lib/poll-loop.c:186:
 * In order to be compatible with existing calling conventions while using fd persistence

WARNING: Line is 86 characters long (recommended limit is 79)
#395 FILE: lib/poll-loop.c:187:
 * where supported we have to keep "legacy" fds around for the duration of the life of

WARNING: Line is 93 characters long (recommended limit is 79)
#397 FILE: lib/poll-loop.c:189:
 * The reason for this is that for some of them the close() is in a thread different from the

WARNING: Line is 93 characters long (recommended limit is 79)
#399 FILE: lib/poll-loop.c:191:
 * Thus, the only thing we can do in this case is mark them "invalid". Once the OS reuses the

WARNING: Line is 87 characters long (recommended limit is 79)
#419 FILE: lib/poll-loop.c:211:
            VLOG(VLL_WARN, "Trying to deregister a non-private %d from %s", fd, where);

WARNING: Line is 98 characters long (recommended limit is 79)
#476 FILE: lib/poll-loop.c:363:
        /* epoll does not have NVAL - it uses RDHUP and HUP which we cannot actually get to here*/

WARNING: Line is 100 characters long (recommended limit is 79)
#532 FILE: lib/poll-loop.c:437:
        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);

WARNING: Line is 84 characters long (recommended limit is 79)
#554 FILE: lib/poll-loop.c:459:
                 * ensured that anything which refers to IO event hints will not run

WARNING: Line is 84 characters long (recommended limit is 79)
#560 FILE: lib/poll-loop.c:465:
                node->pollfd.revents |= (loop->epoll_events[i].events & 0x0000FFFF);

WARNING: Line is 101 characters long (recommended limit is 79)
#564 FILE: lib/poll-loop.c:469:
                node->pollfd.events = OVS_POLLIN; /* reset back to defaults - write needs one shot */

WARNING: Line is 82 characters long (recommended limit is 79)
#567 FILE: lib/poll-loop.c:472:
                epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, node->pollfd.fd, &event);

WARNING: Line is 105 characters long (recommended limit is 79)
#634 FILE: lib/poll-loop.c:520:
                    VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);

WARNING: Line has trailing whitespace
#643 FILE: lib/poll-loop.c:529:
                /* update "requested" events. 

WARNING: Line is 96 characters long (recommended limit is 79)
#644 FILE: lib/poll-loop.c:530:
                 * Note - "private" fds always want POLLIN - that emulates EPOLL, /dev/poll, etc

WARNING: Line is 92 characters long (recommended limit is 79)
#645 FILE: lib/poll-loop.c:531:
                 * behaviour which they should be using in real life instead of using poll()

WARNING: Line is 81 characters long (recommended limit is 79)
#648 FILE: lib/poll-loop.c:534:
                    node->pollfd.events &= ~(pollfds[i].revents & (~OVS_POLLIN));

WARNING: Line is 89 characters long (recommended limit is 79)
#652 FILE: lib/poll-loop.c:538:
                /* update "occured" events for use by streams and handlers. In case there

WARNING: Line is 87 characters long (recommended limit is 79)
#653 FILE: lib/poll-loop.c:539:
                 * is an existing (but not consumed yet) event, we OR the events in the

WARNING: Line is 89 characters long (recommended limit is 79)
#654 FILE: lib/poll-loop.c:540:
                 * stored record with the new ones - it is the job of the stream to clear

ERROR: Inappropriate bracing around statement
#665 FILE: lib/poll-loop.c:549:
    if (wevents)

WARNING: Line is 93 characters long (recommended limit is 79)
#742 FILE: lib/stream-fd.c:119:
        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight

WARNING: Line is 89 characters long (recommended limit is 79)
#743 FILE: lib/stream-fd.c:120:
         * to the read which should return 0 if the HUP is a real one, if not we clear it

WARNING: Line is 83 characters long (recommended limit is 79)
WARNING: Line lacks whitespace around operator
#746 FILE: lib/stream-fd.c:123:
        if ((!(s->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (!s->rx_ready)) {

WARNING: Line lacks whitespace around operator
#755 FILE: lib/stream-fd.c:132:
            s->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);

WARNING: Line is 89 characters long (recommended limit is 79)
#791 FILE: lib/stream-fd.c:177:
        /* Linux will sometimes return ENOBUFS on sockets instead of EAGAIN. Usually seen

WARNING: Line has trailing whitespace
#792 FILE: lib/stream-fd.c:178:
         *  on unix domain sockets 

WARNING: Line is 93 characters long (recommended limit is 79)
#884 FILE: lib/stream-ssl.c:705:
        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight

WARNING: Line is 89 characters long (recommended limit is 79)
#885 FILE: lib/stream-ssl.c:706:
         * to the read which should return 0 if the HUP is a real one, if not we clear it

WARNING: Line is 102 characters long (recommended limit is 79)
WARNING: Line lacks whitespace around operator
#888 FILE: lib/stream-ssl.c:709:
        if ((!(sslv->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (sslv->rx_want == SSL_READING)) {

WARNING: Line has trailing whitespace
#892 FILE: lib/stream-ssl.c:713:
                /* POLLIN event from poll loop, mark us as ready 

WARNING: Line lacks whitespace around operator
#898 FILE: lib/stream-ssl.c:719:
            sslv->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);

WARNING: Line has trailing whitespace
#917 FILE: lib/stream-ssl.c:763:
                /* POLLIN event from poll loop, mark us as ready 

Lines checked: 1114, Warnings: 53, Errors: 1


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
Dumitru Ceara Feb. 17, 2020, 2:48 p.m. UTC | #3
On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> 1. Adds "persistent" behaviour where feasible (streams and signals).
> These are waited upon in the same thread where they are created. This
> allows them to be registered persistently with the OS (if possible)
> as well as the OS to provide hints - is the FD ready, is it closed,
> etc.
> 
> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
> which is not ready if that fd has been registered as "private" to the
> thread which waits upon it.
> 
> 3. No longer breaks other parts of OVS which create the fd in one
> thread and waits upon it in others.
> 
> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
> poll++ frameworks in other OSes.
> 
> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
> using a "centeral (e)poll dispatcher + IO threads" pattern
> 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>

Hi Anton,

A couple of issues inline. Except for that:

1. The "STP - flush the fdb and mdb when topology changed" OVS test is
failing with your patches applied:

make check TESTSUITEFLAGS='-k "flush the fdb"'

2. Travis CI build fails:

lib/fatal-signal.c:244:5: error: ignoring return value of ‘read’,
declared with attribute warn_unused_result [-Werror=unused-result]

     read(signal_fds[0], sigbuffer, sizeof(sigbuffer));

3. Travis CI OSX build fails:

lib/poll-loop.c:46:1: error: unused function 'poll_create_node_add'
[-Werror,-Wunused-function]

COVERAGE_DEFINE(poll_create_node);

4. While OVS might benefit from these changes I'm wondering about OVN
and ovsdb-server specifically. ovsdb-server is single threaded and
usually on large scale deployments we don't really see "poll" as the
bottleneck or even the fact that code tries to read/write from FDs when
FDs are not available for read/write.

For example, here are results of running a scale test scenario which
repeats the following iteration 300 times:
- bring up a node (ovn-fake-multinode container) and connect it to the
OVN Southbound DB.
- configure an OVN logical switch to be bound to the new node.
- configure an OVN logical switch port on the new logical switch.
- configure an OVS internal interface on the new node and bind it to the
OVN logical switch port.
- wait until the new internal interface can ping its default gateway
through OVN (i.e., until ovn-controller on the node received all updates
from the SB DB and installed all OVS flows), highlighted in the output.

The tests use rally-ovs (ovn-scale-test) on a 9 server setup (1 machine
running OVN ovsdb-servers and ovn-northd and 8 machines simulating
chassis using ovn-fake-multinode), in particular this modified scenario:
https://github.com/dceara/ovn-scale-test/blob/ovn-switch-per-node/samples/tasks/scenarios/ovn-network/osh_workload_incremental.json

With OVS master and OVN master:
http://pastebin.test.redhat.com/836568

With OVS master + your patches and OVN master:
http://pastebin.test.redhat.com/836571

Here are some of the logs we get on the OVN Southbound DB ovsdb-server
that show that ovsdb-server spends up to 2 seconds in a single loop
iteration sending/receiving updates to/from ovn-controllers:

2020-02-17T10:43:41.175Z|01991|poll_loop|INFO|wakeup due to [OVS_POLLIN]
on fd 140 (192.16.0.1:6642<->192.16.0.120:52018) at lib/stream-fd.c:79
(84% CPU usage)
2020-02-17T10:43:43.338Z|01992|timeval|WARN|Unreasonably long 2163ms
poll interval (2144ms user, 9ms system)
2020-02-17T10:43:43.339Z|01993|timeval|WARN|faults: 590 minor, 0 major
2020-02-17T10:43:43.339Z|01994|timeval|WARN|disk: 0 reads, 8 writes
2020-02-17T10:43:43.339Z|01995|timeval|WARN|context switches: 0
voluntary, 4 involuntary
2020-02-17T10:43:43.339Z|01996|poll_loop|INFO|Dropped 63 log messages in
last 2 seconds (most recently, 2 seconds ago) due to excessive rate
2020-02-17T10:43:43.339Z|01997|poll_loop|INFO|wakeup due to [OVS_POLLIN]
on fd 76 (192.16.0.1:6642<->192.16.0.56:33538) at lib/stream-fd.c:79
(84% CPU usage)
2020-02-17T10:43:45.495Z|01998|timeval|WARN|Unreasonably long 2156ms
poll interval (2129ms user, 17ms system)
2020-02-17T10:43:45.495Z|01999|timeval|WARN|faults: 738 minor, 0 major
2020-02-17T10:43:45.495Z|02000|timeval|WARN|context switches: 0
voluntary, 7 involuntary
2020-02-17T10:43:47.651Z|02001|timeval|WARN|Unreasonably long 2157ms
poll interval (2136ms user, 10ms system)

In this case, and I think in most OVN use cases, ovsdb-server is busy
because it actually has to send updates to large numbers of
ovn-controllers connected to it. Unless I'm missing something the epoll
change seems to improve performance only in cases where the Southbound
DB doesn't do much sending/receiving. How do you test
performance/scalability improvements?

Regards,
Dumitru

> ---
>  include/openvswitch/poll-loop.h |  56 +++++-
>  lib/dpif-netlink.c              |   6 +-
>  lib/fatal-signal.c              |   7 +-
>  lib/latch-unix.c                |   3 +-
>  lib/netdev-afxdp.c              |   2 +-
>  lib/poll-loop.c                 | 320 ++++++++++++++++++++++++--------
>  lib/route-table-bsd.c           |   1 +
>  lib/stream-fd.c                 |  62 ++++++-
>  lib/stream-ssl.c                |  50 ++++-
>  lib/timeval.c                   |  83 +++++++++
>  lib/timeval.h                   |   7 +
>  11 files changed, 508 insertions(+), 89 deletions(-)
> 
> diff --git a/include/openvswitch/poll-loop.h b/include/openvswitch/poll-loop.h
> index 532d9caa6..6d0331f6d 100644
> --- a/include/openvswitch/poll-loop.h
> +++ b/include/openvswitch/poll-loop.h
> @@ -41,11 +41,30 @@
>  #include <windows.h>
>  #endif
>  
> +#ifdef __linux__
> +#define OVS_USE_EPOLL
> +#endif
> +
> +#ifdef OVS_USE_EPOLL
> +#include <sys/epoll.h>
> +
> +#define OVS_POLLIN EPOLLIN
> +#define OVS_POLLOUT EPOLLOUT
> +#define OVS_POLLERR EPOLLERR
> +#define OVS_POLLHUP EPOLLHUP
> +#define OVS_ONESHOT EPOLLONESHOT
> +#define OVS_POLLNVAL 0
> +
> +#else
> +
>  #define OVS_POLLIN POLLIN
>  #define OVS_POLLOUT POLLOUT
>  #define OVS_POLLERR POLLERR
>  #define OVS_POLLNVAL POLLNVAL
>  #define OVS_POLLHUP POLLHUP
> +#define OVS_ONESHOT (1U << 30)
> +
> +#endif 
>  
>  #ifdef  __cplusplus
>  extern "C" {
> @@ -60,10 +79,43 @@ extern "C" {
>   * the source code location of the caller.  The function version allows the
>   * caller to supply a location explicitly, which is useful if the caller's own
>   * caller would be more useful in log output.  See timer_wait_at() for an
> - * example. */
> -void poll_fd_wait_at(int fd, short int events, const char *where);
> + * example.
> + * Note - using on fds registered using poll_fd_register() will generate a
> + * warning as this is not an intended use.
> + */
> +void poll_fd_wait_at(int fd, int events, const char *where);
>  #define poll_fd_wait(fd, events) poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
>  
> +/* Register a fd with a persistence framework if available so it can be served
> + * "faster" and the caller can be provided with "hints" on what caused the IO
> + * event.
> + * If the "hint" argument is supplied it set to point to the pollfd structure
> + * containing the events passed by the OS in .revents. 
> + * Note - as the frameworks are OS dependent, the events are limited to what
> + * can be passed in a .revents which is a short int.
> + * Limitations - MUST BE registered from the same thread as the one where 
> + * it will be waited upon.
> + */
> +
> +void poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where);
> +#define poll_fd_register(fd, events, hint) poll_fd_register_at(fd, events, hint, OVS_SOURCE_LOCATOR)
> +
> +/* De-register a fd which was registered as "private" with the persistence
> + * framework
> + */
> +
> +void poll_fd_deregister_at(int fd, const char *where);
> +#define poll_fd_deregister(fd) poll_fd_deregister_at(fd, OVS_SOURCE_LOCATOR)
> +
> +/* Schedule events to wake up the following poll_block() - "private fds"
> + * Same as poll_fd_wait, but for fds which have been registered and are
> + * expected to persist. If a "fast" OS fd notification framework is used
> + * this version of wait may be a NOOP (f.e. for (E)POLLIN events.
> + */
> +void private_poll_fd_wait_at(int fd, int events, const char *where);
> +#define private_poll_fd_wait(fd, events) private_poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
> +
> +
>  #ifdef _WIN32
>  void poll_wevent_wait_at(HANDLE wevent, const char *where);
>  #define poll_wevent_wait(wevent) poll_wevent_wait_at(wevent, OVS_SOURCE_LOCATOR)
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 5b5c96d72..ad5db9452 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -1289,7 +1289,7 @@ dpif_netlink_port_poll_wait(const struct dpif *dpif_)
>      const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>  
>      if (dpif->port_notifier) {
> -        nl_sock_wait(dpif->port_notifier, POLLIN);
> +        nl_sock_wait(dpif->port_notifier, OVS_POLLIN);
>      } else {
>          poll_immediate_wake();
>      }
> @@ -2756,13 +2756,13 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
>      }
>  
>      for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
> -        nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
> +        nl_sock_wait(sock_pool[i].nl_sock, OVS_POLLIN);
>      }
>  #else
>      if (dpif->handlers && handler_id < dpif->n_handlers) {
>          struct dpif_handler *handler = &dpif->handlers[handler_id];
>  
> -        poll_fd_wait(handler->epoll_fd, POLLIN);
> +        poll_fd_wait(handler->epoll_fd, OVS_POLLIN);
>      }
>  #endif
>  }
> diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
> index 97d8d1dab..424636e07 100644
> --- a/lib/fatal-signal.c
> +++ b/lib/fatal-signal.c
> @@ -96,6 +96,7 @@ fatal_signal_init(void)
>          ovs_mutex_init_recursive(&mutex);
>  #ifndef _WIN32
>          xpipe_nonblocking(signal_fds);
> +        poll_fd_register(signal_fds[0], OVS_POLLIN, NULL);
>  #else
>          wevent = CreateEvent(NULL, TRUE, FALSE, NULL);
>          if (!wevent) {
> @@ -236,9 +237,12 @@ void
>  fatal_signal_run(void)
>  {
>      sig_atomic_t sig_nr;
> +    char sigbuffer[_POSIX_PIPE_BUF];
>  
>      fatal_signal_init();
>  
> +    read(signal_fds[0], sigbuffer, sizeof(sigbuffer));
> +
>      sig_nr = stored_sig_nr;
>      if (sig_nr != SIG_ATOMIC_MAX) {
>          char namebuf[SIGNAL_NAME_BUFSIZE];
> @@ -271,7 +275,8 @@ fatal_signal_wait(void)
>  #ifdef _WIN32
>      poll_wevent_wait(wevent);
>  #else
> -    poll_fd_wait(signal_fds[0], OVS_POLLIN);
> +    /* a noop - schedule for removal */
> +    private_poll_fd_wait(signal_fds[0], OVS_POLLIN);
>  #endif
>  }
>  
> diff --git a/lib/latch-unix.c b/lib/latch-unix.c
> index fea61ab28..5f15b59fe 100644
> --- a/lib/latch-unix.c
> +++ b/lib/latch-unix.c
> @@ -83,5 +83,6 @@ latch_is_set(const struct latch *latch)
>  void
>  latch_wait_at(const struct latch *latch, const char *where)
>  {
> -    poll_fd_wait_at(latch->fds[0], OVS_POLLIN, where);
> +    /* Ask for wait and make it one-shot if persistence is in play */
> +    poll_fd_wait_at(latch->fds[0], OVS_POLLIN | OVS_ONESHOT, where);
>  }
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index ef367e5ea..482400d8d 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -184,7 +184,7 @@ xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
>  
>      if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
>          pfd.fd = fd;
> -        pfd.events = OVS_POLLIN;
> +        pfd.events = POLLIN;
>  
>          ret = poll(&pfd, 1, 0);
>          if (OVS_UNLIKELY(ret < 0)) {
> diff --git a/lib/poll-loop.c b/lib/poll-loop.c
> index 3902d6c1f..10a5b0c01 100644
> --- a/lib/poll-loop.c
> +++ b/lib/poll-loop.c
> @@ -18,6 +18,12 @@
>  #include "openvswitch/poll-loop.h"
>  #include <errno.h>
>  #include <inttypes.h>
> +#ifdef OVS_USE_EPOLL
> +#include <sys/epoll.h>
> +#endif
> +#ifndef _WIN32
> +#include <unistd.h>
> +#endif
>  #include <poll.h>
>  #include <stdlib.h>
>  #include <string.h>
> @@ -31,7 +37,9 @@
>  #include "timeval.h"
>  #include "openvswitch/vlog.h"
>  #include "openvswitch/hmap.h"
> +#include "openvswitch/list.h"
>  #include "hash.h"
> +#include "ovs-atomic.h"
>  
>  VLOG_DEFINE_THIS_MODULE(poll_loop);
>  
> @@ -43,21 +51,32 @@ struct poll_node {
>      struct pollfd pollfd;       /* Events to pass to time_poll(). */
>      HANDLE wevent;              /* Events for WaitForMultipleObjects(). */
>      const char *where;          /* Where poll_node was created. */
> +    bool valid;                 /* Can it be used? */
> +    bool private;               /* Can we assume that it is only in this thread poll loop? */
>  };
>  
> +#define MAX_EPOLL_EVENTS 64
> +
>  struct poll_loop {
> -    /* All active poll waiters. */
> +    /* List of all poll loops in the system */
> +    struct ovs_mutex loop_mutex;
> +    /* All poll waiters for this poll loop */
>      struct hmap poll_nodes;
>  
>      /* Time at which to wake up the next call to poll_block(), LLONG_MIN to
>       * wake up immediately, or LLONG_MAX to wait forever. */
>      long long int timeout_when; /* In msecs as returned by time_msec(). */
>      const char *timeout_where;  /* Where 'timeout_when' was set. */
> +#ifdef OVS_USE_EPOLL
> +    int epoll_fd;
> +    struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
> +#endif
>  };
>  
> +
>  static struct poll_loop *poll_loop(void);
>  
> -/* Look up the node with same fd or wevent. */
> +/* Look up the node with same fd or wevent - should be accessed under &loop->mutex. */
>  static struct poll_node *
>  find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>  {
> @@ -76,79 +95,142 @@ find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>      }
>      return NULL;
>  }
> -
> -/* On Unix based systems:
> - *
> - *     Registers 'fd' as waiting for the specified 'events' (which should be
> - *     OVS_POLLIN or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to
> - *     poll_block() will wake up when 'fd' becomes ready for one or more of the
> - *     requested events. The 'fd's are given to poll() function later.
> - *
> - * On Windows system:
> +/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
> + * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
> + * wake up when 'fd' becomes ready for one or more of the requested events.
>   *
> - *     If 'fd' is specified, create a new 'wevent'. Association of 'fd' and
> - *     'wevent' for 'events' happens in poll_block(). If 'wevent' is specified,
> - *     it is assumed that it is unrelated to any sockets and poll_block()
> - *     will wake up on any event on that 'wevent'. It is an error to pass
> - *     both 'wevent' and 'fd'.
> + * The event registration is PERSISTENT. This is intended for OSes which have a persistent
> + * event framework. For now it is implemented only for epoll and Linux, other
> + * implementations such as BSD kqueue and Solaris /dev/poll may follow.
>   *
> - * The event registration is one-shot: only the following call to
> - * poll_block() is affected.  The event will need to be re-registered after
> - * poll_block() is called if it is to persist.
> + * If the OS has no persistent event framework, this does nothing.
>   *
>   * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
>   * automatically provide the caller's source file and line number for
>   * 'where'.) */
> +
>  static void
> -poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
> +poll_fd_subscribe_at(int fd, HANDLE wevent, int events, struct pollfd **hint, const char *where, bool private)
>  {
>      struct poll_loop *loop = poll_loop();
>      struct poll_node *node;
> +#ifdef OVS_USE_EPOLL
> +    struct epoll_event event;
> +#endif
>  
> -    COVERAGE_INC(poll_create_node);
> -
> -    /* Both 'fd' and 'wevent' cannot be set. */
>      ovs_assert(!fd != !wevent);
>  
> +    /* This is mostly uncontended, so the thread should grab it straight away.
> +     * We will reuse it later to introduce threading for IO and SSL
> +     */
> +    ovs_mutex_lock(&loop->loop_mutex);
> +
>      /* Check for duplicate.  If found, "or" the events. */
>      node = find_poll_node(loop, fd, wevent);
> -    if (node) {
> -        node->pollfd.events |= events;
> -    } else {
> -        node = xzalloc(sizeof *node);
> -        hmap_insert(&loop->poll_nodes, &node->hmap_node,
> -                    hash_2words(fd, (uint32_t)wevent));
> -        node->pollfd.fd = fd;
> -        node->pollfd.events = events;
> -#ifdef _WIN32
> -        if (!wevent) {
> -            wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
> +
> +    if (node && node->valid) {
> +#ifdef OVS_USE_EPOLL
> +        int old_event_mask = node->pollfd.events;
> +#endif
> +        /* If there is an existing event mask we do not need to inc - this will be waited upon */
> +        node->pollfd.events |= (events & 0x0000FFFF); /* or without epoll specific bits */
> +
> +#ifdef OVS_USE_EPOLL
> +        /* modify existing epoll entry if there is an epoll specific ask or if the
> +         * mask has changed
> +         */
> +        if ((events & 0xFFFF0000) || (old_event_mask != node->pollfd.events)) {
> +            event.events = node->pollfd.events | events | EPOLLHUP | EPOLLRDHUP;
> +            event.data.ptr = node;
> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
>          }
>  #endif
> +    } else {
> +        if (!node) {
> +            node = xzalloc(sizeof *node);
> +            hmap_insert(&loop->poll_nodes, &node->hmap_node,
> +                        hash_2words(fd, 0));
> +        } else {
> +            /* node marked for reaping, OS has reused the fd number, valid is set to false */
> +#ifdef OVS_USE_EPOLl

This should be "#ifdef OVS_USE_EPOLL"

> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, fd, NULL);
> +#endif
> +        }
> +        node->pollfd.fd = fd;
> +        node->pollfd.events = (events & 0x0000FFFF);
>          node->wevent = wevent;
>          node->where = where;
> +        node->valid = true;
> +        node->private = private;
> +#ifdef OVS_USE_EPOLL
> +        event.events = node->pollfd.events | EPOLLHUP | EPOLLRDHUP; /* we always listen for fd close */
> +        event.data.ptr = node;
> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
> +#endif
> +    }
> +    if (hint) {
> +        *hint = &node->pollfd;
>      }
> +    ovs_mutex_unlock(&loop->loop_mutex);
> +}
> +
> +void
> +poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where) {
> +    poll_fd_subscribe_at(fd, 0, events, hint, where , true);
> +}
> +
> +/* Deregisters a fd. Note - this looks like a memory leak (deallocating only private fds)
> + * but it is not.
> + * In order to be compatible with existing calling conventions while using fd persistence
> + * where supported we have to keep "legacy" fds around for the duration of the life of
> + * the thread because we have no idea if they have been reaped properly or not.
> + * The reason for this is that for some of them the close() is in a thread different from the
> + * poll loop.
> + * Thus, the only thing we can do in this case is mark them "invalid". Once the OS reuses the
> + * same fd number, we will reuse the existing hash entry.
> + */
> +
> +void
> +poll_fd_deregister_at(int fd, const char *where) {
> +    struct poll_loop *loop = poll_loop();
> +
> +    VLOG(VLL_DBG, "Deregister %d from %s", fd, where);
> +    struct poll_node *node;
> +
> +    ovs_mutex_lock(&loop->loop_mutex);
> +    node = find_poll_node(loop, fd, 0);
> +    if (node) {
> +        if (node->private) {
> +#ifdef OVN_USE_EPOLL

This should be "#ifdef OVS_USE_EPOLL".

> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
> +#endif
> +            hmap_remove(&loop->poll_nodes, &node->hmap_node);
> +        } else {
> +            VLOG(VLL_WARN, "Trying to deregister a non-private %d from %s", fd, where);
> +            node->valid = false;
> +        }
> +    }
> +    ovs_mutex_unlock(&loop->loop_mutex);
> +}
> +
> +void
> +poll_fd_wait_at(int fd, int events, const char *where)
> +{
> +    poll_fd_subscribe_at(fd, 0, events, NULL, where, false);
>  }
>  
> -/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
> - * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
> - * wake up when 'fd' becomes ready for one or more of the requested events.
> - *
> - * On Windows, 'fd' must be a socket.
> - *
> - * The event registration is one-shot: only the following call to poll_block()
> - * is affected.  The event will need to be re-registered after poll_block() is
> - * called if it is to persist.
> - *
> - * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
> - * automatically provide the caller's source file and line number for
> - * 'where'.) */
>  void
> -poll_fd_wait_at(int fd, short int events, const char *where)
> +private_poll_fd_wait_at(int fd, int events, const char *where)
>  {
> -    poll_create_node(fd, 0, events, where);
> +    /* POLLIN persists on "private" fds - either emulated or at epoll
> +     * or other persistence framework level
> +     */
> +    if (events & (~OVS_POLLIN)) {
> +        poll_fd_subscribe_at(fd, 0, events, NULL, where, true);
> +    }
>  }
>  
> +
>  #ifdef _WIN32
>  /* Registers for the next call to poll_block() to wake up when 'wevent' is
>   * signaled.
> @@ -163,7 +245,7 @@ poll_fd_wait_at(int fd, short int events, const char *where)
>  void
>  poll_wevent_wait_at(HANDLE wevent, const char *where)
>  {
> -    poll_create_node(0, wevent, 0, where);
> +    poll_fd_subscribe_at(0, wevent, 0, NULL, where);
>  }
>  #endif /* _WIN32 */
>  
> @@ -277,9 +359,12 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>          if (pollfd->revents & OVS_POLLHUP) {
>              ds_put_cstr(&s, "[OVS_POLLHUP]");
>          }
> +#ifndef OVS_USE_EPOLL
> +        /* epoll does not have NVAL - it uses RDHUP and HUP which we cannot actually get to here*/
>          if (pollfd->revents & OVS_POLLNVAL) {
>              ds_put_cstr(&s, "[OVS_POLLNVAL]");
>          }
> +#endif
>          ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
>          free(description);
>      } else {
> @@ -295,12 +380,17 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>      ds_destroy(&s);
>  }
>  
> +
>  static void
>  free_poll_nodes(struct poll_loop *loop)
>  {
>      struct poll_node *node, *next;
>  
> +    ovs_mutex_lock(&loop->loop_mutex);
>      HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
> +#ifdef OVS_USE_EPOLL
> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
> +#endif
>          hmap_remove(&loop->poll_nodes, &node->hmap_node);
>  #ifdef _WIN32
>          if (node->wevent && node->pollfd.fd) {
> @@ -310,6 +400,7 @@ free_poll_nodes(struct poll_loop *loop)
>  #endif
>          free(node);
>      }
> +    ovs_mutex_unlock(&loop->loop_mutex);
>  }
>  
>  /* Blocks until one or more of the events registered with poll_fd_wait()
> @@ -320,8 +411,13 @@ poll_block(void)
>  {
>      struct poll_loop *loop = poll_loop();
>      struct poll_node *node;
> +#ifndef OVS_USE_EPOLL
>      struct pollfd *pollfds;
> +#endif
> +#ifndef OVS_USE_EPOLL
>      HANDLE *wevents = NULL;
> +    int counter;
> +#endif
>      int elapsed;
>      int retval;
>      int i;
> @@ -335,54 +431,126 @@ poll_block(void)
>      }
>  
>      timewarp_run();
> -    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>  
> +#ifdef OVS_USE_EPOLL
> +    retval = time_epoll_wait(loop->epoll_fd,
> +        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);
> +    if (retval < 0) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_ERR_RL(&rl, "epoll: %s", ovs_strerror(retval));
> +    } else if (!retval) {
> +        log_wakeup(loop->timeout_where, NULL, elapsed);
> +    } else {
> +        ovs_mutex_lock(&loop->loop_mutex);
> +        if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
> +            for (i = 0; i < retval; i++) {
> +                node = (struct poll_node *) loop->epoll_events[i].data.ptr;
> +                if (loop->epoll_events[i].events) {
> +                    node->pollfd.revents = loop->epoll_events[i].events;
> +                    log_wakeup(node->where, &node->pollfd, 0);
> +                }
> +            }
> +        }
> +        for (i = 0; i < retval; i++) {
> +            node = (struct poll_node *) loop->epoll_events[i].data.ptr;
> +            if (loop->epoll_events[i].events & EPOLLHUP) {
> +                /* File descriptor closed already elsewhere
> +                 * We have to make the assumption that whoever closed it has
> +                 * ensured that anything which refers to IO event hints will not run
> +                 * on this fd after we free it.
> +                 */
> +                node->valid = false;
> +            }
> +            if (loop->epoll_events[i].events) {
> +                node->pollfd.revents |= (loop->epoll_events[i].events & 0x0000FFFF);
> +            }
> +            if (loop->epoll_events[i].events & OVS_POLLOUT) {
> +                struct epoll_event event;
> +                node->pollfd.events = OVS_POLLIN; /* reset back to defaults - write needs one shot */
> +                event.events = node->pollfd.events;
> +                event.data.ptr = node;
> +                epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, node->pollfd.fd, &event);
> +            }
> +        }
> +        ovs_mutex_unlock(&loop->loop_mutex);
> +    }
> +#else
> +    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>  #ifdef _WIN32
>      wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
>  #endif
>  
> +
>      /* Populate with all the fds and events. */
> -    i = 0;
> +    counter = 0;
>      HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
> -        pollfds[i] = node->pollfd;
> +        if ((node->valid) && (node->pollfd.events)) {
> +            pollfds[counter] = node->pollfd;
>  #ifdef _WIN32
> -        wevents[i] = node->wevent;
> -        if (node->pollfd.fd && node->wevent) {
> -            short int wsa_events = 0;
> -            if (node->pollfd.events & OVS_POLLIN) {
> -                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
> +            wevents[counter] = node->wevent;
> +            if (node->pollfd.fd && node->wevent) {
> +                short int wsa_events = 0;
> +                if (node->pollfd.events & OVS_POLLIN) {
> +                    wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
> +                }
> +                if (node->pollfd.events & OVS_POLLOUT) {
> +                    wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
> +                }
> +                WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
>              }
> -            if (node->pollfd.events & OVS_POLLOUT) {
> -                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
> -            }
> -            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
> -        }
>  #endif
> -        i++;
> +            counter++;
> +        }
>      }
>  
> -    retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
> +    retval = time_poll(pollfds, counter, wevents,
>                         loop->timeout_when, &elapsed);
>      if (retval < 0) {
>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>          VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
> -    } else if (!retval) {
> +    } else if (retval == 0) {
>          log_wakeup(loop->timeout_where, NULL, elapsed);
> -    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
> -        i = 0;
> -        HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
> +    } else {
> +        for (i = 0; i < counter; i++) {
>              if (pollfds[i].revents) {
> -                log_wakeup(node->where, &pollfds[i], 0);
> +
> +                node = find_poll_node(loop, pollfds[i].fd, 0);
> +
> +                if (!node) {
> +                    VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);
> +                }
> +                if (pollfds[i].revents & (OVS_POLLHUP | OVS_POLLNVAL)) {
> +                    node->valid = false;
> +                }
> +
> +                if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
> +                    log_wakeup(node->where, &pollfds[i], 0);
> +                }
> +                /* update "requested" events. 
> +                 * Note - "private" fds always want POLLIN - that emulates EPOLL, /dev/poll, etc
> +                 * behaviour which they should be using in real life instead of using poll()
> +                 */
> +                if (node->private) {
> +                    node->pollfd.events &= ~(pollfds[i].revents & (~OVS_POLLIN));
> +                } else {
> +                    node->pollfd.events &= ~pollfds[i].revents;
> +                }
> +                /* update "occurred" events for use by streams and handlers. In case there
> +                 * is an existing (but not consumed yet) event, we OR the events in the
> +                 * stored record with the new ones - it is the job of the stream to clear
> +                 * that.
> +                 */
> +                node->pollfd.revents |= pollfds[i].revents;
>              }
> -            i++;
>          }
>      }
>  
> -    free_poll_nodes(loop);
> +    free(pollfds);
> +    if (wevents)
> +        free(wevents);
> +#endif
>      loop->timeout_when = LLONG_MAX;
>      loop->timeout_where = NULL;
> -    free(pollfds);
> -    free(wevents);
>  
>      /* Handle any pending signals before doing anything else. */
>      fatal_signal_run();
> @@ -416,8 +584,12 @@ poll_loop(void)
>      if (!loop) {
>          loop = xzalloc(sizeof *loop);
>          loop->timeout_when = LLONG_MAX;
> +        ovs_mutex_init(&loop->loop_mutex);
>          hmap_init(&loop->poll_nodes);
>          xpthread_setspecific(key, loop);
> +#ifdef OVS_USE_EPOLL
> +        loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS);
> +#endif
>      }
>      return loop;
>  }
> diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
> index 3dfa80c7f..16d155989 100644
> --- a/lib/route-table-bsd.c
> +++ b/lib/route-table-bsd.c
> @@ -34,6 +34,7 @@
>  #include "ovs-router.h"
>  #include "packets.h"
>  #include "openvswitch/vlog.h"
> +#include "openvswitch/poll-loop.h"
>  #include "util.h"
>  
>  VLOG_DEFINE_THIS_MODULE(route_table_bsd);
> diff --git a/lib/stream-fd.c b/lib/stream-fd.c
> index 62f768d45..6a80d6e05 100644
> --- a/lib/stream-fd.c
> +++ b/lib/stream-fd.c
> @@ -40,6 +40,8 @@ struct stream_fd
>      struct stream stream;
>      int fd;
>      int fd_type;
> +    bool rx_ready, tx_ready;
> +    struct pollfd *hint;
>  };
>  
>  static const struct stream_class stream_fd_class;
> @@ -67,7 +69,14 @@ new_fd_stream(char *name, int fd, int connect_status, int fd_type,
>      stream_init(&s->stream, &stream_fd_class, connect_status, name);
>      s->fd = fd;
>      s->fd_type = fd_type;
> +    s->rx_ready = true;
> +    s->tx_ready = true;
> +    s->hint = NULL;
>      *streamp = &s->stream;
> +    /* Persistent registration - we always get POLLINs from now on,
> +     * POLLOUTs when we ask for them
> +     */
> +    poll_fd_register(s->fd, OVS_POLLIN, &s->hint);
>      return 0;
>  }
>  
> @@ -82,6 +91,8 @@ static void
>  fd_close(struct stream *stream)
>  {
>      struct stream_fd *s = stream_fd_cast(stream);
> +    /* Deregister the FD from any persistent registrations if supported */
> +    poll_fd_deregister(s->fd);
>      closesocket(s->fd);
>      free(s);
>  }
> @@ -104,6 +115,24 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>      ssize_t retval;
>      int error;
>  
> +    if (s->hint) {
> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
> +         * to the read which should return 0 if the HUP is a real one, if not we clear it
> +         * for all other cases we believe what (e)poll has fed us.
> +         */
> +        if ((!(s->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (!s->rx_ready)) {
> +            if (!(s->hint->revents & OVS_POLLIN)) {
> +                return -EAGAIN;
> +            } else {
> +                /* POLLIN event from poll loop, mark us as ready */
> +                s->rx_ready = true;
> +                s->hint->revents &= ~OVS_POLLIN;
> +            }
> +        } else {
> +            s->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
> +        }
> +    }
> +
>      retval = recv(s->fd, buffer, n, 0);
>      if (retval < 0) {
>          error = sock_errno();
> @@ -114,6 +143,8 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>  #endif
>          if (error != EAGAIN) {
>              VLOG_DBG_RL(&rl, "recv: %s", sock_strerror(error));
> +        } else {
> +            s->rx_ready = false;
>          }
>          return -error;
>      }
> @@ -127,9 +158,29 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>      ssize_t retval;
>      int error;
>  
> +    if (s->hint) {
> +        /* poll-loop is providing us with hints for IO */
> +        if (!s->tx_ready) {
> +            if (!(s->hint->revents & OVS_POLLOUT)) {
> +                return -EAGAIN;
> +            } else {
> +                /* POLLOUT event from poll loop, mark us as ready */
> +                s->tx_ready = true;
> +                s->hint->revents &= ~OVS_POLLOUT;
> +            }
> +        }
> +    }
>      retval = send(s->fd, buffer, n, 0);
>      if (retval < 0) {
>          error = sock_errno();
> +#ifdef __linux__
> +        /* Linux will sometimes return ENOBUFS on sockets instead of EAGAIN. Usually seen
> +         *  on unix domain sockets 
> +         */
> +        if (error == ENOBUFS) {
> +           error = EAGAIN;
> +        }
> +#endif
>  #ifdef _WIN32
>          if (error == WSAEWOULDBLOCK) {
>             error = EAGAIN;
> @@ -137,6 +188,8 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>  #endif
>          if (error != EAGAIN) {
>              VLOG_DBG_RL(&rl, "send: %s", sock_strerror(error));
> +        } else {
> +            s->tx_ready = false;
>          }
>          return -error;
>      }
> @@ -150,11 +203,11 @@ fd_wait(struct stream *stream, enum stream_wait_type wait)
>      switch (wait) {
>      case STREAM_CONNECT:
>      case STREAM_SEND:
> -        poll_fd_wait(s->fd, OVS_POLLOUT);
> +        private_poll_fd_wait(s->fd, OVS_POLLOUT);
>          break;
>  
>      case STREAM_RECV:
> -        poll_fd_wait(s->fd, OVS_POLLIN);
> +        private_poll_fd_wait(s->fd, OVS_POLLIN);
>          break;
>  
>      default:
> @@ -223,6 +276,8 @@ new_fd_pstream(char *name, int fd,
>      ps->accept_cb = accept_cb;
>      ps->unlink_path = unlink_path;
>      *pstreamp = &ps->pstream;
> +    /* persistent registration */
> +    poll_fd_register(ps->fd, OVS_POLLIN, NULL);
>      return 0;
>  }
>  
> @@ -230,6 +285,7 @@ static void
>  pfd_close(struct pstream *pstream)
>  {
>      struct fd_pstream *ps = fd_pstream_cast(pstream);
> +    poll_fd_deregister(ps->fd);
>      closesocket(ps->fd);
>      maybe_unlink_and_free(ps->unlink_path);
>      free(ps);
> @@ -271,7 +327,7 @@ static void
>  pfd_wait(struct pstream *pstream)
>  {
>      struct fd_pstream *ps = fd_pstream_cast(pstream);
> -    poll_fd_wait(ps->fd, OVS_POLLIN);
> +    private_poll_fd_wait(ps->fd, OVS_POLLIN);
>  }
>  
>  static const struct pstream_class fd_pstream_class = {
> diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c
> index 3b7f9865e..53ae51c1b 100644
> --- a/lib/stream-ssl.c
> +++ b/lib/stream-ssl.c
> @@ -147,6 +147,7 @@ struct ssl_stream
>      /* A few bytes of header data in case SSL negotiation fails. */
>      uint8_t head[2];
>      short int n_head;
> +    struct pollfd *hint;
>  };
>  
>  /* SSL context created by ssl_init(). */
> @@ -310,6 +311,8 @@ new_ssl_stream(char *name, char *server_name, int fd, enum session_type type,
>          SSL_set_msg_callback_arg(ssl, sslv);
>      }
>  
> +
> +    poll_fd_register(sslv->fd, OVS_POLLIN, &sslv->hint);
>      *streamp = &sslv->stream;
>      free(server_name);
>      return 0;
> @@ -604,6 +607,7 @@ ssl_close(struct stream *stream)
>      ERR_clear_error();
>  
>      SSL_free(sslv->ssl);
> +    poll_fd_deregister(sslv->fd);
>      closesocket(sslv->fd);
>      free(sslv);
>  }
> @@ -697,6 +701,27 @@ ssl_recv(struct stream *stream, void *buffer, size_t n)
>      /* Behavior of zero-byte SSL_read is poorly defined. */
>      ovs_assert(n > 0);
>  
> +     if (sslv->hint) {
> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
> +         * to the read which should return 0 if the HUP is a real one, if not we clear it
> +         * for all other cases we believe what (e)poll has fed us.
> +         */
> +        if ((!(sslv->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (sslv->rx_want == SSL_READING)) {
> +            if (!(sslv->hint->revents & OVS_POLLIN)) {
> +                return -EAGAIN;
> +            } else {
> +                /* POLLIN event from poll loop, mark us as ready 
> +                 * rx_want is cleared further down by reading ssl fsm
> +                 */
> +                sslv->hint->revents &= ~OVS_POLLIN;
> +            }
> +        } else {
> +            sslv->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
> +        }
> +    }
> +
> +
> +
>      old_state = SSL_get_state(sslv->ssl);
>      ret = SSL_read(sslv->ssl, buffer, n);
>      if (old_state != SSL_get_state(sslv->ssl)) {
> @@ -729,6 +754,19 @@ ssl_do_tx(struct stream *stream)
>  {
>      struct ssl_stream *sslv = ssl_stream_cast(stream);
>  
> +     if (sslv->hint) {
> +        /* poll-loop is providing us with hints for IO */
> +        if (sslv->tx_want == SSL_WRITING) {
> +            if (!(sslv->hint->revents & OVS_POLLOUT)) {
> +                return EAGAIN;
> +            } else {
> +                /* POLLIN event from poll loop, mark us as ready 
> +                 * rx_want is cleared further down by reading ssl fsm
> +                 */
> +                sslv->hint->revents &= ~OVS_POLLOUT;
> +            }
> +        }
> +    }
>      for (;;) {
>          int old_state = SSL_get_state(sslv->ssl);
>          int ret = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size);
> @@ -771,6 +809,8 @@ ssl_send(struct stream *stream, const void *buffer, size_t n)
>              ssl_clear_txbuf(sslv);
>              return n;
>          case EAGAIN:
> +            /* we want to know when this fd will become available again */
> +            stream_send_wait(stream);
>              return n;
>          default:
>              ssl_clear_txbuf(sslv);
> @@ -795,7 +835,7 @@ ssl_run_wait(struct stream *stream)
>      struct ssl_stream *sslv = ssl_stream_cast(stream);
>  
>      if (sslv->tx_want != SSL_NOTHING) {
> -        poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
> +        private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
>      }
>  }
>  
> @@ -811,13 +851,13 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>          } else {
>              switch (sslv->state) {
>              case STATE_TCP_CONNECTING:
> -                poll_fd_wait(sslv->fd, OVS_POLLOUT);
> +                private_poll_fd_wait(sslv->fd, OVS_POLLOUT);
>                  break;
>  
>              case STATE_SSL_CONNECTING:
>                  /* ssl_connect() called SSL_accept() or SSL_connect(), which
>                   * set up the status that we test here. */
> -                poll_fd_wait(sslv->fd,
> +                private_poll_fd_wait(sslv->fd,
>                                 want_to_poll_events(SSL_want(sslv->ssl)));
>                  break;
>  
> @@ -829,7 +869,7 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>  
>      case STREAM_RECV:
>          if (sslv->rx_want != SSL_NOTHING) {
> -            poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
> +            private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
>          } else {
>              poll_immediate_wake();
>          }
> @@ -911,6 +951,7 @@ pssl_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
>                   ds_steal_cstr(&bound_name));
>      pstream_set_bound_port(&pssl->pstream, htons(port));
>      pssl->fd = fd;
> +    poll_fd_register(fd, OVS_POLLIN, NULL);
>      *pstreamp = &pssl->pstream;
>  
>      return 0;
> @@ -920,6 +961,7 @@ static void
>  pssl_close(struct pstream *pstream)
>  {
>      struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
> +    poll_fd_deregister(pssl->fd);
>      closesocket(pssl->fd);
>      free(pssl);
>  }
> diff --git a/lib/timeval.c b/lib/timeval.c
> index 193c7bab1..59a12414f 100644
> --- a/lib/timeval.c
> +++ b/lib/timeval.c
> @@ -38,6 +38,7 @@
>  #include "unixctl.h"
>  #include "util.h"
>  #include "openvswitch/vlog.h"
> +#include "openvswitch/poll-loop.h"
>  
>  VLOG_DEFINE_THIS_MODULE(timeval);
>  
> @@ -369,6 +370,88 @@ time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED,
>      return retval;
>  }
>  
> +#ifdef OVS_USE_EPOLL
> +
> +/* Like epoll_wait(), except:
> + *
> + *      - The timeout is specified as an absolute time, as defined by
> + *        time_msec(), instead of a duration.
> + *
> + *      - On error, returns a negative error code (instead of setting errno).
> + *
> + *      - If interrupted by a signal, retries automatically until the original
> + *        timeout is reached.  (Because of this property, this function will
> + *        never return -EINTR.)
> + *
> + * Stores the number of milliseconds elapsed during poll in '*elapsed'. */
> +int
> +time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
> +          long long int timeout_when, int *elapsed)
> +{
> +    long long int *last_wakeup = last_wakeup_get();
> +    long long int start;
> +    bool quiescent;
> +    int retval = 0;
> +
> +    time_init();
> +    coverage_clear();
> +    coverage_run();
> +    if (*last_wakeup && !thread_is_pmd()) {
> +        log_poll_interval(*last_wakeup);
> +    }
> +    start = time_msec();
> +
> +    timeout_when = MIN(timeout_when, deadline);
> +    quiescent = ovsrcu_is_quiescent();
> +
> +    for (;;) {
> +        long long int now = time_msec();
> +        int time_left;
> +
> +        if (now >= timeout_when) {
> +            time_left = 0;
> +        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
> +            time_left = INT_MAX;
> +        } else {
> +            time_left = timeout_when - now;
> +        }
> +
> +        if (!quiescent) {
> +            if (!time_left) {
> +                ovsrcu_quiesce();
> +            } else {
> +                ovsrcu_quiesce_start();
> +            }
> +        }
> +
> +        retval = epoll_wait(epoll_fd, events, max, time_left);
> +        if (retval < 0) {
> +            retval = -errno;
> +        }
> +
> +        if (!quiescent && time_left) {
> +            ovsrcu_quiesce_end();
> +        }
> +
> +        if (deadline <= time_msec()) {
> +            fatal_signal_handler(SIGALRM);
> +            if (retval < 0) {
> +                retval = 0;
> +            }
> +            break;
> +        }
> +
> +        if (retval != -EINTR) {
> +            break;
> +        }
> +    }
> +    *last_wakeup = time_msec();
> +    refresh_rusage();
> +    *elapsed = *last_wakeup - start;
> +    return retval;
> +}
> +#endif
> +
>  long long int
>  timespec_to_msec(const struct timespec *ts)
>  {
> diff --git a/lib/timeval.h b/lib/timeval.h
> index 502f703d4..347a09d63 100644
> --- a/lib/timeval.h
> +++ b/lib/timeval.h
> @@ -20,6 +20,9 @@
>  #include <time.h>
>  #include "openvswitch/type-props.h"
>  #include "util.h"
> +#ifdef __linux__
> +#include <sys/epoll.h>
> +#endif
>  
>  #ifdef  __cplusplus
>  extern "C" {
> @@ -61,6 +64,10 @@ void time_wall_timespec(struct timespec *);
>  void time_alarm(unsigned int secs);
>  int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
>                long long int timeout_when, int *elapsed);
> +#ifdef __linux__
> +int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
> +          long long int timeout_when, int *elapsed);
> +#endif
>  
>  long long int timespec_to_msec(const struct timespec *);
>  long long int timespec_to_usec(const struct timespec *);
>
Dumitru Ceara Feb. 17, 2020, 3:35 p.m. UTC | #4
On 2/17/20 3:48 PM, Dumitru Ceara wrote:
> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>> These are waited upon in the same thread where they are created. This
>> allows them to be registered persistently with the OS (if possible)
>> as well as the OS to provide hints - is the FD ready, is it closed,
>> etc.
>>
>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>> which is not ready if that fd has been registered as "private" to the
>> thread which waits upon it.
>>
>> 3. No longer breaks other parts of OVS which create the fd in one
>> thread and waits upon it in others.
>>
>> 4. Adds support for EPOLL on Linux and can be expanded to cover similar
>> poll++ frameworks in other OSes.
>>
>> 5. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>> using a "central (e)poll dispatcher + IO threads" pattern
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> Hi Anton,
> 
> A couple of issues inline. Except for that:
> 
> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
> failing with your patches applied:
> 
> make check TESTSUITEFLAGS='-k "flush the fdb"'
> 
> 2. Travis CI build fails:
> 
> lib/fatal-signal.c:244:5: error: ignoring return value of ‘read’,
> declared with attribute warn_unused_result [-Werror=unused-result]
> 
>      read(signal_fds[0], sigbuffer, sizeof(sigbuffer));
> 
> 3. Travis CI OSX build fails:
> 
> lib/poll-loop.c:46:1: error: unused function 'poll_create_node_add'
> [-Werror,-Wunused-function]
> 
> COVERAGE_DEFINE(poll_create_node);
> 
> 4. While OVS might benefit from these changes I'm wondering about OVN
> and ovsdb-server specifically. ovsdb-server is single threaded and
> usually on large scale deployments we don't really see "poll" as the
> bottleneck or even the fact that code tries to read/write from FDs when
> FDs are not available for read/write.
> 
> For example, here are results of running a scale test scenario which
> repeats the following iteration 300 times:
> - bring up a node (ovn-fake-multinode container) and connect it to the
> OVN Southbound DB.
> - configure an OVN logical switch to be bound to the new node.
> - configure an OVN logical switch port on the new logical switch.
> - configure an OVS internal interface on the new node and bind it to the
> OVN logical switch port.
> - wait until the new internal interface can ping its default gateway
> through OVN (i.e., until ovn-controller on the node received all updates
> from the SB DB and installed all OVS flows), highlighted in the output.
> 
> The tests use rally-ovs (ovn-scale-test) on a 9 server setup (1 machine
> running OVN ovsdb-servers and ovn-northd and 8 machines simulating
> chassis using ovn-fake-multinode), in particular this modified scenario:
> https://github.com/dceara/ovn-scale-test/blob/ovn-switch-per-node/samples/tasks/scenarios/ovn-network/osh_workload_incremental.json
> 

Sorry, I just realized the pastebins were not public. Here's another try:

> With OVS master and OVN master:
> http://pastebin.test.redhat.com/836568

https://pastebin.com/P2Z7QtKx

> 
> With OVS master + your patches and OVN master:
> http://pastebin.test.redhat.com/836571

https://pastebin.com/Xq31a9az

> 
> Here are some of the logs we get on the OVN Southbound DB ovsdb-server
> that show that ovsdb-server spends up to 2 seconds in a single loop
> iteration sending/receiving updates to/from ovn-controllers:
> 
> 2020-02-17T10:43:41.175Z|01991|poll_loop|INFO|wakeup due to [OVS_POLLIN]
> on fd 140 (192.16.0.1:6642<->192.16.0.120:52018) at lib/stream-fd.c:79
> (84% CPU usage)
> 2020-02-17T10:43:43.338Z|01992|timeval|WARN|Unreasonably long 2163ms
> poll interval (2144ms user, 9ms system)
> 2020-02-17T10:43:43.339Z|01993|timeval|WARN|faults: 590 minor, 0 major
> 2020-02-17T10:43:43.339Z|01994|timeval|WARN|disk: 0 reads, 8 writes
> 2020-02-17T10:43:43.339Z|01995|timeval|WARN|context switches: 0
> voluntary, 4 involuntary
> 2020-02-17T10:43:43.339Z|01996|poll_loop|INFO|Dropped 63 log messages in
> last 2 seconds (most recently, 2 seconds ago) due to excessive rate
> 2020-02-17T10:43:43.339Z|01997|poll_loop|INFO|wakeup due to [OVS_POLLIN]
> on fd 76 (192.16.0.1:6642<->192.16.0.56:33538) at lib/stream-fd.c:79
> (84% CPU usage)
> 2020-02-17T10:43:45.495Z|01998|timeval|WARN|Unreasonably long 2156ms
> poll interval (2129ms user, 17ms system)
> 2020-02-17T10:43:45.495Z|01999|timeval|WARN|faults: 738 minor, 0 major
> 2020-02-17T10:43:45.495Z|02000|timeval|WARN|context switches: 0
> voluntary, 7 involuntary
> 2020-02-17T10:43:47.651Z|02001|timeval|WARN|Unreasonably long 2157ms
> poll interval (2136ms user, 10ms system)
> 
> In this case, and I think in most OVN use cases, ovsdb-server is busy
> because it actually has to send updates to large numbers of
> ovn-controllers connected to it. Unless I'm missing something the epoll
> change seems to improve performance only in cases where the Southbound
> DB doesn't do much sending/receiving. How do you test
> performance/scalability improvements?
> 
> Regards,
> Dumitru
> 
>> ---
>>  include/openvswitch/poll-loop.h |  56 +++++-
>>  lib/dpif-netlink.c              |   6 +-
>>  lib/fatal-signal.c              |   7 +-
>>  lib/latch-unix.c                |   3 +-
>>  lib/netdev-afxdp.c              |   2 +-
>>  lib/poll-loop.c                 | 320 ++++++++++++++++++++++++--------
>>  lib/route-table-bsd.c           |   1 +
>>  lib/stream-fd.c                 |  62 ++++++-
>>  lib/stream-ssl.c                |  50 ++++-
>>  lib/timeval.c                   |  83 +++++++++
>>  lib/timeval.h                   |   7 +
>>  11 files changed, 508 insertions(+), 89 deletions(-)
>>
>> diff --git a/include/openvswitch/poll-loop.h b/include/openvswitch/poll-loop.h
>> index 532d9caa6..6d0331f6d 100644
>> --- a/include/openvswitch/poll-loop.h
>> +++ b/include/openvswitch/poll-loop.h
>> @@ -41,11 +41,30 @@
>>  #include <windows.h>
>>  #endif
>>  
>> +#ifdef __linux__
>> +#define OVS_USE_EPOLL
>> +#endif
>> +
>> +#ifdef OVS_USE_EPOLL
>> +#include <sys/epoll.h>
>> +
>> +#define OVS_POLLIN EPOLLIN
>> +#define OVS_POLLOUT EPOLLOUT
>> +#define OVS_POLLERR EPOLLERR
>> +#define OVS_POLLHUP EPOLLHUP
>> +#define OVS_ONESHOT EPOLLONESHOT
>> +#define OVS_POLLNVAL 0
>> +
>> +#else
>> +
>>  #define OVS_POLLIN POLLIN
>>  #define OVS_POLLOUT POLLOUT
>>  #define OVS_POLLERR POLLERR
>>  #define OVS_POLLNVAL POLLNVAL
>>  #define OVS_POLLHUP POLLHUP
>> +#define OVS_ONESHOT (1U << 30)
>> +
>> +#endif 
>>  
>>  #ifdef  __cplusplus
>>  extern "C" {
>> @@ -60,10 +79,43 @@ extern "C" {
>>   * the source code location of the caller.  The function version allows the
>>   * caller to supply a location explicitly, which is useful if the caller's own
>>   * caller would be more useful in log output.  See timer_wait_at() for an
>> - * example. */
>> -void poll_fd_wait_at(int fd, short int events, const char *where);
>> + * example.
>> + * Note - using on fds registered using poll_fd_register() will generate a
>> + * warning as this is not an intended use.
>> + */
>> +void poll_fd_wait_at(int fd, int events, const char *where);
>>  #define poll_fd_wait(fd, events) poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
>>  
>> +/* Register a fd with a persistence framework if available so it can be served
>> + * "faster" and the caller can be provided with "hints" on what caused the IO
>> + * event.
>> + * If the "hint" argument is supplied it set to point to the pollfd structure
>> + * containing the events passed by the OS in .revents. 
>> + * Note - as the frameworks are OS dependent, the events are limited to what
>> + * can be passed in a .revents which is a short int.
>> + * Limitations - MUST BE registered from the same thread as the one where 
>> + * it will be waited upon.
>> + */
>> +
>> +void poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where);
>> +#define poll_fd_register(fd, events, hint) poll_fd_register_at(fd, events, hint, OVS_SOURCE_LOCATOR)
>> +
>> +/* De-register a fd which was registered as "private" with the persistence
>> + * framework
>> + */
>> +
>> +void poll_fd_deregister_at(int fd, const char *where);
>> +#define poll_fd_deregister(fd) poll_fd_deregister_at(fd, OVS_SOURCE_LOCATOR)
>> +
>> +/* Schedule events to wake up the following poll_block() - "private fds"
>> + * Same as poll_fd_wait, but for fds which have been registered and are
>> + * expected to persist. If a "fast" OS fd notification framework is used
>> + * this version of wait may be a NOOP (e.g. for (E)POLLIN events).
>> + */
>> +void private_poll_fd_wait_at(int fd, int events, const char *where);
>> +#define private_poll_fd_wait(fd, events) private_poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
>> +
>> +
>>  #ifdef _WIN32
>>  void poll_wevent_wait_at(HANDLE wevent, const char *where);
>>  #define poll_wevent_wait(wevent) poll_wevent_wait_at(wevent, OVS_SOURCE_LOCATOR)
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 5b5c96d72..ad5db9452 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -1289,7 +1289,7 @@ dpif_netlink_port_poll_wait(const struct dpif *dpif_)
>>      const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>  
>>      if (dpif->port_notifier) {
>> -        nl_sock_wait(dpif->port_notifier, POLLIN);
>> +        nl_sock_wait(dpif->port_notifier, OVS_POLLIN);
>>      } else {
>>          poll_immediate_wake();
>>      }
>> @@ -2756,13 +2756,13 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
>>      }
>>  
>>      for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
>> -        nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
>> +        nl_sock_wait(sock_pool[i].nl_sock, OVS_POLLIN);
>>      }
>>  #else
>>      if (dpif->handlers && handler_id < dpif->n_handlers) {
>>          struct dpif_handler *handler = &dpif->handlers[handler_id];
>>  
>> -        poll_fd_wait(handler->epoll_fd, POLLIN);
>> +        poll_fd_wait(handler->epoll_fd, OVS_POLLIN);
>>      }
>>  #endif
>>  }
>> diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
>> index 97d8d1dab..424636e07 100644
>> --- a/lib/fatal-signal.c
>> +++ b/lib/fatal-signal.c
>> @@ -96,6 +96,7 @@ fatal_signal_init(void)
>>          ovs_mutex_init_recursive(&mutex);
>>  #ifndef _WIN32
>>          xpipe_nonblocking(signal_fds);
>> +        poll_fd_register(signal_fds[0], OVS_POLLIN, NULL);
>>  #else
>>          wevent = CreateEvent(NULL, TRUE, FALSE, NULL);
>>          if (!wevent) {
>> @@ -236,9 +237,12 @@ void
>>  fatal_signal_run(void)
>>  {
>>      sig_atomic_t sig_nr;
>> +    char sigbuffer[_POSIX_PIPE_BUF];
>>  
>>      fatal_signal_init();
>>  
>> +    read(signal_fds[0], sigbuffer, sizeof(sigbuffer));
>> +
>>      sig_nr = stored_sig_nr;
>>      if (sig_nr != SIG_ATOMIC_MAX) {
>>          char namebuf[SIGNAL_NAME_BUFSIZE];
>> @@ -271,7 +275,8 @@ fatal_signal_wait(void)
>>  #ifdef _WIN32
>>      poll_wevent_wait(wevent);
>>  #else
>> -    poll_fd_wait(signal_fds[0], OVS_POLLIN);
>> +    /* a noop - schedule for removal */
>> +    private_poll_fd_wait(signal_fds[0], OVS_POLLIN);
>>  #endif
>>  }
>>  
>> diff --git a/lib/latch-unix.c b/lib/latch-unix.c
>> index fea61ab28..5f15b59fe 100644
>> --- a/lib/latch-unix.c
>> +++ b/lib/latch-unix.c
>> @@ -83,5 +83,6 @@ latch_is_set(const struct latch *latch)
>>  void
>>  latch_wait_at(const struct latch *latch, const char *where)
>>  {
>> -    poll_fd_wait_at(latch->fds[0], OVS_POLLIN, where);
>> +    /* Ask for wait and make it one-shot if persistence is in play */
>> +    poll_fd_wait_at(latch->fds[0], OVS_POLLIN | OVS_ONESHOT, where);
>>  }
>> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
>> index ef367e5ea..482400d8d 100644
>> --- a/lib/netdev-afxdp.c
>> +++ b/lib/netdev-afxdp.c
>> @@ -184,7 +184,7 @@ xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
>>  
>>      if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
>>          pfd.fd = fd;
>> -        pfd.events = OVS_POLLIN;
>> +        pfd.events = POLLIN;
>>  
>>          ret = poll(&pfd, 1, 0);
>>          if (OVS_UNLIKELY(ret < 0)) {
>> diff --git a/lib/poll-loop.c b/lib/poll-loop.c
>> index 3902d6c1f..10a5b0c01 100644
>> --- a/lib/poll-loop.c
>> +++ b/lib/poll-loop.c
>> @@ -18,6 +18,12 @@
>>  #include "openvswitch/poll-loop.h"
>>  #include <errno.h>
>>  #include <inttypes.h>
>> +#ifdef OVS_USE_EPOLL
>> +#include <sys/epoll.h>
>> +#endif
>> +#ifndef _WIN32
>> +#include <unistd.h>
>> +#endif
>>  #include <poll.h>
>>  #include <stdlib.h>
>>  #include <string.h>
>> @@ -31,7 +37,9 @@
>>  #include "timeval.h"
>>  #include "openvswitch/vlog.h"
>>  #include "openvswitch/hmap.h"
>> +#include "openvswitch/list.h"
>>  #include "hash.h"
>> +#include "ovs-atomic.h"
>>  
>>  VLOG_DEFINE_THIS_MODULE(poll_loop);
>>  
>> @@ -43,21 +51,32 @@ struct poll_node {
>>      struct pollfd pollfd;       /* Events to pass to time_poll(). */
>>      HANDLE wevent;              /* Events for WaitForMultipleObjects(). */
>>      const char *where;          /* Where poll_node was created. */
>> +    bool valid;                 /* Can it be used? */
>> +    bool private;               /* Can we assume that it is only in this thread poll loop? */
>>  };
>>  
>> +#define MAX_EPOLL_EVENTS 64
>> +
>>  struct poll_loop {
>> -    /* All active poll waiters. */
>> +    /* List of all poll loops in the system */
>> +    struct ovs_mutex loop_mutex;
>> +    /* All poll waiters for this poll loop */
>>      struct hmap poll_nodes;
>>  
>>      /* Time at which to wake up the next call to poll_block(), LLONG_MIN to
>>       * wake up immediately, or LLONG_MAX to wait forever. */
>>      long long int timeout_when; /* In msecs as returned by time_msec(). */
>>      const char *timeout_where;  /* Where 'timeout_when' was set. */
>> +#ifdef OVS_USE_EPOLL
>> +    int epoll_fd;
>> +    struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
>> +#endif
>>  };
>>  
>> +
>>  static struct poll_loop *poll_loop(void);
>>  
>> -/* Look up the node with same fd or wevent. */
>> +/* Look up the node with same fd or wevent - should be accessed under &loop->mutex. */
>>  static struct poll_node *
>>  find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>>  {
>> @@ -76,79 +95,142 @@ find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>>      }
>>      return NULL;
>>  }
>> -
>> -/* On Unix based systems:
>> - *
>> - *     Registers 'fd' as waiting for the specified 'events' (which should be
>> - *     OVS_POLLIN or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to
>> - *     poll_block() will wake up when 'fd' becomes ready for one or more of the
>> - *     requested events. The 'fd's are given to poll() function later.
>> - *
>> - * On Windows system:
>> +/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
>> + * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
>> + * wake up when 'fd' becomes ready for one or more of the requested events.
>>   *
>> - *     If 'fd' is specified, create a new 'wevent'. Association of 'fd' and
>> - *     'wevent' for 'events' happens in poll_block(). If 'wevent' is specified,
>> - *     it is assumed that it is unrelated to any sockets and poll_block()
>> - *     will wake up on any event on that 'wevent'. It is an error to pass
>> - *     both 'wevent' and 'fd'.
>> + * The event registration is PERSISTENT. This is intended for OSes which have a persistent
>> + * event framework. For now it is implemented only for epoll and Linux, other
>> + * implementations such as BSD kqueue and Solaris /dev/poll may follow.
>>   *
>> - * The event registration is one-shot: only the following call to
>> - * poll_block() is affected.  The event will need to be re-registered after
>> - * poll_block() is called if it is to persist.
>> + * If the OS has no persistent event framework, this does nothing
>>   *
>>   * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
>>   * automatically provide the caller's source file and line number for
>>   * 'where'.) */
>> +
>>  static void
>> -poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
>> +poll_fd_subscribe_at(int fd, HANDLE wevent, int events, struct pollfd **hint, const char *where, bool private)
>>  {
>>      struct poll_loop *loop = poll_loop();
>>      struct poll_node *node;
>> +#ifdef OVS_USE_EPOLL
>> +    struct epoll_event event;
>> +#endif
>>  
>> -    COVERAGE_INC(poll_create_node);
>> -
>> -    /* Both 'fd' and 'wevent' cannot be set. */
>>      ovs_assert(!fd != !wevent);
>>  
>> +    /* This is mostly uncontended, so the thread should grab it straight away.
>> +     * We will reuse it later to introduce threading for IO and SSL
>> +     */
>> +    ovs_mutex_lock(&loop->loop_mutex);
>> +
>>      /* Check for duplicate.  If found, "or" the events. */
>>      node = find_poll_node(loop, fd, wevent);
>> -    if (node) {
>> -        node->pollfd.events |= events;
>> -    } else {
>> -        node = xzalloc(sizeof *node);
>> -        hmap_insert(&loop->poll_nodes, &node->hmap_node,
>> -                    hash_2words(fd, (uint32_t)wevent));
>> -        node->pollfd.fd = fd;
>> -        node->pollfd.events = events;
>> -#ifdef _WIN32
>> -        if (!wevent) {
>> -            wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
>> +
>> +    if (node && node->valid) {
>> +#ifdef OVS_USE_EPOLL
>> +        int old_event_mask = node->pollfd.events;
>> +#endif
>> +        /* If there is an existing event mask we do not need to inc - this will be waited upon */
>> +        node->pollfd.events |= (events & 0x0000FFFF); /* or without epoll specific bits */
>> +
>> +#ifdef OVS_USE_EPOLL
>> +        /* modify existing epoll entry if there is an epoll specific ask or if the
>> +         * mask has changed
>> +         */
>> +        if ((events & 0xFFFF0000) || (old_event_mask != node->pollfd.events)) {
>> +            event.events = node->pollfd.events | events | EPOLLHUP | EPOLLRDHUP;
>> +            event.data.ptr = node;
>> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
>>          }
>>  #endif
>> +    } else {
>> +        if (!node) {
>> +            node = xzalloc(sizeof *node);
>> +            hmap_insert(&loop->poll_nodes, &node->hmap_node,
>> +                        hash_2words(fd, 0));
>> +        } else {
>> +            /* node marked for reaping, OS has reused the fd number, valid is set to false */
>> +#ifdef OVS_USE_EPOLl
> 
> This should be "#ifdef OVS_USE_EPOLL"
> 
>> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, fd, NULL);
>> +#endif
>> +        }
>> +        node->pollfd.fd = fd;
>> +        node->pollfd.events = (events & 0x0000FFFF);
>>          node->wevent = wevent;
>>          node->where = where;
>> +        node->valid = true;
>> +        node->private = private;
>> +#ifdef OVS_USE_EPOLL
>> +        event.events = node->pollfd.events | EPOLLHUP | EPOLLRDHUP; /* we always listen for fd close */
>> +        event.data.ptr = node;
>> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
>> +#endif
>> +    }
>> +    if (hint) {
>> +        *hint = &node->pollfd;
>>      }
>> +    ovs_mutex_unlock(&loop->loop_mutex);
>> +}
>> +
>> +void
>> +poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where) {
>> +    poll_fd_subscribe_at(fd, 0, events, hint, where , true);
>> +}
>> +
>> +/* Deregisters a fd. Note - this looks like a memory leak (deallocating only private fds)
>> + * but it is not.
>> + * In order to be compatible with existing calling conventions while using fd persistence
>> + * where supported we have to keep "legacy" fds around for the duration of the life of
>> + * the thread because we have no idea if they have been reaped properly or not.
>> + * The reason for this is that for some of them the close() is in a thread different from the
>> + * poll loop.
>> + * Thus, the only thing we can do in this case is mark them "invalid". Once the OS reuses the
>> + * same fd number, we will reuse the existing hash entry.
>> + */
>> +
>> +void
>> +poll_fd_deregister_at(int fd, const char *where) {
>> +    struct poll_loop *loop = poll_loop();
>> +
>> +    VLOG(VLL_DBG, "Deregister %d from %s", fd, where);
>> +    struct poll_node *node;
>> +
>> +    ovs_mutex_lock(&loop->loop_mutex);
>> +    node = find_poll_node(loop, fd, 0);
>> +    if (node) {
>> +        if (node->private) {
>> +#ifdef OVN_USE_EPOLL
> 
> This should be "#ifdef OVS_USE_EPOLL".
> 
>> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
>> +#endif
>> +            hmap_remove(&loop->poll_nodes, &node->hmap_node);
>> +        } else {
>> +            VLOG(VLL_WARN, "Trying to deregister a non-private %d from %s", fd, where);
>> +            node->valid = false;
>> +        }
>> +    }
>> +    ovs_mutex_unlock(&loop->loop_mutex);
>> +}
>> +
>> +void
>> +poll_fd_wait_at(int fd, int events, const char *where)
>> +{
>> +    poll_fd_subscribe_at(fd, 0, events, NULL, where, false);
>>  }
>>  
>> -/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
>> - * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
>> - * wake up when 'fd' becomes ready for one or more of the requested events.
>> - *
>> - * On Windows, 'fd' must be a socket.
>> - *
>> - * The event registration is one-shot: only the following call to poll_block()
>> - * is affected.  The event will need to be re-registered after poll_block() is
>> - * called if it is to persist.
>> - *
>> - * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
>> - * automatically provide the caller's source file and line number for
>> - * 'where'.) */
>>  void
>> -poll_fd_wait_at(int fd, short int events, const char *where)
>> +private_poll_fd_wait_at(int fd, int events, const char *where)
>>  {
>> -    poll_create_node(fd, 0, events, where);
>> +    /* POLLIN persists on "private" fds - either emulated or at epoll
>> +     * or other persistence framework level
>> +     */
>> +    if (events & (~OVS_POLLIN)) {
>> +        poll_fd_subscribe_at(fd, 0, events, NULL, where, true);
>> +    }
>>  }
>>  
>> +
>>  #ifdef _WIN32
>>  /* Registers for the next call to poll_block() to wake up when 'wevent' is
>>   * signaled.
>> @@ -163,7 +245,7 @@ poll_fd_wait_at(int fd, short int events, const char *where)
>>  void
>>  poll_wevent_wait_at(HANDLE wevent, const char *where)
>>  {
>> -    poll_create_node(0, wevent, 0, where);
>> +    poll_fd_subscribe_at(0, wevent, 0, NULL, where);
>>  }
>>  #endif /* _WIN32 */
>>  
>> @@ -277,9 +359,12 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>>          if (pollfd->revents & OVS_POLLHUP) {
>>              ds_put_cstr(&s, "[OVS_POLLHUP]");
>>          }
>> +#ifndef OVS_USE_EPOLL
>> +        /* epoll does not have NVAL - it uses RDHUP and HUP which we cannot actually get to here*/
>>          if (pollfd->revents & OVS_POLLNVAL) {
>>              ds_put_cstr(&s, "[OVS_POLLNVAL]");
>>          }
>> +#endif
>>          ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
>>          free(description);
>>      } else {
>> @@ -295,12 +380,17 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>>      ds_destroy(&s);
>>  }
>>  
>> +
>>  static void
>>  free_poll_nodes(struct poll_loop *loop)
>>  {
>>      struct poll_node *node, *next;
>>  
>> +    ovs_mutex_lock(&loop->loop_mutex);
>>      HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
>> +#ifdef OVS_USE_EPOLL
>> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
>> +#endif
>>          hmap_remove(&loop->poll_nodes, &node->hmap_node);
>>  #ifdef _WIN32
>>          if (node->wevent && node->pollfd.fd) {
>> @@ -310,6 +400,7 @@ free_poll_nodes(struct poll_loop *loop)
>>  #endif
>>          free(node);
>>      }
>> +    ovs_mutex_unlock(&loop->loop_mutex);
>>  }
>>  
>>  /* Blocks until one or more of the events registered with poll_fd_wait()
>> @@ -320,8 +411,13 @@ poll_block(void)
>>  {
>>      struct poll_loop *loop = poll_loop();
>>      struct poll_node *node;
>> +#ifndef OVS_USE_EPOLL
>>      struct pollfd *pollfds;
>> +#endif
>> +#ifndef OVS_USE_EPOLL
>>      HANDLE *wevents = NULL;
>> +    int counter;
>> +#endif
>>      int elapsed;
>>      int retval;
>>      int i;
>> @@ -335,54 +431,126 @@ poll_block(void)
>>      }
>>  
>>      timewarp_run();
>> -    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>>  
>> +#ifdef OVS_USE_EPOLL
>> +    retval = time_epoll_wait(loop->epoll_fd,
>> +        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);
>> +    if (retval < 0) {
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>> +        VLOG_ERR_RL(&rl, "epoll: %s", ovs_strerror(retval));
>> +    } else if (!retval) {
>> +        log_wakeup(loop->timeout_where, NULL, elapsed);
>> +    } else {
>> +        ovs_mutex_lock(&loop->loop_mutex);
>> +        if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
>> +            for (i = 0; i < retval; i++) {
>> +                node = (struct poll_node *) loop->epoll_events[i].data.ptr;
>> +                if (loop->epoll_events[i].events) {
>> +                    node->pollfd.revents = loop->epoll_events[i].events;
>> +                    log_wakeup(node->where, &node->pollfd, 0);
>> +                }
>> +            }
>> +        }
>> +        for (i = 0; i < retval; i++) {
>> +            node = (struct poll_node *) loop->epoll_events[i].data.ptr;
>> +            if (loop->epoll_events[i].events & EPOLLHUP) {
>> +                /* File descriptor closed already elsewhere
>> +                 * We have to make the assumption that whoever closed it has
>> +                 * ensured that anything which refers to IO event hints will not run
>> +                 * on this fd after we free it.
>> +                 */
>> +                node->valid = false;
>> +            }
>> +            if (loop->epoll_events[i].events) {
>> +                node->pollfd.revents |= (loop->epoll_events[i].events & 0x0000FFFF);
>> +            }
>> +            if (loop->epoll_events[i].events & OVS_POLLOUT) {
>> +                struct epoll_event event;
>> +                node->pollfd.events = OVS_POLLIN; /* reset back to defaults - write needs one shot */
>> +                event.events = node->pollfd.events;
>> +                event.data.ptr = node;
>> +                epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, node->pollfd.fd, &event);
>> +            }
>> +        }
>> +        ovs_mutex_unlock(&loop->loop_mutex);
>> +    }
>> +#else
>> +    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>>  #ifdef _WIN32
>>      wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
>>  #endif
>>  
>> +
>>      /* Populate with all the fds and events. */
>> -    i = 0;
>> +    counter = 0;
>>      HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
>> -        pollfds[i] = node->pollfd;
>> +        if ((node->valid) && (node->pollfd.events)) {
>> +            pollfds[counter] = node->pollfd;
>>  #ifdef _WIN32
>> -        wevents[i] = node->wevent;
>> -        if (node->pollfd.fd && node->wevent) {
>> -            short int wsa_events = 0;
>> -            if (node->pollfd.events & OVS_POLLIN) {
>> -                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
>> +            wevents[counter] = node->wevent;
>> +            if (node->pollfd.fd && node->wevent) {
>> +                short int wsa_events = 0;
>> +                if (node->pollfd.events & OVS_POLLIN) {
>> +                    wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
>> +                }
>> +                if (node->pollfd.events & OVS_POLLOUT) {
>> +                    wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
>> +                }
>> +                WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
>>              }
>> -            if (node->pollfd.events & OVS_POLLOUT) {
>> -                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
>> -            }
>> -            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
>> -        }
>>  #endif
>> -        i++;
>> +            counter++;
>> +        }
>>      }
>>  
>> -    retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
>> +    retval = time_poll(pollfds, counter, wevents,
>>                         loop->timeout_when, &elapsed);
>>      if (retval < 0) {
>>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>          VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
>> -    } else if (!retval) {
>> +    } else if (retval == 0) {
>>          log_wakeup(loop->timeout_where, NULL, elapsed);
>> -    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
>> -        i = 0;
>> -        HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
>> +    } else {
>> +        for (i = 0; i < counter; i++) {
>>              if (pollfds[i].revents) {
>> -                log_wakeup(node->where, &pollfds[i], 0);
>> +
>> +                node = find_poll_node(loop, pollfds[i].fd, 0);
>> +
>> +                if (!node) {
>> +                    VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);
>> +                }
>> +                if (pollfds[i].revents & (OVS_POLLHUP | OVS_POLLNVAL)) {
>> +                    node->valid = false;
>> +                }
>> +
>> +                if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
>> +                    log_wakeup(node->where, &pollfds[i], 0);
>> +                }
>> +                /* update "requested" events. 
>> +                 * Note - "private" fds always want POLLIN - that emulates EPOLL, /dev/poll, etc
>> +                 * behaviour which they should be using in real life instead of using poll()
>> +                 */
>> +                if (node->private) {
>> +                    node->pollfd.events &= ~(pollfds[i].revents & (~OVS_POLLIN));
>> +                } else {
>> +                    node->pollfd.events &= ~pollfds[i].revents;
>> +                }
>> +                /* update "occurred" events for use by streams and handlers. In case there
>> +                 * is an existing (but not consumed yet) event, we OR the events in the
>> +                 * stored record with the new ones - it is the job of the stream to clear
>> +                 * that.
>> +                 */
>> +                node->pollfd.revents |= pollfds[i].revents;
>>              }
>> -            i++;
>>          }
>>      }
>>  
>> -    free_poll_nodes(loop);
>> +    free(pollfds);
>> +    if (wevents)
>> +        free(wevents);
>> +#endif
>>      loop->timeout_when = LLONG_MAX;
>>      loop->timeout_where = NULL;
>> -    free(pollfds);
>> -    free(wevents);
>>  
>>      /* Handle any pending signals before doing anything else. */
>>      fatal_signal_run();
>> @@ -416,8 +584,12 @@ poll_loop(void)
>>      if (!loop) {
>>          loop = xzalloc(sizeof *loop);
>>          loop->timeout_when = LLONG_MAX;
>> +        ovs_mutex_init(&loop->loop_mutex);
>>          hmap_init(&loop->poll_nodes);
>>          xpthread_setspecific(key, loop);
>> +#ifdef OVS_USE_EPOLL
>> +        loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS);
>> +#endif
>>      }
>>      return loop;
>>  }
>> diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
>> index 3dfa80c7f..16d155989 100644
>> --- a/lib/route-table-bsd.c
>> +++ b/lib/route-table-bsd.c
>> @@ -34,6 +34,7 @@
>>  #include "ovs-router.h"
>>  #include "packets.h"
>>  #include "openvswitch/vlog.h"
>> +#include "openvswitch/poll-loop.h"
>>  #include "util.h"
>>  
>>  VLOG_DEFINE_THIS_MODULE(route_table_bsd);
>> diff --git a/lib/stream-fd.c b/lib/stream-fd.c
>> index 62f768d45..6a80d6e05 100644
>> --- a/lib/stream-fd.c
>> +++ b/lib/stream-fd.c
>> @@ -40,6 +40,8 @@ struct stream_fd
>>      struct stream stream;
>>      int fd;
>>      int fd_type;
>> +    bool rx_ready, tx_ready;
>> +    struct pollfd *hint;
>>  };
>>  
>>  static const struct stream_class stream_fd_class;
>> @@ -67,7 +69,14 @@ new_fd_stream(char *name, int fd, int connect_status, int fd_type,
>>      stream_init(&s->stream, &stream_fd_class, connect_status, name);
>>      s->fd = fd;
>>      s->fd_type = fd_type;
>> +    s->rx_ready = true;
>> +    s->tx_ready = true;
>> +    s->hint = NULL;
>>      *streamp = &s->stream;
>> +    /* Persistent registration - we always get POLLINs from now on,
>> +     * POLLOUTs when we ask for them
>> +     */
>> +    poll_fd_register(s->fd, OVS_POLLIN, &s->hint);
>>      return 0;
>>  }
>>  
>> @@ -82,6 +91,8 @@ static void
>>  fd_close(struct stream *stream)
>>  {
>>      struct stream_fd *s = stream_fd_cast(stream);
>> +    /* Deregister the FD from any persistent registrations if supported */
>> +    poll_fd_deregister(s->fd);
>>      closesocket(s->fd);
>>      free(s);
>>  }
>> @@ -104,6 +115,24 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>>      ssize_t retval;
>>      int error;
>>  
>> +    if (s->hint) {
>> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
>> +         * to the read, which should return 0 if the HUP is a real one; if not, we clear it.
>> +         * For all other cases we believe what (e)poll has fed us.
>> +         */
>> +        if ((!(s->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (!s->rx_ready)) {
>> +            if (!(s->hint->revents & OVS_POLLIN)) {
>> +                return -EAGAIN;
>> +            } else {
>> +                /* POLLIN event from poll loop, mark us as ready */
>> +                s->rx_ready = true;
>> +                s->hint->revents &= ~OVS_POLLIN;
>> +            }
>> +        } else {
>> +            s->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
>> +        }
>> +    }
>> +
>>      retval = recv(s->fd, buffer, n, 0);
>>      if (retval < 0) {
>>          error = sock_errno();
>> @@ -114,6 +143,8 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>>  #endif
>>          if (error != EAGAIN) {
>>              VLOG_DBG_RL(&rl, "recv: %s", sock_strerror(error));
>> +        } else {
>> +            s->rx_ready = false;
>>          }
>>          return -error;
>>      }
>> @@ -127,9 +158,29 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>>      ssize_t retval;
>>      int error;
>>  
>> +    if (s->hint) {
>> +        /* poll-loop is providing us with hints for IO */
>> +        if (!s->tx_ready) {
>> +            if (!(s->hint->revents & OVS_POLLOUT)) {
>> +                return -EAGAIN;
>> +            } else {
>> +                /* POLLOUT event from poll loop, mark us as ready */
>> +                s->tx_ready = true;
>> +                s->hint->revents &= ~OVS_POLLOUT;
>> +            }
>> +        }
>> +    }
>>      retval = send(s->fd, buffer, n, 0);
>>      if (retval < 0) {
>>          error = sock_errno();
>> +#ifdef __linux__
>> +        /* Linux will sometimes return ENOBUFS on sockets instead of EAGAIN. Usually seen
>> +         *  on unix domain sockets 
>> +         */
>> +        if (error == ENOBUFS) {
>> +           error = EAGAIN;
>> +        }
>> +#endif
>>  #ifdef _WIN32
>>          if (error == WSAEWOULDBLOCK) {
>>             error = EAGAIN;
>> @@ -137,6 +188,8 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>>  #endif
>>          if (error != EAGAIN) {
>>              VLOG_DBG_RL(&rl, "send: %s", sock_strerror(error));
>> +        } else {
>> +            s->tx_ready = false;
>>          }
>>          return -error;
>>      }
>> @@ -150,11 +203,11 @@ fd_wait(struct stream *stream, enum stream_wait_type wait)
>>      switch (wait) {
>>      case STREAM_CONNECT:
>>      case STREAM_SEND:
>> -        poll_fd_wait(s->fd, OVS_POLLOUT);
>> +        private_poll_fd_wait(s->fd, OVS_POLLOUT);
>>          break;
>>  
>>      case STREAM_RECV:
>> -        poll_fd_wait(s->fd, OVS_POLLIN);
>> +        private_poll_fd_wait(s->fd, OVS_POLLIN);
>>          break;
>>  
>>      default:
>> @@ -223,6 +276,8 @@ new_fd_pstream(char *name, int fd,
>>      ps->accept_cb = accept_cb;
>>      ps->unlink_path = unlink_path;
>>      *pstreamp = &ps->pstream;
>> +    /* persistent registration */
>> +    poll_fd_register(ps->fd, OVS_POLLIN, NULL);
>>      return 0;
>>  }
>>  
>> @@ -230,6 +285,7 @@ static void
>>  pfd_close(struct pstream *pstream)
>>  {
>>      struct fd_pstream *ps = fd_pstream_cast(pstream);
>> +    poll_fd_deregister(ps->fd);
>>      closesocket(ps->fd);
>>      maybe_unlink_and_free(ps->unlink_path);
>>      free(ps);
>> @@ -271,7 +327,7 @@ static void
>>  pfd_wait(struct pstream *pstream)
>>  {
>>      struct fd_pstream *ps = fd_pstream_cast(pstream);
>> -    poll_fd_wait(ps->fd, OVS_POLLIN);
>> +    private_poll_fd_wait(ps->fd, OVS_POLLIN);
>>  }
>>  
>>  static const struct pstream_class fd_pstream_class = {
>> diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c
>> index 3b7f9865e..53ae51c1b 100644
>> --- a/lib/stream-ssl.c
>> +++ b/lib/stream-ssl.c
>> @@ -147,6 +147,7 @@ struct ssl_stream
>>      /* A few bytes of header data in case SSL negotiation fails. */
>>      uint8_t head[2];
>>      short int n_head;
>> +    struct pollfd *hint;
>>  };
>>  
>>  /* SSL context created by ssl_init(). */
>> @@ -310,6 +311,8 @@ new_ssl_stream(char *name, char *server_name, int fd, enum session_type type,
>>          SSL_set_msg_callback_arg(ssl, sslv);
>>      }
>>  
>> +
>> +    poll_fd_register(sslv->fd, OVS_POLLIN, &sslv->hint);
>>      *streamp = &sslv->stream;
>>      free(server_name);
>>      return 0;
>> @@ -604,6 +607,7 @@ ssl_close(struct stream *stream)
>>      ERR_clear_error();
>>  
>>      SSL_free(sslv->ssl);
>> +    poll_fd_deregister(sslv->fd);
>>      closesocket(sslv->fd);
>>      free(sslv);
>>  }
>> @@ -697,6 +701,27 @@ ssl_recv(struct stream *stream, void *buffer, size_t n)
>>      /* Behavior of zero-byte SSL_read is poorly defined. */
>>      ovs_assert(n > 0);
>>  
>> +     if (sslv->hint) {
>> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
>> +         * to the read, which should return 0 if the HUP is a real one; if not, we clear it.
>> +         * For all other cases we believe what (e)poll has fed us.
>> +         */
>> +        if ((!(sslv->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (sslv->rx_want == SSL_READING)) {
>> +            if (!(sslv->hint->revents & OVS_POLLIN)) {
>> +                return -EAGAIN;
>> +            } else {
>> +                /* POLLIN event from poll loop, mark us as ready 
>> +                 * rx_want is cleared further down by reading ssl fsm
>> +                 */
>> +                sslv->hint->revents &= ~OVS_POLLIN;
>> +            }
>> +        } else {
>> +            sslv->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
>> +        }
>> +    }
>> +
>> +
>> +
>>      old_state = SSL_get_state(sslv->ssl);
>>      ret = SSL_read(sslv->ssl, buffer, n);
>>      if (old_state != SSL_get_state(sslv->ssl)) {
>> @@ -729,6 +754,19 @@ ssl_do_tx(struct stream *stream)
>>  {
>>      struct ssl_stream *sslv = ssl_stream_cast(stream);
>>  
>> +     if (sslv->hint) {
>> +        /* poll-loop is providing us with hints for IO */
>> +        if (sslv->tx_want == SSL_WRITING) {
>> +            if (!(sslv->hint->revents & OVS_POLLOUT)) {
>> +                return EAGAIN;
>> +            } else {
>> +                /* POLLOUT event from poll loop, mark us as ready.
>> +                 * tx_want is cleared further down by the writing ssl fsm.
>> +                 */
>> +                sslv->hint->revents &= ~OVS_POLLOUT;
>> +            }
>> +        }
>> +    }
>>      for (;;) {
>>          int old_state = SSL_get_state(sslv->ssl);
>>          int ret = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size);
>> @@ -771,6 +809,8 @@ ssl_send(struct stream *stream, const void *buffer, size_t n)
>>              ssl_clear_txbuf(sslv);
>>              return n;
>>          case EAGAIN:
>> +            /* we want to know when this fd will become available again */
>> +            stream_send_wait(stream);
>>              return n;
>>          default:
>>              ssl_clear_txbuf(sslv);
>> @@ -795,7 +835,7 @@ ssl_run_wait(struct stream *stream)
>>      struct ssl_stream *sslv = ssl_stream_cast(stream);
>>  
>>      if (sslv->tx_want != SSL_NOTHING) {
>> -        poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
>> +        private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
>>      }
>>  }
>>  
>> @@ -811,13 +851,13 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>>          } else {
>>              switch (sslv->state) {
>>              case STATE_TCP_CONNECTING:
>> -                poll_fd_wait(sslv->fd, OVS_POLLOUT);
>> +                private_poll_fd_wait(sslv->fd, OVS_POLLOUT);
>>                  break;
>>  
>>              case STATE_SSL_CONNECTING:
>>                  /* ssl_connect() called SSL_accept() or SSL_connect(), which
>>                   * set up the status that we test here. */
>> -                poll_fd_wait(sslv->fd,
>> +                private_poll_fd_wait(sslv->fd,
>>                                 want_to_poll_events(SSL_want(sslv->ssl)));
>>                  break;
>>  
>> @@ -829,7 +869,7 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>>  
>>      case STREAM_RECV:
>>          if (sslv->rx_want != SSL_NOTHING) {
>> -            poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
>> +            private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
>>          } else {
>>              poll_immediate_wake();
>>          }
>> @@ -911,6 +951,7 @@ pssl_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
>>                   ds_steal_cstr(&bound_name));
>>      pstream_set_bound_port(&pssl->pstream, htons(port));
>>      pssl->fd = fd;
>> +    poll_fd_register(fd, OVS_POLLIN, NULL);
>>      *pstreamp = &pssl->pstream;
>>  
>>      return 0;
>> @@ -920,6 +961,7 @@ static void
>>  pssl_close(struct pstream *pstream)
>>  {
>>      struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
>> +    poll_fd_deregister(pssl->fd);
>>      closesocket(pssl->fd);
>>      free(pssl);
>>  }
>> diff --git a/lib/timeval.c b/lib/timeval.c
>> index 193c7bab1..59a12414f 100644
>> --- a/lib/timeval.c
>> +++ b/lib/timeval.c
>> @@ -38,6 +38,7 @@
>>  #include "unixctl.h"
>>  #include "util.h"
>>  #include "openvswitch/vlog.h"
>> +#include "openvswitch/poll-loop.h"
>>  
>>  VLOG_DEFINE_THIS_MODULE(timeval);
>>  
>> @@ -369,6 +370,88 @@ time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED,
>>      return retval;
>>  }
>>  
>> +#ifdef OVS_USE_EPOLL
>> +
>> +/* Like epoll_wait(), except:
>> + *
>> + *      - The timeout is specified as an absolute time, as defined by
>> + *        time_msec(), instead of a duration.
>> + *
>> + *      - On error, returns a negative error code (instead of setting errno).
>> + *
>> + *      - If interrupted by a signal, retries automatically until the original
>> + *        timeout is reached.  (Because of this property, this function will
>> + *        never return -EINTR.)
>> + *
>> + * Stores the number of milliseconds elapsed during poll in '*elapsed'. */
>> +int
>> +time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
>> +          long long int timeout_when, int *elapsed)
>> +{
>> +    long long int *last_wakeup = last_wakeup_get();
>> +    long long int start;
>> +    bool quiescent;
>> +    int retval = 0;
>> +
>> +    time_init();
>> +    coverage_clear();
>> +    coverage_run();
>> +    if (*last_wakeup && !thread_is_pmd()) {
>> +        log_poll_interval(*last_wakeup);
>> +    }
>> +    start = time_msec();
>> +
>> +    timeout_when = MIN(timeout_when, deadline);
>> +    quiescent = ovsrcu_is_quiescent();
>> +
>> +    for (;;) {
>> +        long long int now = time_msec();
>> +        int time_left;
>> +
>> +        if (now >= timeout_when) {
>> +            time_left = 0;
>> +        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
>> +            time_left = INT_MAX;
>> +        } else {
>> +            time_left = timeout_when - now;
>> +        }
>> +
>> +        if (!quiescent) {
>> +            if (!time_left) {
>> +                ovsrcu_quiesce();
>> +            } else {
>> +                ovsrcu_quiesce_start();
>> +            }
>> +        }
>> +
>> +        retval = epoll_wait(epoll_fd, events, max, time_left);
>> +        if (retval < 0) {
>> +            retval = -errno;
>> +        }
>> +
>> +        if (!quiescent && time_left) {
>> +            ovsrcu_quiesce_end();
>> +        }
>> +
>> +        if (deadline <= time_msec()) {
>> +            fatal_signal_handler(SIGALRM);
>> +            if (retval < 0) {
>> +                retval = 0;
>> +            }
>> +            break;
>> +        }
>> +
>> +        if (retval != -EINTR) {
>> +            break;
>> +        }
>> +    }
>> +    *last_wakeup = time_msec();
>> +    refresh_rusage();
>> +    *elapsed = *last_wakeup - start;
>> +    return retval;
>> +}
>> +#endif
>> +
>>  long long int
>>  timespec_to_msec(const struct timespec *ts)
>>  {
>> diff --git a/lib/timeval.h b/lib/timeval.h
>> index 502f703d4..347a09d63 100644
>> --- a/lib/timeval.h
>> +++ b/lib/timeval.h
>> @@ -20,6 +20,9 @@
>>  #include <time.h>
>>  #include "openvswitch/type-props.h"
>>  #include "util.h"
>> +#ifdef __linux__
>> +#include <sys/epoll.h>
>> +#endif
>>  
>>  #ifdef  __cplusplus
>>  extern "C" {
>> @@ -61,6 +64,10 @@ void time_wall_timespec(struct timespec *);
>>  void time_alarm(unsigned int secs);
>>  int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
>>                long long int timeout_when, int *elapsed);
>> +#ifdef __linux__
>> +int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
>> +          long long int timeout_when, int *elapsed);
>> +#endif
>>  
>>  long long int timespec_to_msec(const struct timespec *);
>>  long long int timespec_to_usec(const struct timespec *);
>>
>
Anton Ivanov Feb. 18, 2020, 6:12 a.m. UTC | #5
On 17/02/2020 14:48, Dumitru Ceara wrote:
> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>> These are waited upon in the same thread where they are created. This
>> allows them to be registered persistently with the OS (if possible)
>> as well as the OS to provide hints - is the FD ready, is it closed,
>> etc.
>>
>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>> which is not ready if that fd has been registered as "private" to the
>> thread which waits upon it.
>>
>> 3. No longer breaks other parts of OVS which create the fd in one
>> thread and waits upon it in others.
>>
>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>> poll++ frameworks in other OSes.
>>
>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>> using a "centeral (e)poll dispatcher + IO threads" pattern
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> Hi Anton,
> 
> A couple of issues inline. Except for that:
> 
> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
> failing with your patches applied:
> 
> make check TESTSUITEFLAGS='-k "flush the fdb"'

I will have a look.

> 
> 2. Travis CI build fails:
> 
> lib/fatal-signal.c:244:5: error: ignoring return value of ‘read’,
> declared with attribute warn_unused_result [-Werror=unused-result]
> 
>       read(signal_fds[0], sigbuffer, sizeof(sigbuffer))
> 
> 3. Travis CI OSX build fails:
> 
> lib/poll-loop.c:46:1: error: unused function 'poll_create_node_add'
> [-Werror,-Wunused-function]
> 
> COVERAGE_DEFINE(poll_create_node);

I will fix all of these in the next version. The CI was out with the 
daisies when I submitted the patch last week so I did not see the logs 
until yesterday.

> 
> 4. While OVS might benefit from these changes I'm wondering about OVN
> and ovsdb-server specifically. ovsdb-server is single threaded and
> usually on large scale deployments we don't really see "poll" as the
> bottleneck or even the fact that code tries to read/write from FDs when
> FDs are not available for read/write.
> 
> For example, here are results of running a scale test scenario which
> repeats the following iteration 300 times:
> - bring up a node (ovn-fake-multinode container) and connect it to the
> OVN Southbound DB.
> - configure an OVN logical switch to be bound to the new node.
> - configure an OVN logical switch port on the new logical switch.
> - configure an OVS internal interface on the new node and bind it to the
> OVN logical switch port.
> - wait until the new internal interface can ping its default gateway
> through OVN (i.e., until ovn-controller on the node received all updates
> from the SB DB and installed all OVS flows), highlighted in the output.
> 
> The tests use rally-ovs (ovn-scale-test) on a 9 server setup (1 machine
> running OVN ovsdb-servers and ovn-northd and 8 machines simulating
> chassis using ovn-fake-multinode), in particular this modified scenario:
> https://github.com/dceara/ovn-scale-test/blob/ovn-switch-per-node/samples/tasks/scenarios/ovn-network/osh_workload_incremental.json
> 
> With OVS master and OVN master:
> http://pastebin.test.redhat.com/836568
> 
> With OVS master + your patches and OVN master:
> http://pastebin.test.redhat.com/836571
> 
> Here are some of the logs we get on the OVN Southbound DB ovsdb-server
> that show that ovsdb-server spends up to 2 seconds in a single loop
> iteration sending/receiving updates to/from ovn-controllers:
> 
> 2020-02-17T10:43:41.175Z|01991|poll_loop|INFO|wakeup due to [OVS_POLLIN]
> on fd 140 (192.16.0.1:6642<->192.16.0.120:52018) at lib/stream-fd.c:79
> (84% CPU usage)
> 2020-02-17T10:43:43.338Z|01992|timeval|WARN|Unreasonably long 2163ms
> poll interval (2144ms user, 9ms system)
> 2020-02-17T10:43:43.339Z|01993|timeval|WARN|faults: 590 minor, 0 major
> 2020-02-17T10:43:43.339Z|01994|timeval|WARN|disk: 0 reads, 8 writes
> 2020-02-17T10:43:43.339Z|01995|timeval|WARN|context switches: 0
> voluntary, 4 involuntary
> 2020-02-17T10:43:43.339Z|01996|poll_loop|INFO|Dropped 63 log messages in
> last 2 seconds (most recently, 2 seconds ago) due to excessive rate
> 2020-02-17T10:43:43.339Z|01997|poll_loop|INFO|wakeup due to [OVS_POLLIN]
> on fd 76 (192.16.0.1:6642<->192.16.0.56:33538) at lib/stream-fd.c:79
> (84% CPU usage)
> 2020-02-17T10:43:45.495Z|01998|timeval|WARN|Unreasonably long 2156ms
> poll interval (2129ms user, 17ms system)
> 2020-02-17T10:43:45.495Z|01999|timeval|WARN|faults: 738 minor, 0 major
> 2020-02-17T10:43:45.495Z|02000|timeval|WARN|context switches: 0
> voluntary, 7 involuntary
> 2020-02-17T10:43:47.651Z|02001|timeval|WARN|Unreasonably long 2157ms
> poll interval (2136ms user, 10ms system)
> 
> In this case, and I think in most OVN use cases, ovsdb-server is busy
> because it actually has to send updates to large numbers of
> ovn-controllers connected to it. Unless I'm missing something the epoll
> change seems to improve performance only in cases where the Southbound
> DB doesn't do much sending/receiving. 

1. Correct - it improves the handling and the cost of idle connections.
At present each connection is a fixed cost regardless of whether it
needs servicing or not.

2. It should also improve the cost of handling many peers to send to if
there is enough outstanding data on sockets to produce EAGAIN on send.

3. It does not fix the fundamental problem that the logic in ovsdb is
single threaded. It does, however, allow json+io+ssl to become
multi-threaded, which should leave just the logic in the main
ovsdb-server thread. You cannot do that effectively without having
information on the state of the socket and whether it needs servicing.

4. I have seen the rally tests - they are flat scale-up. I agree that 
the help from fixing the IO loop will be minimal because there is IO on 
most fds to be serviced at all times. I would not expect them to help a 
lot there.

The difference from fixing the IO (leaving aside that it is a
prerequisite to getting SSL to worker threads) is not in scaling up, but
running at scale. The current "ram the EAGAIN wall until it gives up"
design gives you a mandatory penalty per transaction deploying config to
one node while running (because of attempted failed reads on EAGAIN).
The correct test for that is the CPU cost and latency to deploy a
logical flow when running at steady state on e.g. 300 nodes.

The current design as written has that penalty as a given - it is
unavoidable and it also grows linearly (or worse) with size from other
factors in addition to EAGAIN - e.g. you also start copying a large pollfd
array to the kernel back and forth on every iteration.

I have not seen any tests trying to quantify this on a actual cluster. 
All tests I have seen so far are scale-up and/or running a single 
synthetic application on the whole cluster.

Mine are mostly taking the code in question and running it outside of 
OVS/OVN on a harness.


> How do you test
> performance/scalability improvements?
> 
> Regards,
> Dumitru
> 
>> ---
>>   include/openvswitch/poll-loop.h |  56 +++++-
>>   lib/dpif-netlink.c              |   6 +-
>>   lib/fatal-signal.c              |   7 +-
>>   lib/latch-unix.c                |   3 +-
>>   lib/netdev-afxdp.c              |   2 +-
>>   lib/poll-loop.c                 | 320 ++++++++++++++++++++++++--------
>>   lib/route-table-bsd.c           |   1 +
>>   lib/stream-fd.c                 |  62 ++++++-
>>   lib/stream-ssl.c                |  50 ++++-
>>   lib/timeval.c                   |  83 +++++++++
>>   lib/timeval.h                   |   7 +
>>   11 files changed, 508 insertions(+), 89 deletions(-)
>>
>> diff --git a/include/openvswitch/poll-loop.h b/include/openvswitch/poll-loop.h
>> index 532d9caa6..6d0331f6d 100644
>> --- a/include/openvswitch/poll-loop.h
>> +++ b/include/openvswitch/poll-loop.h
>> @@ -41,11 +41,30 @@
>>   #include <windows.h>
>>   #endif
>>   
>> +#ifdef __linux__
>> +#define OVS_USE_EPOLL
>> +#endif
>> +
>> +#ifdef OVS_USE_EPOLL
>> +#include <sys/epoll.h>
>> +
>> +#define OVS_POLLIN EPOLLIN
>> +#define OVS_POLLOUT EPOLLOUT
>> +#define OVS_POLLERR EPOLLERR
>> +#define OVS_POLLHUP EPOLLHUP
>> +#define OVS_ONESHOT EPOLLONESHOT
>> +#define OVS_POLLNVAL 0
>> +
>> +#else
>> +
>>   #define OVS_POLLIN POLLIN
>>   #define OVS_POLLOUT POLLOUT
>>   #define OVS_POLLERR POLLERR
>>   #define OVS_POLLNVAL POLLNVAL
>>   #define OVS_POLLHUP POLLHUP
>> +#define OVS_ONESHOT (1U << 30)
>> +
>> +#endif
>>   
>>   #ifdef  __cplusplus
>>   extern "C" {
>> @@ -60,10 +79,43 @@ extern "C" {
>>    * the source code location of the caller.  The function version allows the
>>    * caller to supply a location explicitly, which is useful if the caller's own
>>    * caller would be more useful in log output.  See timer_wait_at() for an
>> - * example. */
>> -void poll_fd_wait_at(int fd, short int events, const char *where);
>> + * example.
>> + * Note - using on fds registered using poll_fd_register() will generate a
>> + * warning as this is not an intended use.
>> + */
>> +void poll_fd_wait_at(int fd, int events, const char *where);
>>   #define poll_fd_wait(fd, events) poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
>>   
>> +/* Register a fd with a persistence framework if available so it can be served
>> + * "faster" and the caller can be provided with "hints" on what caused the IO
>> + * event.
>> + * If the "hint" argument is supplied, it is set to point to the pollfd structure
>> + * containing the events passed by the OS in .revents.
>> + * Note - as the frameworks are OS dependent, the events are limited to what
>> + * can be passed in a .revents which is a short int.
>> + * Limitations - MUST BE registered from the same thread as the one where
>> + * it will be waited upon.
>> + */
>> +
>> +void poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where);
>> +#define poll_fd_register(fd, events, hint) poll_fd_register_at(fd, events, hint, OVS_SOURCE_LOCATOR)
>> +
>> +/* De-register a fd which was registered as "private" with the persistence
>> + * framework
>> + */
>> +
>> +void poll_fd_deregister_at(int fd, const char *where);
>> +#define poll_fd_deregister(fd) poll_fd_deregister_at(fd, OVS_SOURCE_LOCATOR)
>> +
>> +/* Schedule events to wake up the following poll_block() - "private fds"
>> + * Same as poll_fd_wait, but for fds which have been registered and are
>> + * expected to persist. If a "fast" OS fd notification framework is used
>> + * this version of wait may be a NOOP (e.g. for (E)POLLIN events).
>> + */
>> +void private_poll_fd_wait_at(int fd, int events, const char *where);
>> +#define private_poll_fd_wait(fd, events) private_poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
>> +
>> +
>>   #ifdef _WIN32
>>   void poll_wevent_wait_at(HANDLE wevent, const char *where);
>>   #define poll_wevent_wait(wevent) poll_wevent_wait_at(wevent, OVS_SOURCE_LOCATOR)
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 5b5c96d72..ad5db9452 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -1289,7 +1289,7 @@ dpif_netlink_port_poll_wait(const struct dpif *dpif_)
>>       const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>   
>>       if (dpif->port_notifier) {
>> -        nl_sock_wait(dpif->port_notifier, POLLIN);
>> +        nl_sock_wait(dpif->port_notifier, OVS_POLLIN);
>>       } else {
>>           poll_immediate_wake();
>>       }
>> @@ -2756,13 +2756,13 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
>>       }
>>   
>>       for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
>> -        nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
>> +        nl_sock_wait(sock_pool[i].nl_sock, OVS_POLLIN);
>>       }
>>   #else
>>       if (dpif->handlers && handler_id < dpif->n_handlers) {
>>           struct dpif_handler *handler = &dpif->handlers[handler_id];
>>   
>> -        poll_fd_wait(handler->epoll_fd, POLLIN);
>> +        poll_fd_wait(handler->epoll_fd, OVS_POLLIN);
>>       }
>>   #endif
>>   }
>> diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
>> index 97d8d1dab..424636e07 100644
>> --- a/lib/fatal-signal.c
>> +++ b/lib/fatal-signal.c
>> @@ -96,6 +96,7 @@ fatal_signal_init(void)
>>           ovs_mutex_init_recursive(&mutex);
>>   #ifndef _WIN32
>>           xpipe_nonblocking(signal_fds);
>> +        poll_fd_register(signal_fds[0], OVS_POLLIN, NULL);
>>   #else
>>           wevent = CreateEvent(NULL, TRUE, FALSE, NULL);
>>           if (!wevent) {
>> @@ -236,9 +237,12 @@ void
>>   fatal_signal_run(void)
>>   {
>>       sig_atomic_t sig_nr;
>> +    char sigbuffer[_POSIX_PIPE_BUF];
>>   
>>       fatal_signal_init();
>>   
>> +    read(signal_fds[0], sigbuffer, sizeof(sigbuffer));
>> +
>>       sig_nr = stored_sig_nr;
>>       if (sig_nr != SIG_ATOMIC_MAX) {
>>           char namebuf[SIGNAL_NAME_BUFSIZE];
>> @@ -271,7 +275,8 @@ fatal_signal_wait(void)
>>   #ifdef _WIN32
>>       poll_wevent_wait(wevent);
>>   #else
>> -    poll_fd_wait(signal_fds[0], OVS_POLLIN);
>> +    /* a noop - schedule for removal */
>> +    private_poll_fd_wait(signal_fds[0], OVS_POLLIN);
>>   #endif
>>   }
>>   
>> diff --git a/lib/latch-unix.c b/lib/latch-unix.c
>> index fea61ab28..5f15b59fe 100644
>> --- a/lib/latch-unix.c
>> +++ b/lib/latch-unix.c
>> @@ -83,5 +83,6 @@ latch_is_set(const struct latch *latch)
>>   void
>>   latch_wait_at(const struct latch *latch, const char *where)
>>   {
>> -    poll_fd_wait_at(latch->fds[0], OVS_POLLIN, where);
>> +    /* Ask for wait and make it one-shot if persistence is in play */
>> +    poll_fd_wait_at(latch->fds[0], OVS_POLLIN | OVS_ONESHOT, where);
>>   }
>> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
>> index ef367e5ea..482400d8d 100644
>> --- a/lib/netdev-afxdp.c
>> +++ b/lib/netdev-afxdp.c
>> @@ -184,7 +184,7 @@ xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
>>   
>>       if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
>>           pfd.fd = fd;
>> -        pfd.events = OVS_POLLIN;
>> +        pfd.events = POLLIN;
>>   
>>           ret = poll(&pfd, 1, 0);
>>           if (OVS_UNLIKELY(ret < 0)) {
>> diff --git a/lib/poll-loop.c b/lib/poll-loop.c
>> index 3902d6c1f..10a5b0c01 100644
>> --- a/lib/poll-loop.c
>> +++ b/lib/poll-loop.c
>> @@ -18,6 +18,12 @@
>>   #include "openvswitch/poll-loop.h"
>>   #include <errno.h>
>>   #include <inttypes.h>
>> +#ifdef OVS_USE_EPOLL
>> +#include <sys/epoll.h>
>> +#endif
>> +#ifndef _WIN32
>> +#include <unistd.h>
>> +#endif
>>   #include <poll.h>
>>   #include <stdlib.h>
>>   #include <string.h>
>> @@ -31,7 +37,9 @@
>>   #include "timeval.h"
>>   #include "openvswitch/vlog.h"
>>   #include "openvswitch/hmap.h"
>> +#include "openvswitch/list.h"
>>   #include "hash.h"
>> +#include "ovs-atomic.h"
>>   
>>   VLOG_DEFINE_THIS_MODULE(poll_loop);
>>   
>> @@ -43,21 +51,32 @@ struct poll_node {
>>       struct pollfd pollfd;       /* Events to pass to time_poll(). */
>>       HANDLE wevent;              /* Events for WaitForMultipleObjects(). */
>>       const char *where;          /* Where poll_node was created. */
>> +    bool valid;                 /* Can it be used? */
>> +    bool private;               /* Can we assume that it is only in this thread poll loop? */
>>   };
>>   
>> +#define MAX_EPOLL_EVENTS 64
>> +
>>   struct poll_loop {
>> -    /* All active poll waiters. */
>> +    /* List of all poll loops in the system */
>> +    struct ovs_mutex loop_mutex;
>> +    /* All poll waiters for this poll loop */
>>       struct hmap poll_nodes;
>>   
>>       /* Time at which to wake up the next call to poll_block(), LLONG_MIN to
>>        * wake up immediately, or LLONG_MAX to wait forever. */
>>       long long int timeout_when; /* In msecs as returned by time_msec(). */
>>       const char *timeout_where;  /* Where 'timeout_when' was set. */
>> +#ifdef OVS_USE_EPOLL
>> +    int epoll_fd;
>> +    struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
>> +#endif
>>   };
>>   
>> +
>>   static struct poll_loop *poll_loop(void);
>>   
>> -/* Look up the node with same fd or wevent. */
>> +/* Look up the node with same fd or wevent - should be accessed under &loop->loop_mutex. */
>>   static struct poll_node *
>>   find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>>   {
>> @@ -76,79 +95,142 @@ find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
>>       }
>>       return NULL;
>>   }
>> -
>> -/* On Unix based systems:
>> - *
>> - *     Registers 'fd' as waiting for the specified 'events' (which should be
>> - *     OVS_POLLIN or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to
>> - *     poll_block() will wake up when 'fd' becomes ready for one or more of the
>> - *     requested events. The 'fd's are given to poll() function later.
>> - *
>> - * On Windows system:
>> +/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
>> + * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
>> + * wake up when 'fd' becomes ready for one or more of the requested events.
>>    *
>> - *     If 'fd' is specified, create a new 'wevent'. Association of 'fd' and
>> - *     'wevent' for 'events' happens in poll_block(). If 'wevent' is specified,
>> - *     it is assumed that it is unrelated to any sockets and poll_block()
>> - *     will wake up on any event on that 'wevent'. It is an error to pass
>> - *     both 'wevent' and 'fd'.
>> + * The event registration is PERSISTENT. This is intended for OSes which have a persistent
>> + * event framework. For now it is implemented only for epoll and Linux, other
>> + * implementations such as BSD kqueue and Solaris /dev/poll may follow.
>>    *
>> - * The event registration is one-shot: only the following call to
>> - * poll_block() is affected.  The event will need to be re-registered after
>> - * poll_block() is called if it is to persist.
>> + * If the OS has no persistent event framework, this does nothing
>>    *
>>    * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
>>    * automatically provide the caller's source file and line number for
>>    * 'where'.) */
>> +
>>   static void
>> -poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
>> +poll_fd_subscribe_at(int fd, HANDLE wevent, int events, struct pollfd **hint, const char *where, bool private)
>>   {
>>       struct poll_loop *loop = poll_loop();
>>       struct poll_node *node;
>> +#ifdef OVS_USE_EPOLL
>> +    struct epoll_event event;
>> +#endif
>>   
>> -    COVERAGE_INC(poll_create_node);
>> -
>> -    /* Both 'fd' and 'wevent' cannot be set. */
>>       ovs_assert(!fd != !wevent);
>>   
>> +    /* This is mostly uncontended, so the thread should grab it straight away.
>> +     * We will reuse it later to introduce threading for IO and SSL
>> +     */
>> +    ovs_mutex_lock(&loop->loop_mutex);
>> +
>>       /* Check for duplicate.  If found, "or" the events. */
>>       node = find_poll_node(loop, fd, wevent);
>> -    if (node) {
>> -        node->pollfd.events |= events;
>> -    } else {
>> -        node = xzalloc(sizeof *node);
>> -        hmap_insert(&loop->poll_nodes, &node->hmap_node,
>> -                    hash_2words(fd, (uint32_t)wevent));
>> -        node->pollfd.fd = fd;
>> -        node->pollfd.events = events;
>> -#ifdef _WIN32
>> -        if (!wevent) {
>> -            wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
>> +
>> +    if (node && node->valid) {
>> +#ifdef OVS_USE_EPOLL
>> +        int old_event_mask = node->pollfd.events;
>> +#endif
>> +        /* If there is an existing event mask we do not need to inc - this will be waited upon */
>> +        node->pollfd.events |= (events & 0x0000FFFF); /* or without epoll specific bits */
>> +
>> +#ifdef OVS_USE_EPOLL
>> +        /* modify existing epoll entry if there is an epoll specific ask or if the
>> +         * mask has changed
>> +         */
>> +        if ((events & 0xFFFF0000) || (old_event_mask != node->pollfd.events)) {
>> +            event.events = node->pollfd.events | events | EPOLLHUP | EPOLLRDHUP;
>> +            event.data.ptr = node;
>> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
>>           }
>>   #endif
>> +    } else {
>> +        if (!node) {
>> +            node = xzalloc(sizeof *node);
>> +            hmap_insert(&loop->poll_nodes, &node->hmap_node,
>> +                        hash_2words(fd, 0));
>> +        } else {
>> +            /* node marked for reaping, OS has reused the fd number, valid is set to false */
>> +#ifdef OVS_USE_EPOLl
> 
> This should be "#ifdef OVS_USE_EPOLL"
> 
>> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, fd, NULL);
>> +#endif
>> +        }
>> +        node->pollfd.fd = fd;
>> +        node->pollfd.events = (events & 0x0000FFFF);
>>           node->wevent = wevent;
>>           node->where = where;
>> +        node->valid = true;
>> +        node->private = private;
>> +#ifdef OVS_USE_EPOLL
>> +        event.events = node->pollfd.events | EPOLLHUP | EPOLLRDHUP; /* we always listen for fd close */
>> +        event.data.ptr = node;
>> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
>> +#endif
>> +    }
>> +    if (hint) {
>> +        *hint = &node->pollfd;
>>       }
>> +    ovs_mutex_unlock(&loop->loop_mutex);
>> +}
>> +
>> +void
>> +poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where) {
>> +    poll_fd_subscribe_at(fd, 0, events, hint, where , true);
>> +}
>> +
>> +/* Deregisters a fd. Note - this looks like a memory leak (deallocating only private fds)
>> + * but it is not.
>> + * In order to be compatible with existing calling conventions while using fd persistence
>> + * where supported we have to keep "legacy" fds around for the duration of the life of
>> + * the thread because we have no idea if they have been reaped properly or not.
>> + * The reason for this is that for some of them the close() is in a thread different from the
>> + * poll loop.
>> + * Thus, the only thing we can do in this case is mark them "invalid". Once the OS reuses the
>> + * same fd number, we will reuse the existing hash entry.
>> + */
>> +
>> +void
>> +poll_fd_deregister_at(int fd, const char *where) {
>> +    struct poll_loop *loop = poll_loop();
>> +
>> +    VLOG(VLL_DBG, "Deregister %d from %s", fd, where);
>> +    struct poll_node *node;
>> +
>> +    ovs_mutex_lock(&loop->loop_mutex);
>> +    node = find_poll_node(loop, fd, 0);
>> +    if (node) {
>> +        if (node->private) {
>> +#ifdef OVN_USE_EPOLL
> 
> This should be "#ifdef OVS_USE_EPOLL".
> 
>> +            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
>> +#endif
>> +            hmap_remove(&loop->poll_nodes, &node->hmap_node);
>> +        } else {
>> +            VLOG(VLL_WARN, "Trying to deregister a non-private %d from %s", fd, where);
>> +            node->valid = false;
>> +        }
>> +    }
>> +    ovs_mutex_unlock(&loop->loop_mutex);
>> +}
>> +
>> +void
>> +poll_fd_wait_at(int fd, int events, const char *where)
>> +{
>> +    poll_fd_subscribe_at(fd, 0, events, NULL, where, false);
>>   }
>>   
>> -/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
>> - * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
>> - * wake up when 'fd' becomes ready for one or more of the requested events.
>> - *
>> - * On Windows, 'fd' must be a socket.
>> - *
>> - * The event registration is one-shot: only the following call to poll_block()
>> - * is affected.  The event will need to be re-registered after poll_block() is
>> - * called if it is to persist.
>> - *
>> - * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
>> - * automatically provide the caller's source file and line number for
>> - * 'where'.) */
>>   void
>> -poll_fd_wait_at(int fd, short int events, const char *where)
>> +private_poll_fd_wait_at(int fd, int events, const char *where)
>>   {
>> -    poll_create_node(fd, 0, events, where);
>> +    /* POLLIN persists on "private" fds - either emulated or at epoll
>> +     * or other persistence framework level
>> +     */
>> +    if (events & (~OVS_POLLIN)) {
>> +        poll_fd_subscribe_at(fd, 0, events, NULL, where, true);
>> +    }
>>   }
>>   
>> +
>>   #ifdef _WIN32
>>   /* Registers for the next call to poll_block() to wake up when 'wevent' is
>>    * signaled.
>> @@ -163,7 +245,7 @@ poll_fd_wait_at(int fd, short int events, const char *where)
>>   void
>>   poll_wevent_wait_at(HANDLE wevent, const char *where)
>>   {
>> -    poll_create_node(0, wevent, 0, where);
>> +    poll_fd_subscribe_at(0, wevent, 0, NULL, where);
>>   }
>>   #endif /* _WIN32 */
>>   
>> @@ -277,9 +359,12 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>>           if (pollfd->revents & OVS_POLLHUP) {
>>               ds_put_cstr(&s, "[OVS_POLLHUP]");
>>           }
>> +#ifndef OVS_USE_EPOLL
>> +        /* epoll does not have NVAL - it uses RDHUP and HUP which we cannot actually get to here*/
>>           if (pollfd->revents & OVS_POLLNVAL) {
>>               ds_put_cstr(&s, "[OVS_POLLNVAL]");
>>           }
>> +#endif
>>           ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
>>           free(description);
>>       } else {
>> @@ -295,12 +380,17 @@ log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
>>       ds_destroy(&s);
>>   }
>>   
>> +
>>   static void
>>   free_poll_nodes(struct poll_loop *loop)
>>   {
>>       struct poll_node *node, *next;
>>   
>> +    ovs_mutex_lock(&loop->loop_mutex);
>>       HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
>> +#ifdef OVS_USE_EPOLL
>> +        epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
>> +#endif
>>           hmap_remove(&loop->poll_nodes, &node->hmap_node);
>>   #ifdef _WIN32
>>           if (node->wevent && node->pollfd.fd) {
>> @@ -310,6 +400,7 @@ free_poll_nodes(struct poll_loop *loop)
>>   #endif
>>           free(node);
>>       }
>> +    ovs_mutex_unlock(&loop->loop_mutex);
>>   }
>>   
>>   /* Blocks until one or more of the events registered with poll_fd_wait()
>> @@ -320,8 +411,13 @@ poll_block(void)
>>   {
>>       struct poll_loop *loop = poll_loop();
>>       struct poll_node *node;
>> +#ifndef OVS_USE_EPOLL
>>       struct pollfd *pollfds;
>> +#endif
>> +#ifndef OVS_USE_EPOLL
>>       HANDLE *wevents = NULL;
>> +    int counter;
>> +#endif
>>       int elapsed;
>>       int retval;
>>       int i;
>> @@ -335,54 +431,126 @@ poll_block(void)
>>       }
>>   
>>       timewarp_run();
>> -    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>>   
>> +#ifdef OVS_USE_EPOLL
>> +    retval = time_epoll_wait(loop->epoll_fd,
>> +        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);
>> +    if (retval < 0) {
>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>> +        VLOG_ERR_RL(&rl, "epoll: %s", ovs_strerror(retval));
>> +    } else if (!retval) {
>> +        log_wakeup(loop->timeout_where, NULL, elapsed);
>> +    } else {
>> +        ovs_mutex_lock(&loop->loop_mutex);
>> +        if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
>> +            for (i = 0; i < retval; i++) {
>> +                node = (struct poll_node *) loop->epoll_events[i].data.ptr;
>> +                if (loop->epoll_events[i].events) {
>> +                    node->pollfd.revents = loop->epoll_events[i].events;
>> +                    log_wakeup(node->where, &node->pollfd, 0);
>> +                }
>> +            }
>> +        }
>> +        for (i = 0; i < retval; i++) {
>> +            node = (struct poll_node *) loop->epoll_events[i].data.ptr;
>> +            if (loop->epoll_events[i].events & EPOLLHUP) {
>> +                /* File descriptor closed already elsewhere
>> +                 * We have to make the assumption that whoever closed it has
>> +                 * ensured that anything which refers to IO event hints will not run
>> +                 * on this fd after we free it.
>> +                 */
>> +                node->valid = false;
>> +            }
>> +            if (loop->epoll_events[i].events) {
>> +                node->pollfd.revents |= (loop->epoll_events[i].events & 0x0000FFFF);
>> +            }
>> +            if (loop->epoll_events[i].events & OVS_POLLOUT) {
>> +                struct epoll_event event;
>> +                node->pollfd.events = OVS_POLLIN; /* reset back to defaults - write needs one shot */
>> +                event.events = node->pollfd.events;
>> +                event.data.ptr = node;
>> +                epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, node->pollfd.fd, &event);
>> +            }
>> +        }
>> +        ovs_mutex_unlock(&loop->loop_mutex);
>> +    }
>> +#else
>> +    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
>>   #ifdef _WIN32
>>       wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
>>   #endif
>>   
>> +
>>       /* Populate with all the fds and events. */
>> -    i = 0;
>> +    counter = 0;
>>       HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
>> -        pollfds[i] = node->pollfd;
>> +        if ((node->valid) && (node->pollfd.events)) {
>> +            pollfds[counter] = node->pollfd;
>>   #ifdef _WIN32
>> -        wevents[i] = node->wevent;
>> -        if (node->pollfd.fd && node->wevent) {
>> -            short int wsa_events = 0;
>> -            if (node->pollfd.events & OVS_POLLIN) {
>> -                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
>> +            wevents[counter] = node->wevent;
>> +            if (node->pollfd.fd && node->wevent) {
>> +                short int wsa_events = 0;
>> +                if (node->pollfd.events & OVS_POLLIN) {
>> +                    wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
>> +                }
>> +                if (node->pollfd.events & OVS_POLLOUT) {
>> +                    wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
>> +                }
>> +                WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
>>               }
>> -            if (node->pollfd.events & OVS_POLLOUT) {
>> -                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
>> -            }
>> -            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
>> -        }
>>   #endif
>> -        i++;
>> +            counter++;
>> +        }
>>       }
>>   
>> -    retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
>> +    retval = time_poll(pollfds, counter, wevents,
>>                          loop->timeout_when, &elapsed);
>>       if (retval < 0) {
>>           static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>           VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
>> -    } else if (!retval) {
>> +    } else if (retval == 0) {
>>           log_wakeup(loop->timeout_where, NULL, elapsed);
>> -    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
>> -        i = 0;
>> -        HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
>> +    } else {
>> +        for (i = 0; i < counter; i++) {
>>               if (pollfds[i].revents) {
>> -                log_wakeup(node->where, &pollfds[i], 0);
>> +
>> +                node = find_poll_node(loop, pollfds[i].fd, 0);
>> +
>> +                if (!node) {
>> +                    VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);
>> +                }
>> +                if (pollfds[i].revents & (OVS_POLLHUP | OVS_POLLNVAL)) {
>> +                    node->valid = false;
>> +                }
>> +
>> +                if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
>> +                    log_wakeup(node->where, &pollfds[i], 0);
>> +                }
>> +                /* update "requested" events.
>> +                 * Note - "private" fds always want POLLIN - that emulates EPOLL, /dev/poll, etc
>> +                 * behaviour which they should be using in real life instead of using poll()
>> +                 */
>> +                if (node->private) {
>> +                    node->pollfd.events &= ~(pollfds[i].revents & (~OVS_POLLIN));
>> +                } else {
>> +                    node->pollfd.events &= ~pollfds[i].revents;
>> +                }
>> +                /* update "occurred" events for use by streams and handlers. In case there
>> +                 * is an existing (but not consumed yet) event, we OR the events in the
>> +                 * stored record with the new ones - it is the job of the stream to clear
>> +                 * that.
>> +                 */
>> +                node->pollfd.revents |= pollfds[i].revents;
>>               }
>> -            i++;
>>           }
>>       }
>>   
>> -    free_poll_nodes(loop);
>> +    free(pollfds);
>> +    if (wevents)
>> +        free(wevents);
>> +#endif
>>       loop->timeout_when = LLONG_MAX;
>>       loop->timeout_where = NULL;
>> -    free(pollfds);
>> -    free(wevents);
>>   
>>       /* Handle any pending signals before doing anything else. */
>>       fatal_signal_run();
>> @@ -416,8 +584,12 @@ poll_loop(void)
>>       if (!loop) {
>>           loop = xzalloc(sizeof *loop);
>>           loop->timeout_when = LLONG_MAX;
>> +        ovs_mutex_init(&loop->loop_mutex);
>>           hmap_init(&loop->poll_nodes);
>>           xpthread_setspecific(key, loop);
>> +#ifdef OVS_USE_EPOLL
>> +        loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS);
>> +#endif
>>       }
>>       return loop;
>>   }
>> diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
>> index 3dfa80c7f..16d155989 100644
>> --- a/lib/route-table-bsd.c
>> +++ b/lib/route-table-bsd.c
>> @@ -34,6 +34,7 @@
>>   #include "ovs-router.h"
>>   #include "packets.h"
>>   #include "openvswitch/vlog.h"
>> +#include "openvswitch/poll-loop.h"
>>   #include "util.h"
>>   
>>   VLOG_DEFINE_THIS_MODULE(route_table_bsd);
>> diff --git a/lib/stream-fd.c b/lib/stream-fd.c
>> index 62f768d45..6a80d6e05 100644
>> --- a/lib/stream-fd.c
>> +++ b/lib/stream-fd.c
>> @@ -40,6 +40,8 @@ struct stream_fd
>>       struct stream stream;
>>       int fd;
>>       int fd_type;
>> +    bool rx_ready, tx_ready;
>> +    struct pollfd *hint;
>>   };
>>   
>>   static const struct stream_class stream_fd_class;
>> @@ -67,7 +69,14 @@ new_fd_stream(char *name, int fd, int connect_status, int fd_type,
>>       stream_init(&s->stream, &stream_fd_class, connect_status, name);
>>       s->fd = fd;
>>       s->fd_type = fd_type;
>> +    s->rx_ready = true;
>> +    s->tx_ready = true;
>> +    s->hint = NULL;
>>       *streamp = &s->stream;
>> +    /* Persistent registration - we always get POLLINs from now on,
>> +     * POLLOUTs when we ask for them
>> +     */
>> +    poll_fd_register(s->fd, OVS_POLLIN, &s->hint);
>>       return 0;
>>   }
>>   
>> @@ -82,6 +91,8 @@ static void
>>   fd_close(struct stream *stream)
>>   {
>>       struct stream_fd *s = stream_fd_cast(stream);
>> +    /* Deregister the FD from any persistent registrations if supported */
>> +    poll_fd_deregister(s->fd);
>>       closesocket(s->fd);
>>       free(s);
>>   }
>> @@ -104,6 +115,24 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>>       ssize_t retval;
>>       int error;
>>   
>> +    if (s->hint) {
>> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
>> +         * to the read which should return 0 if the HUP is a real one, if not we clear it
>> +         * for all other cases we believe what (e)poll has fed us.
>> +         */
>> +        if ((!(s->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (!s->rx_ready)) {
>> +            if (!(s->hint->revents & OVS_POLLIN)) {
>> +                return -EAGAIN;
>> +            } else {
>> +                /* POLLIN event from poll loop, mark us as ready */
>> +                s->rx_ready = true;
>> +                s->hint->revents &= ~OVS_POLLIN;
>> +            }
>> +        } else {
>> +            s->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
>> +        }
>> +    }
>> +
>>       retval = recv(s->fd, buffer, n, 0);
>>       if (retval < 0) {
>>           error = sock_errno();
>> @@ -114,6 +143,8 @@ fd_recv(struct stream *stream, void *buffer, size_t n)
>>   #endif
>>           if (error != EAGAIN) {
>>               VLOG_DBG_RL(&rl, "recv: %s", sock_strerror(error));
>> +        } else {
>> +            s->rx_ready = false;
>>           }
>>           return -error;
>>       }
>> @@ -127,9 +158,29 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>>       ssize_t retval;
>>       int error;
>>   
>> +    if (s->hint) {
>> +        /* poll-loop is providing us with hints for IO */
>> +        if (!s->tx_ready) {
>> +            if (!(s->hint->revents & OVS_POLLOUT)) {
>> +                return -EAGAIN;
>> +            } else {
>> +                /* POLLOUT event from poll loop, mark us as ready */
>> +                s->tx_ready = true;
>> +                s->hint->revents &= ~OVS_POLLOUT;
>> +            }
>> +        }
>> +    }
>>       retval = send(s->fd, buffer, n, 0);
>>       if (retval < 0) {
>>           error = sock_errno();
>> +#ifdef __linux__
>> +        /* Linux will sometimes return ENOBUFS on sockets instead of EAGAIN. Usually seen
>> +         *  on unix domain sockets
>> +         */
>> +        if (error == ENOBUFS) {
>> +           error = EAGAIN;
>> +        }
>> +#endif
>>   #ifdef _WIN32
>>           if (error == WSAEWOULDBLOCK) {
>>              error = EAGAIN;
>> @@ -137,6 +188,8 @@ fd_send(struct stream *stream, const void *buffer, size_t n)
>>   #endif
>>           if (error != EAGAIN) {
>>               VLOG_DBG_RL(&rl, "send: %s", sock_strerror(error));
>> +        } else {
>> +            s->tx_ready = false;
>>           }
>>           return -error;
>>       }
>> @@ -150,11 +203,11 @@ fd_wait(struct stream *stream, enum stream_wait_type wait)
>>       switch (wait) {
>>       case STREAM_CONNECT:
>>       case STREAM_SEND:
>> -        poll_fd_wait(s->fd, OVS_POLLOUT);
>> +        private_poll_fd_wait(s->fd, OVS_POLLOUT);
>>           break;
>>   
>>       case STREAM_RECV:
>> -        poll_fd_wait(s->fd, OVS_POLLIN);
>> +        private_poll_fd_wait(s->fd, OVS_POLLIN);
>>           break;
>>   
>>       default:
>> @@ -223,6 +276,8 @@ new_fd_pstream(char *name, int fd,
>>       ps->accept_cb = accept_cb;
>>       ps->unlink_path = unlink_path;
>>       *pstreamp = &ps->pstream;
>> +    /* persistent registration */
>> +    poll_fd_register(ps->fd, OVS_POLLIN, NULL);
>>       return 0;
>>   }
>>   
>> @@ -230,6 +285,7 @@ static void
>>   pfd_close(struct pstream *pstream)
>>   {
>>       struct fd_pstream *ps = fd_pstream_cast(pstream);
>> +    poll_fd_deregister(ps->fd);
>>       closesocket(ps->fd);
>>       maybe_unlink_and_free(ps->unlink_path);
>>       free(ps);
>> @@ -271,7 +327,7 @@ static void
>>   pfd_wait(struct pstream *pstream)
>>   {
>>       struct fd_pstream *ps = fd_pstream_cast(pstream);
>> -    poll_fd_wait(ps->fd, OVS_POLLIN);
>> +    private_poll_fd_wait(ps->fd, OVS_POLLIN);
>>   }
>>   
>>   static const struct pstream_class fd_pstream_class = {
>> diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c
>> index 3b7f9865e..53ae51c1b 100644
>> --- a/lib/stream-ssl.c
>> +++ b/lib/stream-ssl.c
>> @@ -147,6 +147,7 @@ struct ssl_stream
>>       /* A few bytes of header data in case SSL negotiation fails. */
>>       uint8_t head[2];
>>       short int n_head;
>> +    struct pollfd *hint;
>>   };
>>   
>>   /* SSL context created by ssl_init(). */
>> @@ -310,6 +311,8 @@ new_ssl_stream(char *name, char *server_name, int fd, enum session_type type,
>>           SSL_set_msg_callback_arg(ssl, sslv);
>>       }
>>   
>> +
>> +    poll_fd_register(sslv->fd, OVS_POLLIN, &sslv->hint);
>>       *streamp = &sslv->stream;
>>       free(server_name);
>>       return 0;
>> @@ -604,6 +607,7 @@ ssl_close(struct stream *stream)
>>       ERR_clear_error();
>>   
>>       SSL_free(sslv->ssl);
>> +    poll_fd_deregister(sslv->fd);
>>       closesocket(sslv->fd);
>>       free(sslv);
>>   }
>> @@ -697,6 +701,27 @@ ssl_recv(struct stream *stream, void *buffer, size_t n)
>>       /* Behavior of zero-byte SSL_read is poorly defined. */
>>       ovs_assert(n > 0);
>>   
>> +     if (sslv->hint) {
>> +        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
>> +         * to the read which should return 0 if the HUP is a real one, if not we clear it
>> +         * for all other cases we believe what (e)poll has fed us.
>> +         */
>> +        if ((!(sslv->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (sslv->rx_want == SSL_READING)) {
>> +            if (!(sslv->hint->revents & OVS_POLLIN)) {
>> +                return -EAGAIN;
>> +            } else {
>> +                /* POLLIN event from poll loop, mark us as ready
>> +                 * rx_want is cleared further down by reading ssl fsm
>> +                 */
>> +                sslv->hint->revents &= ~OVS_POLLIN;
>> +            }
>> +        } else {
>> +            sslv->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
>> +        }
>> +    }
>> +
>> +
>> +
>>       old_state = SSL_get_state(sslv->ssl);
>>       ret = SSL_read(sslv->ssl, buffer, n);
>>       if (old_state != SSL_get_state(sslv->ssl)) {
>> @@ -729,6 +754,19 @@ ssl_do_tx(struct stream *stream)
>>   {
>>       struct ssl_stream *sslv = ssl_stream_cast(stream);
>>   
>> +     if (sslv->hint) {
>> +        /* poll-loop is providing us with hints for IO */
>> +        if (sslv->tx_want == SSL_WRITING) {
>> +            if (!(sslv->hint->revents & OVS_POLLOUT)) {
>> +                return EAGAIN;
>> +            } else {
>> +                /* POLLOUT event from poll loop, mark us as ready
>> +                 * tx_want is cleared further down by the ssl write fsm
>> +                 */
>> +                sslv->hint->revents &= ~OVS_POLLOUT;
>> +            }
>> +        }
>> +    }
>>       for (;;) {
>>           int old_state = SSL_get_state(sslv->ssl);
>>           int ret = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size);
>> @@ -771,6 +809,8 @@ ssl_send(struct stream *stream, const void *buffer, size_t n)
>>               ssl_clear_txbuf(sslv);
>>               return n;
>>           case EAGAIN:
>> +            /* we want to know when this fd will become available again */
>> +            stream_send_wait(stream);
>>               return n;
>>           default:
>>               ssl_clear_txbuf(sslv);
>> @@ -795,7 +835,7 @@ ssl_run_wait(struct stream *stream)
>>       struct ssl_stream *sslv = ssl_stream_cast(stream);
>>   
>>       if (sslv->tx_want != SSL_NOTHING) {
>> -        poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
>> +        private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
>>       }
>>   }
>>   
>> @@ -811,13 +851,13 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>>           } else {
>>               switch (sslv->state) {
>>               case STATE_TCP_CONNECTING:
>> -                poll_fd_wait(sslv->fd, OVS_POLLOUT);
>> +                private_poll_fd_wait(sslv->fd, OVS_POLLOUT);
>>                   break;
>>   
>>               case STATE_SSL_CONNECTING:
>>                   /* ssl_connect() called SSL_accept() or SSL_connect(), which
>>                    * set up the status that we test here. */
>> -                poll_fd_wait(sslv->fd,
>> +                private_poll_fd_wait(sslv->fd,
>>                                  want_to_poll_events(SSL_want(sslv->ssl)));
>>                   break;
>>   
>> @@ -829,7 +869,7 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
>>   
>>       case STREAM_RECV:
>>           if (sslv->rx_want != SSL_NOTHING) {
>> -            poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
>> +            private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
>>           } else {
>>               poll_immediate_wake();
>>           }
>> @@ -911,6 +951,7 @@ pssl_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
>>                    ds_steal_cstr(&bound_name));
>>       pstream_set_bound_port(&pssl->pstream, htons(port));
>>       pssl->fd = fd;
>> +    poll_fd_register(fd, OVS_POLLIN, NULL);
>>       *pstreamp = &pssl->pstream;
>>   
>>       return 0;
>> @@ -920,6 +961,7 @@ static void
>>   pssl_close(struct pstream *pstream)
>>   {
>>       struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
>> +    poll_fd_deregister(pssl->fd);
>>       closesocket(pssl->fd);
>>       free(pssl);
>>   }
>> diff --git a/lib/timeval.c b/lib/timeval.c
>> index 193c7bab1..59a12414f 100644
>> --- a/lib/timeval.c
>> +++ b/lib/timeval.c
>> @@ -38,6 +38,7 @@
>>   #include "unixctl.h"
>>   #include "util.h"
>>   #include "openvswitch/vlog.h"
>> +#include "openvswitch/poll-loop.h"
>>   
>>   VLOG_DEFINE_THIS_MODULE(timeval);
>>   
>> @@ -369,6 +370,88 @@ time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED,
>>       return retval;
>>   }
>>   
>> +#ifdef OVS_USE_EPOLL
>> +
>> +/* Like epoll_wait(), except:
>> + *
>> + *      - The timeout is specified as an absolute time, as defined by
>> + *        time_msec(), instead of a duration.
>> + *
>> + *      - On error, returns a negative error code (instead of setting errno).
>> + *
>> + *      - If interrupted by a signal, retries automatically until the original
>> + *        timeout is reached.  (Because of this property, this function will
>> + *        never return -EINTR.)
>> + *
>> + * Stores the number of milliseconds elapsed during poll in '*elapsed'. */
>> +int
>> +time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
>> +          long long int timeout_when, int *elapsed)
>> +{
>> +    long long int *last_wakeup = last_wakeup_get();
>> +    long long int start;
>> +    bool quiescent;
>> +    int retval = 0;
>> +
>> +    time_init();
>> +    coverage_clear();
>> +    coverage_run();
>> +    if (*last_wakeup && !thread_is_pmd()) {
>> +        log_poll_interval(*last_wakeup);
>> +    }
>> +    start = time_msec();
>> +
>> +    timeout_when = MIN(timeout_when, deadline);
>> +    quiescent = ovsrcu_is_quiescent();
>> +
>> +    for (;;) {
>> +        long long int now = time_msec();
>> +        int time_left;
>> +
>> +        if (now >= timeout_when) {
>> +            time_left = 0;
>> +        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
>> +            time_left = INT_MAX;
>> +        } else {
>> +            time_left = timeout_when - now;
>> +        }
>> +
>> +        if (!quiescent) {
>> +            if (!time_left) {
>> +                ovsrcu_quiesce();
>> +            } else {
>> +                ovsrcu_quiesce_start();
>> +            }
>> +        }
>> +
>> +        retval = epoll_wait(epoll_fd, events, max, time_left);
>> +        if (retval < 0) {
>> +            retval = -errno;
>> +        }
>> +
>> +        if (!quiescent && time_left) {
>> +            ovsrcu_quiesce_end();
>> +        }
>> +
>> +        if (deadline <= time_msec()) {
>> +            fatal_signal_handler(SIGALRM);
>> +            if (retval < 0) {
>> +                retval = 0;
>> +            }
>> +            break;
>> +        }
>> +
>> +        if (retval != -EINTR) {
>> +            break;
>> +        }
>> +    }
>> +    *last_wakeup = time_msec();
>> +    refresh_rusage();
>> +    *elapsed = *last_wakeup - start;
>> +    return retval;
>> +}
>> +#endif
>> +
>>   long long int
>>   timespec_to_msec(const struct timespec *ts)
>>   {
>> diff --git a/lib/timeval.h b/lib/timeval.h
>> index 502f703d4..347a09d63 100644
>> --- a/lib/timeval.h
>> +++ b/lib/timeval.h
>> @@ -20,6 +20,9 @@
>>   #include <time.h>
>>   #include "openvswitch/type-props.h"
>>   #include "util.h"
>> +#ifdef __linux__
>> +#include <sys/epoll.h>
>> +#endif
>>   
>>   #ifdef  __cplusplus
>>   extern "C" {
>> @@ -61,6 +64,10 @@ void time_wall_timespec(struct timespec *);
>>   void time_alarm(unsigned int secs);
>>   int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
>>                 long long int timeout_when, int *elapsed);
>> +#ifdef __linux__
>> +int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
>> +          long long int timeout_when, int *elapsed);
>> +#endif
>>   
>>   long long int timespec_to_msec(const struct timespec *);
>>   long long int timespec_to_usec(const struct timespec *);
>>
> 
>
Dumitru Ceara Feb. 19, 2020, 2:20 p.m. UTC | #6
On 2/18/20 7:12 AM, Anton Ivanov wrote:
> 
> 
> On 17/02/2020 14:48, Dumitru Ceara wrote:
>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>
>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>> These are waited upon in the same thread where they are created. This
>>> allows them to be registered persistently with the OS (if possible)
>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>> etc.
>>>
>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>> which is not ready if that fd has been registered as "private" to the
>>> thread which waits upon it.
>>>
>>> 3. No longer breaks other parts of OVS which create the fd in one
>>> thread and waits upon it in others.
>>>
>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>>> poll++ frameworks in other OSes.
>>>
>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>> using a "central (e)poll dispatcher + IO threads" pattern
>>>
>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> Hi Anton,
>>
>> A couple of issues inline. Except for that:
>>
>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>> failing with your patches applied:
>>
>> make check TESTSUITEFLAGS='-k "flush the fdb"'
> 
> I will have a look.
> 
>>
>> 2. Travis CI build fails:
>>
>> lib/fatal-signal.c:244:5: error: ignoring return value of ‘read’,
>> declared with attribute warn_unused_result [-Werror=unused-result]
>>
>>       read(signal_fds[0], sigbuffer, sizeof(sigbuffer))
>>
>> 3. Travis CI OSX build fails:
>>
>> lib/poll-loop.c:46:1: error: unused function 'poll_create_node_add'
>> [-Werror,-Wunused-function]
>>
>> COVERAGE_DEFINE(poll_create_node);
> 
> I will fix all of these in the next version. The CI was out with the
> daisies when I submitted the patch last week so I did not see the logs
> until yesterday.
> 
>>
>> 4. While OVS might benefit from these changes I'm wondering about OVN
>> and ovsdb-server specifically. ovsdb-server is single threaded and
>> usually on large scale deployments we don't really see "poll" as the
>> bottleneck or even the fact that code tries to read/write from FDs when
>> FDs are not available for read/write.
>>
>> For example, here are results of running a scale test scenario which
>> repeats the following iteration 300 times:
>> - bring up a node (ovn-fake-multinode container) and connect it to the
>> OVN Southbound DB.
>> - configure an OVN logical switch to be bound to the new node.
>> - configure an OVN logical switch port on the new logical switch.
>> - configure an OVS internal interface on the new node and bind it to the
>> OVN logical switch port.
>> - wait until the new internal interface can ping its default gateway
>> through OVN (i.e., until ovn-controller on the node received all updates
>> from the SB DB and installed all OVS flows), highlighted in the output.
>>
>> The tests use rally-ovs (ovn-scale-test) on a 9 server setup (1 machine
>> running OVN ovsdb-servers and ovn-northd and 8 machines simulating
>> chassis using ovn-fake-multinode), in particular this modified scenario:
>> https://github.com/dceara/ovn-scale-test/blob/ovn-switch-per-node/samples/tasks/scenarios/ovn-network/osh_workload_incremental.json
>>
>>
>> With OVS master and OVN master:
>> http://pastebin.test.redhat.com/836568
>>
>> With OVS master + your patches and OVN master:
>> http://pastebin.test.redhat.com/836571
>>
>> Here are some of the logs we get on the OVN Southbound DB ovsdb-server
>> that show that ovsdb-server spends up to 2 seconds in a single loop
>> iteration sending/receiving updates to/from ovn-controllers:
>>
>> 2020-02-17T10:43:41.175Z|01991|poll_loop|INFO|wakeup due to [OVS_POLLIN]
>> on fd 140 (192.16.0.1:6642<->192.16.0.120:52018) at lib/stream-fd.c:79
>> (84% CPU usage)
>> 2020-02-17T10:43:43.338Z|01992|timeval|WARN|Unreasonably long 2163ms
>> poll interval (2144ms user, 9ms system)
>> 2020-02-17T10:43:43.339Z|01993|timeval|WARN|faults: 590 minor, 0 major
>> 2020-02-17T10:43:43.339Z|01994|timeval|WARN|disk: 0 reads, 8 writes
>> 2020-02-17T10:43:43.339Z|01995|timeval|WARN|context switches: 0
>> voluntary, 4 involuntary
>> 2020-02-17T10:43:43.339Z|01996|poll_loop|INFO|Dropped 63 log messages in
>> last 2 seconds (most recently, 2 seconds ago) due to excessive rate
>> 2020-02-17T10:43:43.339Z|01997|poll_loop|INFO|wakeup due to [OVS_POLLIN]
>> on fd 76 (192.16.0.1:6642<->192.16.0.56:33538) at lib/stream-fd.c:79
>> (84% CPU usage)
>> 2020-02-17T10:43:45.495Z|01998|timeval|WARN|Unreasonably long 2156ms
>> poll interval (2129ms user, 17ms system)
>> 2020-02-17T10:43:45.495Z|01999|timeval|WARN|faults: 738 minor, 0 major
>> 2020-02-17T10:43:45.495Z|02000|timeval|WARN|context switches: 0
>> voluntary, 7 involuntary
>> 2020-02-17T10:43:47.651Z|02001|timeval|WARN|Unreasonably long 2157ms
>> poll interval (2136ms user, 10ms system)
>>
>> In this case, and I think in most OVN use cases, ovsdb-server is busy
>> because it actually has to send updates to large numbers of
>> ovn-controllers connected to it. Unless I'm missing something the epoll
>> change seems to improve performance only in cases where the Southbound
>> DB doesn't do much sending/receiving. 
> 
> 1. Correct - it improves the handling and the cost of idle connections.
> At present each connection is a fixed cost regardless of whether it
> needs servicing or not.
> 
> 2. It should also improve the cost of handling of many peers to send if
> there is enough outstanding data on sockets to create EAGAIN on send.
> 
> 3. It does not fix the fundamental problem that the logic in ovsdb is
> single threaded. It does, however allow json+io+ssl to become
> multi-threaded which should leave just the logic in the main ovsdb-server
> thread. You cannot do that effectively without having information on the
> state of the socket and does it need servicing.
> 
> 4. I have seen the rally tests - they are flat scale-up. I agree that
> the help from fixing the IO loop will be minimal because there is IO on
> most fds to be serviced at all times. I would not expect them to help a
> lot there.
> 
> The difference from fixing the IO (leaving aside that it is a
> prerequisite to getting SSL to worker threads) is not in scaling up, but
> running at scale. The current "ram the EAGAIN wall until it gives up"
> design gives you a mandatory penalty per transaction deploying config to
> one node while running (because of attempted failed reads on EAGAIN).
> The correct test for that is the CPU cost and latency to deploy a
> logical flow when running at steady state in f.e. 300 nodes.

Currently when a logical flow is added to the SB DB for a logical
datapath (logical switch or logical router) all ovn-controllers that
consider the logical datapath as "local" will receive the update from
ovsdb SB.

ovn-controller local datapaths are:
- all logical switch datapaths that have ports bound to the local chassis.
- all logical datapaths corresponding to logical routers connected
through OVN patch ports to local logical switches.

While it's true that in some deployments this will limit the number of
ovn-controllers receiving the update, in OVN-k8s, for example, all node
logical switches are connected to a single cluster logical router. Once
a logical flow is added to any of the node logical switches all
ovn-controllers on all nodes should receive the update. This means that
there won't be too many idle connections during a SB DB logical flow update.

I agree that the right way to test is to see how long it takes for a
logical flow to be installed, which is what the rally-ovs tests do: they
measure how long until the SB DB logical flow is translated to openflow
by ovn-controller when a new node is brought up. At iteration X+1 we are
at a steady state (X nodes up and configured) and node X+1 is brought up
and its corresponding logical flows are added.

> 
> The current design as written has that penalty as a given - it is
> unavoidable and it also grows linear (or worse) with size from other
> factors in addition to EAGAIN - f.e. you also start copying a large pollfd
> array to the kernel back and forth on every iteration.
> 
> I have not seen any tests trying to quantify this on a actual cluster.
> All tests I have seen so far are scale-up and/or running a single
> synthetic application on the whole cluster.
> 
> Mine are mostly taking the code in question and running it outside of
> OVS/OVN on a harness.
> 
> 
>> How do you test
>> performance/scalability improvements?
>>
>> Regards,
>> Dumitru
>>
Anton Ivanov Feb. 19, 2020, 5:20 p.m. UTC | #7
On 19/02/2020 14:20, Dumitru Ceara wrote:
> On 2/18/20 7:12 AM, Anton Ivanov wrote:
>>
>>
>> On 17/02/2020 14:48, Dumitru Ceara wrote:
>>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>
>>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>>> These are waited upon in the same thread where they are created. This
>>>> allows them to be registered persistently with the OS (if possible)
>>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>>> etc.
>>>>
>>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>>> which is not ready if that fd has been registered as "private" to the
>>>> thread which waits upon it.
>>>>
>>>> 3. No longer breaks other parts of OVS which create the fd in one
>>>> thread and waits upon it in others.
>>>>
>>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>>>> poll++ frameworks in other OSes.
>>>>
>>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>>> using a "central (e)poll dispatcher + IO threads" pattern
>>>>
>>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>
>>> Hi Anton,
>>>
>>> A couple of issues inline. Except for that:
>>>
>>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>>> failing with your patches applied:
>>>
>>> make check TESTSUITEFLAGS='-k "flush the fdb"'
>>
>> I will have a look.
>>
>>>
>>> 2. Travis CI build fails:
>>>
>>> lib/fatal-signal.c:244:5: error: ignoring return value of ‘read’,
>>> declared with attribute warn_unused_result [-Werror=unused-result]
>>>
>>>        read(signal_fds[0], sigbuffer, sizeof(sigbuffer))
>>>
>>> 3. Travis CI OSX build fails:
>>>
>>> lib/poll-loop.c:46:1: error: unused function 'poll_create_node_add'
>>> [-Werror,-Wunused-function]
>>>
>>> COVERAGE_DEFINE(poll_create_node);
>>
>> I will fix all of these in the next version. The CI was out with the
>> daisies when I submitted the patch last week so I did not see the logs
>> until yesterday.
>>
>>>
>>> 4. While OVS might benefit from these changes I'm wondering about OVN
>>> and ovsdb-server specifically. ovsdb-server is single threaded and
>>> usually on large scale deployments we don't really see "poll" as the
>>> bottleneck or even the fact that code tries to read/write from FDs when
>>> FDs are not available for read/write.
>>>
>>> For example, here are results of running a scale test scenario which
>>> repeats the following iteration 300 times:
>>> - bring up a node (ovn-fake-multinode container) and connect it to the
>>> OVN Southbound DB.
>>> - configure an OVN logical switch to be bound to the new node.
>>> - configure an OVN logical switch port on the new logical switch.
>>> - configure an OVS internal interface on the new node and bind it to the
>>> OVN logical switch port.
>>> - wait until the new internal interface can ping its default gateway
>>> through OVN (i.e., until ovn-controller on the node received all updates
>>> from the SB DB and installed all OVS flows), highlighted in the output.
>>>
>>> The tests use rally-ovs (ovn-scale-test) on a 9 server setup (1 machine
>>> running OVN ovsdb-servers and ovn-northd and 8 machines simulating
>>> chassis using ovn-fake-multinode), in particular this modified scenario:
>>> https://github.com/dceara/ovn-scale-test/blob/ovn-switch-per-node/samples/tasks/scenarios/ovn-network/osh_workload_incremental.json
>>>
>>>
>>> With OVS master and OVN master:
>>> http://pastebin.test.redhat.com/836568
>>>
>>> With OVS master + your patches and OVN master:
>>> http://pastebin.test.redhat.com/836571
>>>
>>> Here are some of the logs we get on the OVN Southbound DB ovsdb-server
>>> that show that ovsdb-server spends up to 2 seconds in a single loop
>>> iteration sending/receiving updates to/from ovn-controllers:
>>>
>>> 2020-02-17T10:43:41.175Z|01991|poll_loop|INFO|wakeup due to [OVS_POLLIN]
>>> on fd 140 (192.16.0.1:6642<->192.16.0.120:52018) at lib/stream-fd.c:79
>>> (84% CPU usage)
>>> 2020-02-17T10:43:43.338Z|01992|timeval|WARN|Unreasonably long 2163ms
>>> poll interval (2144ms user, 9ms system)
>>> 2020-02-17T10:43:43.339Z|01993|timeval|WARN|faults: 590 minor, 0 major
>>> 2020-02-17T10:43:43.339Z|01994|timeval|WARN|disk: 0 reads, 8 writes
>>> 2020-02-17T10:43:43.339Z|01995|timeval|WARN|context switches: 0
>>> voluntary, 4 involuntary
>>> 2020-02-17T10:43:43.339Z|01996|poll_loop|INFO|Dropped 63 log messages in
>>> last 2 seconds (most recently, 2 seconds ago) due to excessive rate
>>> 2020-02-17T10:43:43.339Z|01997|poll_loop|INFO|wakeup due to [OVS_POLLIN]
>>> on fd 76 (192.16.0.1:6642<->192.16.0.56:33538) at lib/stream-fd.c:79
>>> (84% CPU usage)
>>> 2020-02-17T10:43:45.495Z|01998|timeval|WARN|Unreasonably long 2156ms
>>> poll interval (2129ms user, 17ms system)
>>> 2020-02-17T10:43:45.495Z|01999|timeval|WARN|faults: 738 minor, 0 major
>>> 2020-02-17T10:43:45.495Z|02000|timeval|WARN|context switches: 0
>>> voluntary, 7 involuntary
>>> 2020-02-17T10:43:47.651Z|02001|timeval|WARN|Unreasonably long 2157ms
>>> poll interval (2136ms user, 10ms system)
>>>
>>> In this case, and I think in most OVN use cases, ovsdb-server is busy
>>> because it actually has to send updates to large numbers of
>>> ovn-controllers connected to it. Unless I'm missing something the epoll
>>> change seems to improve performance only in cases where the Southbound
>>> DB doesn't do much sending/receiving.
>>
>> 1. Correct - it improves the handling and the cost of idle connections.
>> At present each connection is a fixed cost regardless of whether it
>> needs servicing or not.
>>
>> 2. It should also improve the cost of handling of many peers to send if
>> there is enough outstanding data on sockets to create EAGAIN on send.
>>
>> 3. It does not fix the fundamental problem that the logic in ovsdb is
>> single threaded. It does, however allow json+io+ssl to become
>> multi-threaded which should leave just the logic in the main ovsdb-server
>> thread. You cannot do that effectively without having information on the
>> state of the socket and does it need servicing.
>>
>> 4. I have seen the rally tests - they are flat scale-up. I agree that
>> the help from fixing the IO loop will be minimal because there is IO on
>> most fds to be serviced at all times. I would not expect them to help a
>> lot there.
>>
>> The difference from fixing the IO (leaving aside that it is a
>> prerequisite to getting SSL to worker threads) is not in scaling up, but
>> running at scale. The current "ram the EAGAIN wall until it gives up"
>> design gives you a mandatory penalty per transaction deploying config to
>> one node while running (because of attempted failed reads on EAGAIN).
>> The correct test for that is the CPU cost and latency to deploy a
>> logical flow when running at steady state in f.e. 300 nodes.
> 
> Currently when a logical flow is added to the SB DB for a logical
> datapath (logical switch or logical router) all ovn-controllers that
> consider the logical datapath as "local" will receive the update from
> ovsdb SB.
> 
> ovn-controller local datapaths are:
> - all logical switch datapaths that have ports bound to the local chassis.
> - all logical datapaths corresponding to logical routers connected
> through OVN patch ports to local logical switches.
> 
> While it's true that in some deployments this will limit the number of
> ovn-controllers receiving the update, in OVN-k8s, for example, all node
> logical switches are connected to a single cluster logical router. Once
> a logical flow is added to any of the node logical switches all
> ovn-controllers on all nodes should receive the update. This means that
> there won't be too many idle connections during a SB DB logical flow update.

The example you are giving is a clear case of parallel IO for which you 
need a working IO event loop and retained state on where you are at 
present with IO for each connection. You cannot effectively parallelise 
IO+SSL while discarding all information about it.

Fixing the IO events is a prerequisite for doing other stuff as well - 
SSL threading, processing threading, etc. Sure you can try parallelising 
IO without any information on what is your IO state and tie that into 
some level of parallelism in the processing. That usually results in 
something that resembles spaghetti and is nightmare to continue scaling 
further.

> 
> I agree that the right way to test is to see how long it takes for a
> logical flow to be installed, which is what the rally-ovs tests do: they
> measure how long until the SB DB logical flow is translated to openflow
> by ovn-controller when a new node is brought up. At iteration X+1 we are
> at a steady state (X nodes up and configured) and node X+1 is brought up
> and its corresponding logical flows are added.
> 
>>
>> The current design as written has that penalty as a given - it is
>> unavoidable and it also grows linear (or worse) with size from other
>> factors in addition to EAGAIN - f.e. you also start copying a large pollfd
>> array to the kernel back and forth on every iteration.
>>
>> I have not seen any tests trying to quantify this on a actual cluster.
>> All tests I have seen so far are scale-up and/or running a single
>> synthetic application on the whole cluster.
>>
>> Mine are mostly taking the code in question and running it outside of
>> OVS/OVN on a harness.
>>
>>
>>> How do you test
>>> performance/scalability improvements?
>>>
>>> Regards,
>>> Dumitru
>>>
> 
>
Anton Ivanov Feb. 24, 2020, 10:47 a.m. UTC | #8
On 19/02/2020 14:20, Dumitru Ceara wrote:
> On 2/18/20 7:12 AM, Anton Ivanov wrote:
>>
>> On 17/02/2020 14:48, Dumitru Ceara wrote:
>>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>
>>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>>> These are waited upon in the same thread where they are created. This
>>>> allows them to be registered persistently with the OS (if possible)
>>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>>> etc.
>>>>
>>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>>> which is not ready if that fd has been registered as "private" to the
>>>> thread which waits upon it.
>>>>
>>>> 3. No longer breaks other parts of OVS which create the fd in one
>>>> thread and waits upon it in others.
>>>>
>>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>>>> poll++ frameworks in other OSes.
>>>>
>>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>>> using a "central (e)poll dispatcher + IO threads" pattern
>>>>
>>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>> Hi Anton,
>>>
>>> A couple of issues inline. Except for that:
>>>
>>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>>> failing with your patches applied:
>>>
>>> make check TESTSUITEFLAGS='-k "flush the fdb"'
>> I will have a look.


I cannot reproduce that. It succeeds every time - 20 out of 20 runs. Can you send me some logs please?


>>
>>> 2. Travis CI build fails:
>>>
>>> lib/fatal-signal.c:244:5: error: ignoring return value of ‘read’,
>>> declared with attribute warn_unused_result [-Werror=unused-result]
>>>
>>>        read(signal_fds[0], sigbuffer, sizeof(sigbuffer))
>>>
>>> 3. Travis CI OSX build fails:
>>>
>>> lib/poll-loop.c:46:1: error: unused function 'poll_create_node_add'
>>> [-Werror,-Wunused-function]
>>>
>>> COVERAGE_DEFINE(poll_create_node);
>> I will fix all of these in the next version. The CI was out with the
>> daisies when I submitted the patch last week so I did not see the logs
>> until yesterday.
>>
>>> 4. While OVS might benefit from these changes I'm wondering about OVN
>>> and ovsdb-server specifically. ovsdb-server is single threaded and
>>> usually on large scale deployments we don't really see "poll" as the
>>> bottleneck or even the fact that code tries to read/write from FDs when
>>> FDs are not available for read/write.
>>>
>>> For example, here are results of running a scale test scenario which
>>> repeats the following iteration 300 times:
>>> - bring up a node (ovn-fake-multinode container) and connect it to the
>>> OVN Southbound DB.
>>> - configure an OVN logical switch to be bound to the new node.
>>> - configure an OVN logical switch port on the new logical switch.
>>> - configure an OVS internal interface on the new node and bind it to the
>>> OVN logical switch port.
>>> - wait until the new internal interface can ping its default gateway
>>> through OVN (i.e., until ovn-controller on the node received all updates
>>> from the SB DB and installed all OVS flows), highlighted in the output.
>>>
>>> The tests use rally-ovs (ovn-scale-test) on a 9 server setup (1 machine
>>> running OVN ovsdb-servers and ovn-northd and 8 machines simulating
>>> chassis using ovn-fake-multinode), in particular this modified scenario:
>>> https://github.com/dceara/ovn-scale-test/blob/ovn-switch-per-node/samples/tasks/scenarios/ovn-network/osh_workload_incremental.json
>>>
>>>
>>> With OVS master and OVN master:
>>> http://pastebin.test.redhat.com/836568
>>>
>>> With OVS master + your patches and OVN master:
>>> http://pastebin.test.redhat.com/836571
>>>
>>> Here are some of the logs we get on the OVN Southbound DB ovsdb-server
>>> that show that ovsdb-server spends up to 2 seconds in a single loop
>>> iteration sending/receiving updates to/from ovn-controllers:
>>>
>>> 2020-02-17T10:43:41.175Z|01991|poll_loop|INFO|wakeup due to [OVS_POLLIN]
>>> on fd 140 (192.16.0.1:6642<->192.16.0.120:52018) at lib/stream-fd.c:79
>>> (84% CPU usage)
>>> 2020-02-17T10:43:43.338Z|01992|timeval|WARN|Unreasonably long 2163ms
>>> poll interval (2144ms user, 9ms system)
>>> 2020-02-17T10:43:43.339Z|01993|timeval|WARN|faults: 590 minor, 0 major
>>> 2020-02-17T10:43:43.339Z|01994|timeval|WARN|disk: 0 reads, 8 writes
>>> 2020-02-17T10:43:43.339Z|01995|timeval|WARN|context switches: 0
>>> voluntary, 4 involuntary
>>> 2020-02-17T10:43:43.339Z|01996|poll_loop|INFO|Dropped 63 log messages in
>>> last 2 seconds (most recently, 2 seconds ago) due to excessive rate
>>> 2020-02-17T10:43:43.339Z|01997|poll_loop|INFO|wakeup due to [OVS_POLLIN]
>>> on fd 76 (192.16.0.1:6642<->192.16.0.56:33538) at lib/stream-fd.c:79
>>> (84% CPU usage)
>>> 2020-02-17T10:43:45.495Z|01998|timeval|WARN|Unreasonably long 2156ms
>>> poll interval (2129ms user, 17ms system)
>>> 2020-02-17T10:43:45.495Z|01999|timeval|WARN|faults: 738 minor, 0 major
>>> 2020-02-17T10:43:45.495Z|02000|timeval|WARN|context switches: 0
>>> voluntary, 7 involuntary
>>> 2020-02-17T10:43:47.651Z|02001|timeval|WARN|Unreasonably long 2157ms
>>> poll interval (2136ms user, 10ms system)
>>>
>>> In this case, and I think in most OVN use cases, ovsdb-server is busy
>>> because it actually has to send updates to large numbers of
>>> ovn-controllers connected to it. Unless I'm missing something the epoll
>>> change seems to improve performance only in cases where the Southbound
>>> DB doesn't do much sending/receiving.
>> 1. Correct - it improves the handling and the cost of idle connections.
>> At present each connection is a fixed cost regardless of whether it
>> needs servicing or not.
>>
>> 2. It should also improve the cost of handling of many peers to send if
>> there is enough outstanding data on sockets to create EAGAIN on send.
>>
>> 3. It does not fix the fundamental problem that the logic in ovsdb is
>> single threaded. It does, however allow json+io+ssl to become
>> multi-threaded which should leave just the logic in the main ovsdb-server
>> thread. You cannot do that effectively without having information on the
>> state of the socket and does it need servicing.
>>
>> 4. I have seen the rally tests - they are flat scale-up. I agree that
>> the help from fixing the IO loop will be minimal because there is IO on
>> most fds to be serviced at all times. I would not expect them to help a
>> lot there.
>>
>> The difference from fixing the IO (leaving aside that it is a
>> prerequisite to getting SSL to worker threads) is not in scaling up, but
>> running at scale. The current "ram the EAGAIN wall until it gives up"
>> design gives you a mandatory penalty per transaction deploying config to
>> one node while running (because of attempted failed reads on EAGAIN).
>> The correct test for that is the CPU cost and latency to deploy a
>> logical flow when running at steady state in f.e. 300 nodes.
> Currently when a logical flow is added to the SB DB for a logical
> datapath (logical switch or logical router) all ovn-controllers that
> consider the logical datapath as "local" will receive the update from
> ovsdb SB.
>
> ovn-controller local datapaths are:
> - all logical switch datapaths that have ports bound to the local chassis.
> - all logical datapaths corresponding to logical routers connected
> through OVN patch ports to local logical switches.
>
> While it's true that in some deployments this will limit the number of
> ovn-controllers receiving the update, in OVN-k8s, for example, all node
> logical switches are connected to a single cluster logical router. Once
> a logical flow is added to any of the node logical switches all
> ovn-controllers on all nodes should receive the update. This means that
> there won't be too many idle connections during a SB DB logical flow update.
>
> I agree that the right way to test is to see how long it takes for a
> logical flow to be installed, which is what the rally-ovs tests do: they
> measure how long until the SB DB logical flow is translated to openflow
> by ovn-controller when a new node is brought up. At iteration X+1 we are
> at a steady state (X nodes up and configured) and node X+1 is brought up
> and its corresponding logical flows are added.
>
>> The current design as written has that penalty as a given - it is
>> unavoidable and it also grows linear (or worse) with size from other
>> factors in addition to EAGAIN - f.e. you also start copying a large pollfd
>> array to the kernel back and forth on every iteration.
>>
>> I have not seen any tests trying to quantify this on a actual cluster.
>> All tests I have seen so far are scale-up and/or running a single
>> synthetic application on the whole cluster.
>>
>> Mine are mostly taking the code in question and running it outside of
>> OVS/OVN on a harness.
>>
>>
>>> How do you test
>>> performance/scalability improvements?
>>>
>>> Regards,
>>> Dumitru
>>>
>
Dumitru Ceara Feb. 24, 2020, 12:58 p.m. UTC | #9
On Mon, Feb 24, 2020 at 11:47 AM Anton Ivanov
<anton.ivanov@cambridgegreys.com> wrote:
>
>
> On 19/02/2020 14:20, Dumitru Ceara wrote:
> > On 2/18/20 7:12 AM, Anton Ivanov wrote:
> >>
> >> On 17/02/2020 14:48, Dumitru Ceara wrote:
> >>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
> >>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> >>>>
> >>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
> >>>> These are waited upon in the same thread where they are created. This
> >>>> allows them to be registered persistently with the OS (if possible)
> >>>> as well as the OS to provide hints - is the FD ready, is it closed,
> >>>> etc.
> >>>>
> >>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
> >>>> which is not ready if that fd has been registered as "private" to the
> >>>> thread which waits upon it.
> >>>>
> >>>> 3. No longer breaks other parts of OVS which create the fd in one
> >>>> thread and waits upon it in others.
> >>>>
> >>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
> >>>> poll++ frameworks in other OSes.
> >>>>
> >>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
> >>>> using a "central (e)poll dispatcher + IO threads" pattern
> >>>>
> >>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> >>> Hi Anton,
> >>>
> >>> A couple of issues inline. Except for that:
> >>>
> >>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
> >>> failing with your patches applied:
> >>>
> >>> make check TESTSUITEFLAGS='-k "flush the fdb"'
> >> I will have a look.
>
>
> I cannot reproduce that. It succeeds every time - 20 out of 20 runs. Can you send me some logs please?
>
>

Hi Anton,

After rebasing to latest upstream master I see the failure less often.
But it's still there.

I pasted the testsuite below.

Regards,
Dumitru

$ cat tests/testsuite.dir/2185/testsuite.log
#                             -*- compilation -*-
2185. stp.at:467: testing STP - flush the fdb and mdb when topology changed ...
./stp.at:468: ovsdb-tool create conf.db
$abs_top_srcdir/vswitchd/vswitch.ovsschema
./stp.at:468: ovsdb-server --detach --no-chdir --pidfile --log-file
--remote=punix:$OVS_RUNDIR/db.sock
stderr:
2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
./stp.at:468: sed < stderr '
/vlog|INFO|opened log file/d
/ovsdb_server|INFO|ovsdb-server (Open vSwitch)/d'
./stp.at:468: ovs-vsctl --no-wait init
./stp.at:468: ovs-vswitchd --enable-dummy --disable-system
--disable-system-route  --detach --no-chdir --pidfile --log-file
-vvconn -vofproto_dpif -vunixctl
stderr:
2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
2020-02-24T12:56:14Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
2020-02-24T12:56:14Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
2020-02-24T12:56:14Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
connecting...
2020-02-24T12:56:14Z|00005|netlink_socket|INFO|netlink: could not
enable listening to all nsid (Operation not permitted)
2020-02-24T12:56:14Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
connected
./stp.at:468: sed < stderr '
/ovs_numa|INFO|Discovered /d
/vlog|INFO|opened log file/d
/vswitchd|INFO|ovs-vswitchd (Open vSwitch)/d
/reconnect|INFO|/d
/dpif_netlink|INFO|Generic Netlink family .ovs_datapath. does not exist/d
/ofproto|INFO|using datapath ID/d
/netdev_linux|INFO|.*device has unknown hardware address family/d
/ofproto|INFO|datapath ID changed to fedcba9876543210/d
/dpdk|INFO|DPDK Disabled - Use other_config:dpdk-init to enable/d
/netlink_socket|INFO|netlink: could not enable listening to all nsid/d
/probe tc:/d
/tc: Using policy/d'
./stp.at:468: add_of_br 0
./stp.at:471:
    ovs-vsctl -- \
    set port br0 other_config:stp-enable=false -- \
    set bridge br0 datapath-type=dummy -- \
    set bridge br0 stp_enable=true mcast_snooping_enable=true \
    other-config:hwaddr=aa:66:aa:66:00:00 -- \
    add-br br1 -- \
    set port br1 other_config:stp-enable=false -- \
    set bridge br1 datapath-type=dummy -- \
    set bridge br1 stp_enable=true mcast_snooping_enable=true \
    other-config:hwaddr=aa:66:aa:66:00:01 -- \
    add-br br2 -- \
    set port br2 other_config:stp-enable=false -- \
    set bridge br2 datapath-type=dummy -- \
    set bridge br2 stp_enable=true mcast_snooping_enable=true \
    other-config:hwaddr=aa:66:aa:66:00:02

./stp.at:489: ovs-appctl vlog/set ofproto_dpif:dbg
./stp.at:490: ovs-appctl vlog/set ofproto_dpif_xlate:dbg
./stp.at:492: ovs-ofctl add-flow br0 action=normal
./stp.at:493: ovs-ofctl add-flow br1 action=normal
./stp.at:494: ovs-ofctl add-flow br2 action=normal
./stp.at:496:
    ovs-vsctl add-port br0 p1 -- \
        set interface p1 type=dummy
options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1
    ovs-vsctl add-port br0 p2 -- \
        set interface p2 type=dummy
options:stream=unix:$OVS_RUNDIR/p6.sock ofport_request=2
    ovs-vsctl add-port br1 p3 -- \
        set interface p3 type=dummy
options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3
    ovs-vsctl add-port br1 p4 -- \
        set interface p4 type=dummy
options:pstream=punix:$OVS_RUNDIR/p4.sock ofport_request=4
    ovs-vsctl add-port br2 p5 -- \
        set interface p5 type=dummy
options:stream=unix:$OVS_RUNDIR/p4.sock ofport_request=5
    ovs-vsctl add-port br2 p6 -- \
        set interface p6 type=dummy
options:pstream=punix:$OVS_RUNDIR/p6.sock ofport_request=6

OK
warped
./stp.at:517: cat ovs-vswitchd.log |
grep 'disabled to listening' | sed '
  s/.*ofproto_dpif|.*|port .*:/port <>:/
'
warped
./stp.at:529: cat ovs-vswitchd.log |
grep 'learning to forwarding' | sed '
  s/.*ofproto_dpif|.*|port .*:/port <>:/
'
--- - 2020-02-24 13:56:14.337746686 +0100
+++ /home/dceara/git-repos/ovs/tests/testsuite.dir/at-groups/2185/stdout
2020-02-24 13:56:14.000000000 +0100
@@ -3,4 +3,5 @@
 port <>: STP state changed from learning to forwarding
 port <>: STP state changed from learning to forwarding
 port <>: STP state changed from learning to forwarding
+port <>: STP state changed from learning to forwarding

ovsdb-server.log:
> 2020-02-24T12:56:14.033Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
> 2020-02-24T12:56:14.038Z|00002|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.13.90
ovs-vswitchd.log:
> 2020-02-24T12:56:14.059Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
> 2020-02-24T12:56:14.061Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
> 2020-02-24T12:56:14.061Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
> 2020-02-24T12:56:14.061Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connecting...
> 2020-02-24T12:56:14.061Z|00005|netlink_socket|INFO|netlink: could not enable listening to all nsid (Operation not permitted)
> 2020-02-24T12:56:14.061Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connected
> 2020-02-24T12:56:14.065Z|00007|bridge|INFO|ovs-vswitchd (Open vSwitch) 2.13.90
> 2020-02-24T12:56:14.091Z|00008|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports recirculation
> 2020-02-24T12:56:14.091Z|00009|ofproto_dpif|INFO|dummy@ovs-dummy: VLAN header stack length probed as 1
> 2020-02-24T12:56:14.091Z|00010|ofproto_dpif|INFO|dummy@ovs-dummy: MPLS label stack length probed as 3
> 2020-02-24T12:56:14.091Z|00011|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports truncate action
> 2020-02-24T12:56:14.091Z|00012|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports unique flow ids
> 2020-02-24T12:56:14.091Z|00013|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports clone action
> 2020-02-24T12:56:14.091Z|00014|ofproto_dpif|INFO|dummy@ovs-dummy: Max sample nesting level probed as 10
> 2020-02-24T12:56:14.091Z|00015|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports eventmask in conntrack action
> 2020-02-24T12:56:14.091Z|00016|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_clear action
> 2020-02-24T12:56:14.091Z|00017|ofproto_dpif|INFO|dummy@ovs-dummy: Max dp_hash algorithm probed to be 1
> 2020-02-24T12:56:14.091Z|00018|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports check_pkt_len action
> 2020-02-24T12:56:14.091Z|00019|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports timeout policy in conntrack action
> 2020-02-24T12:56:14.091Z|00020|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state
> 2020-02-24T12:56:14.091Z|00021|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_zone
> 2020-02-24T12:56:14.091Z|00022|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_mark
> 2020-02-24T12:56:14.091Z|00023|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_label
> 2020-02-24T12:56:14.091Z|00024|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state_nat
> 2020-02-24T12:56:14.091Z|00025|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple
> 2020-02-24T12:56:14.091Z|00026|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple6
> 2020-02-24T12:56:14.091Z|00027|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports IPv6 ND Extensions
> 2020-02-24T12:56:14.100Z|00028|bridge|INFO|bridge br0: added interface br0 on port 65534
> 2020-02-24T12:56:14.101Z|00029|bridge|INFO|bridge br0: using datapath ID fedcba9876543210
> 2020-02-24T12:56:14.101Z|00030|connmgr|INFO|br0: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br0.mgmt"
> 2020-02-24T12:56:14.113Z|00031|bridge|INFO|bridge br2: added interface br2 on port 65534
> 2020-02-24T12:56:14.113Z|00032|bridge|INFO|bridge br1: added interface br1 on port 65534
> 2020-02-24T12:56:14.113Z|00033|bridge|INFO|bridge br2: using datapath ID 0000aa66aa660002
> 2020-02-24T12:56:14.113Z|00034|connmgr|INFO|br2: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br2.mgmt"
> 2020-02-24T12:56:14.113Z|00035|bridge|INFO|bridge br1: using datapath ID 0000aa66aa660001
> 2020-02-24T12:56:14.113Z|00036|connmgr|INFO|br1: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br1.mgmt"
> 2020-02-24T12:56:14.120Z|00037|unixctl|DBG|received request vlog/set["ofproto_dpif:dbg"], id=0
> 2020-02-24T12:56:14.120Z|00038|unixctl|DBG|replying with success, id=0: ""
> 2020-02-24T12:56:14.124Z|00039|unixctl|DBG|received request vlog/set["ofproto_dpif_xlate:dbg"], id=0
> 2020-02-24T12:56:14.124Z|00040|unixctl|DBG|replying with success, id=0: ""
> 2020-02-24T12:56:14.129Z|00041|vconn|DBG|unix#2: sent (Success): OFPT_HELLO (OF1.5) (xid=0x1):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.129Z|00042|vconn|DBG|unix#2: received: OFPT_HELLO (xid=0x1):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.129Z|00043|vconn|DBG|unix#2: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.129Z|00044|vconn|DBG|unix#2: received: OFPST_TABLE request (xid=0x2):
> 2020-02-24T12:56:14.130Z|00045|vconn|DBG|unix#2: sent (Success): OFPST_TABLE reply (xid=0x2):
>   table 0:
>     active=0, lookup=0, matched=0
>     max_entries=1000000
>     matching:
>       exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>
>   tables 1...253: ditto
> 2020-02-24T12:56:14.130Z|00046|vconn|DBG|unix#3: sent (Success): OFPT_HELLO (OF1.5) (xid=0x2):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.130Z|00047|vconn|DBG|unix#3: received: OFPT_HELLO (xid=0x3):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.130Z|00048|vconn|DBG|unix#3: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.130Z|00049|vconn|DBG|unix#3: received: OFPT_FEATURES_REQUEST (xid=0x4):
> 2020-02-24T12:56:14.130Z|00050|vconn|DBG|unix#3: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:fedcba9876543210
> n_tables:254, n_buffers:0
> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>  LOCAL(br0): addr:aa:66:aa:66:00:00
>      config:     0
>      state:      0
>      speed: 0 Mbps now, 0 Mbps max
> 2020-02-24T12:56:14.131Z|00051|vconn|DBG|unix#4: sent (Success): OFPT_HELLO (OF1.5) (xid=0x3):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.131Z|00052|vconn|DBG|unix#4: received: OFPT_HELLO (xid=0x5):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.131Z|00053|vconn|DBG|unix#4: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.131Z|00054|vconn|DBG|unix#4: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
> 2020-02-24T12:56:14.131Z|00055|vconn|DBG|unix#4: received: OFPT_BARRIER_REQUEST (xid=0x7):
> 2020-02-24T12:56:14.131Z|00056|vconn|DBG|unix#4: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
> 2020-02-24T12:56:14.131Z|00057|connmgr|INFO|br0<->unix#4: 1 flow_mods in the last 0 s (1 adds)
> 2020-02-24T12:56:14.136Z|00058|vconn|DBG|unix#5: sent (Success): OFPT_HELLO (OF1.5) (xid=0x4):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.136Z|00059|vconn|DBG|unix#5: received: OFPT_HELLO (xid=0x1):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.136Z|00060|vconn|DBG|unix#5: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.136Z|00061|vconn|DBG|unix#5: received: OFPST_TABLE request (xid=0x2):
> 2020-02-24T12:56:14.136Z|00062|vconn|DBG|unix#5: sent (Success): OFPST_TABLE reply (xid=0x2):
>   table 0:
>     active=1, lookup=0, matched=0
>     max_entries=1000000
>     matching:
>       exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>
>   table 1:
>     active=0, lookup=0, matched=0
>     (same features)
>
>   tables 2...253: ditto
> 2020-02-24T12:56:14.137Z|00063|vconn|DBG|unix#6: sent (Success): OFPT_HELLO (OF1.5) (xid=0x5):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.137Z|00064|vconn|DBG|unix#6: received: OFPT_HELLO (xid=0x3):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.137Z|00065|vconn|DBG|unix#6: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.137Z|00066|vconn|DBG|unix#6: received: OFPT_FEATURES_REQUEST (xid=0x4):
> 2020-02-24T12:56:14.137Z|00067|vconn|DBG|unix#6: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660001
> n_tables:254, n_buffers:0
> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>  LOCAL(br1): addr:aa:66:aa:66:00:01
>      config:     0
>      state:      0
>      speed: 0 Mbps now, 0 Mbps max
> 2020-02-24T12:56:14.137Z|00068|vconn|DBG|unix#7: sent (Success): OFPT_HELLO (OF1.5) (xid=0x6):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.137Z|00069|vconn|DBG|unix#7: received: OFPT_HELLO (xid=0x5):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.137Z|00070|vconn|DBG|unix#7: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.138Z|00071|vconn|DBG|unix#7: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
> 2020-02-24T12:56:14.138Z|00072|vconn|DBG|unix#7: received: OFPT_BARRIER_REQUEST (xid=0x7):
> 2020-02-24T12:56:14.138Z|00073|vconn|DBG|unix#7: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
> 2020-02-24T12:56:14.138Z|00074|connmgr|INFO|br1<->unix#7: 1 flow_mods in the last 0 s (1 adds)
> 2020-02-24T12:56:14.143Z|00075|vconn|DBG|unix#8: sent (Success): OFPT_HELLO (OF1.5) (xid=0x7):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.143Z|00076|vconn|DBG|unix#8: received: OFPT_HELLO (xid=0x1):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.143Z|00077|vconn|DBG|unix#8: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.143Z|00078|vconn|DBG|unix#8: received: OFPST_TABLE request (xid=0x2):
> 2020-02-24T12:56:14.143Z|00079|vconn|DBG|unix#8: sent (Success): OFPST_TABLE reply (xid=0x2):
>   table 0:
>     active=1, lookup=0, matched=0
>     max_entries=1000000
>     matching:
>       exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>
>   table 1:
>     active=0, lookup=0, matched=0
>     (same features)
>
>   tables 2...253: ditto
> 2020-02-24T12:56:14.144Z|00080|vconn|DBG|unix#9: sent (Success): OFPT_HELLO (OF1.5) (xid=0x8):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.144Z|00081|vconn|DBG|unix#9: received: OFPT_HELLO (xid=0x3):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.144Z|00082|vconn|DBG|unix#9: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.144Z|00083|vconn|DBG|unix#9: received: OFPT_FEATURES_REQUEST (xid=0x4):
> 2020-02-24T12:56:14.144Z|00084|vconn|DBG|unix#9: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660002
> n_tables:254, n_buffers:0
> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>  LOCAL(br2): addr:aa:66:aa:66:00:02
>      config:     0
>      state:      0
>      speed: 0 Mbps now, 0 Mbps max
> 2020-02-24T12:56:14.144Z|00085|vconn|DBG|unix#10: sent (Success): OFPT_HELLO (OF1.5) (xid=0x9):
>  version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
> 2020-02-24T12:56:14.144Z|00086|vconn|DBG|unix#10: received: OFPT_HELLO (xid=0x5):
>  version bitmap: 0x01
> 2020-02-24T12:56:14.144Z|00087|vconn|DBG|unix#10: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
> 2020-02-24T12:56:14.145Z|00088|vconn|DBG|unix#10: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
> 2020-02-24T12:56:14.145Z|00089|vconn|DBG|unix#10: received: OFPT_BARRIER_REQUEST (xid=0x7):
> 2020-02-24T12:56:14.145Z|00090|vconn|DBG|unix#10: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
> 2020-02-24T12:56:14.145Z|00091|connmgr|INFO|br2<->unix#10: 1 flow_mods in the last 0 s (1 adds)
> 2020-02-24T12:56:14.154Z|00092|bridge|INFO|bridge br0: added interface p1 on port 1
> 2020-02-24T12:56:14.154Z|00093|ofproto_dpif|DBG|port p1: STP state changed from disabled to listening
> 2020-02-24T12:56:14.164Z|00094|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
> 2020-02-24T12:56:14.164Z|00095|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
> 2020-02-24T12:56:14.164Z|00096|bridge|INFO|bridge br0: added interface p2 on port 2
> 2020-02-24T12:56:14.164Z|00097|ofproto_dpif|DBG|port p2: STP state changed from disabled to listening
> 2020-02-24T12:56:14.174Z|00098|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
> 2020-02-24T12:56:14.174Z|00099|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
> 2020-02-24T12:56:14.174Z|00100|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connecting...
> 2020-02-24T12:56:14.174Z|00101|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connected
> 2020-02-24T12:56:14.174Z|00102|bridge|INFO|bridge br1: added interface p3 on port 3
> 2020-02-24T12:56:14.174Z|00103|ofproto_dpif|DBG|port p3: STP state changed from disabled to listening
> 2020-02-24T12:56:14.183Z|00104|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
> 2020-02-24T12:56:14.183Z|00105|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
> 2020-02-24T12:56:14.184Z|00106|bridge|INFO|bridge br1: added interface p4 on port 4
> 2020-02-24T12:56:14.184Z|00107|ofproto_dpif|DBG|port p4: STP state changed from disabled to listening
> 2020-02-24T12:56:14.193Z|00108|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
> 2020-02-24T12:56:14.193Z|00109|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
> 2020-02-24T12:56:14.193Z|00110|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connecting...
> 2020-02-24T12:56:14.193Z|00111|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connected
> 2020-02-24T12:56:14.193Z|00112|bridge|INFO|bridge br2: added interface p5 on port 5
> 2020-02-24T12:56:14.193Z|00113|ofproto_dpif|DBG|port p5: STP state changed from disabled to listening
> 2020-02-24T12:56:14.203Z|00114|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
> 2020-02-24T12:56:14.203Z|00115|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
> 2020-02-24T12:56:14.203Z|00116|bridge|INFO|bridge br2: added interface p6 on port 6
> 2020-02-24T12:56:14.203Z|00117|ofproto_dpif|DBG|port p6: STP state changed from disabled to listening
> 2020-02-24T12:56:14.208Z|00118|unixctl|DBG|received request netdev-dummy/set-admin-state["up"], id=0
> 2020-02-24T12:56:14.208Z|00119|unixctl|DBG|replying with success, id=0: "OK"
> 2020-02-24T12:56:14.212Z|00120|unixctl|DBG|received request time/stop[], id=0
> 2020-02-24T12:56:14.212Z|00121|unixctl|DBG|replying with success, id=0: ""
> 2020-02-24T12:56:14.215Z|00122|unixctl|DBG|received request time/warp["6000","3000"], id=0
> 2020-02-24T12:56:14.225Z|00123|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
> 2020-02-24T12:56:14.225Z|00124|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connected
> 2020-02-24T12:56:14.225Z|00125|unixctl|DBG|replying with success, id=0: "warped"
> 2020-02-24T12:56:14.236Z|00126|ofproto_dpif_xlate|DBG|/proc/sys/net/core/netdev_max_backlog: using 1000 max_backlog
> 2020-02-24T12:56:14.236Z|00127|unixctl|DBG|received request time/warp["30000","3000"], id=0
> 2020-02-24T12:56:14.257Z|00128|memory|INFO|14512 kB peak resident set size after 12.2 seconds
> 2020-02-24T12:56:14.257Z|00129|memory|INFO|handlers:5 ports:9 revalidators:3 rules:17
> 2020-02-24T12:56:14.268Z|00130|ofproto_dpif|DBG|port p2: STP state changed from listening to learning
> 2020-02-24T12:56:14.268Z|00131|ofproto_dpif|DBG|port p1: STP state changed from listening to learning
> 2020-02-24T12:56:14.268Z|00132|ofproto_dpif|DBG|port p6: STP state changed from listening to learning
> 2020-02-24T12:56:14.268Z|00133|ofproto_dpif|DBG|port p5: STP state changed from listening to learning
> 2020-02-24T12:56:14.268Z|00134|ofproto_dpif|DBG|port p3: STP state changed from listening to learning
> 2020-02-24T12:56:14.268Z|00135|ofproto_dpif|DBG|port p4: STP state changed from listening to learning
> 2020-02-24T12:56:14.280Z|00001|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
> 2020-02-24T12:56:14.280Z|00002|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
> 2020-02-24T12:56:14.323Z|00136|stp|INFO|br0: detected topology change.
> 2020-02-24T12:56:14.323Z|00137|stp|INFO|br0: detected topology change.
> 2020-02-24T12:56:14.323Z|00138|ofproto_dpif|DBG|port p2: STP state changed from learning to forwarding
> 2020-02-24T12:56:14.323Z|00139|ofproto_dpif|DBG|port p1: STP state changed from learning to forwarding
> 2020-02-24T12:56:14.323Z|00140|stp|INFO|br2: detected topology change.
> 2020-02-24T12:56:14.323Z|00141|stp|INFO|br2: detected topology change.
> 2020-02-24T12:56:14.323Z|00142|ofproto_dpif|DBG|port p6: STP state changed from learning to forwarding
> 2020-02-24T12:56:14.323Z|00143|ofproto_dpif|DBG|port p5: STP state changed from learning to forwarding
> 2020-02-24T12:56:14.323Z|00144|stp|INFO|br1: detected topology change.
> 2020-02-24T12:56:14.323Z|00145|ofproto_dpif|DBG|port p3: STP state changed from learning to forwarding
> 2020-02-24T12:56:14.323Z|00146|ofproto_dpif|DBG|port p4: STP state changed from learning to forwarding
> 2020-02-24T12:56:14.334Z|00147|unixctl|DBG|replying with success, id=0: "warped"
2185. stp.at:467: 2185. STP - flush the fdb and mdb when topology
changed (stp.at:467): FAILED (stp.at:529)
Anton Ivanov Feb. 24, 2020, 1:12 p.m. UTC | #10
On 24/02/2020 12:58, Dumitru Ceara wrote:
> On Mon, Feb 24, 2020 at 11:47 AM Anton Ivanov
> <anton.ivanov@cambridgegreys.com> wrote:
>>
>> On 19/02/2020 14:20, Dumitru Ceara wrote:
>>> On 2/18/20 7:12 AM, Anton Ivanov wrote:
>>>> On 17/02/2020 14:48, Dumitru Ceara wrote:
>>>>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>>>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>>>
>>>>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>>>>> These are waited upon in the same thread where they are created. This
>>>>>> allows them to be registered persistently with the OS (if possible)
>>>>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>>>>> etc.
>>>>>>
>>>>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>>>>> which is not ready if that fd has been registered as "private" to the
>>>>>> thread which waits upon it.
>>>>>>
>>>>>> 3. No longer breaks other parts of OVS which create the fd in one
>>>>>> thread and waits upon it in others.
>>>>>>
>>>>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>>>>>> poll++ frameworks in other OSes.
>>>>>>
>>>>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>>>>> using a "centeral (e)poll dispatcher + IO threads" pattern
>>>>>>
>>>>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>> Hi Anton,
>>>>>
>>>>> A couple of issues inline. Except for that:
>>>>>
>>>>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>>>>> failing with your patches applied:
>>>>>
>>>>> make check TESTSUITEFLAGS='-k "flush the fdb"'
>>>> I will have a look.
>>
>> I cannot reproduce that. It succeeds every time - 20 out of 20 runs. Can you send me some logs please?
>>
>>
> Hi Anton,
>
> After rebasing to latest upstream master I see the failure less often.
> But it's still there.

I ran it in an "until fails" loop for 2 hours this morning - not a single failure.

>
> I pasted the testsuite below.


Thanks, I will try to figure out the root cause from this.

Brgds,

A.


>
> Regards,
> Dumitru
>
> $ cat tests/testsuite.dir/2185/testsuite.log
> #                             -*- compilation -*-
> 2185. stp.at:467: testing STP - flush the fdb and mdb when topology changed ...
> ./stp.at:468: ovsdb-tool create conf.db
> $abs_top_srcdir/vswitchd/vswitch.ovsschema
> ./stp.at:468: ovsdb-server --detach --no-chdir --pidfile --log-file
> --remote=punix:$OVS_RUNDIR/db.sock
> stderr:
> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
> ./stp.at:468: sed < stderr '
> /vlog|INFO|opened log file/d
> /ovsdb_server|INFO|ovsdb-server (Open vSwitch)/d'
> ./stp.at:468: ovs-vsctl --no-wait init
> ./stp.at:468: ovs-vswitchd --enable-dummy --disable-system
> --disable-system-route  --detach --no-chdir --pidfile --log-file
> -vvconn -vofproto_dpif -vunixctl
> stderr:
> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
> 2020-02-24T12:56:14Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
> 2020-02-24T12:56:14Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
> 2020-02-24T12:56:14Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
> connecting...
> 2020-02-24T12:56:14Z|00005|netlink_socket|INFO|netlink: could not
> enable listening to all nsid (Operation not permitted)
> 2020-02-24T12:56:14Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
> connected
> ./stp.at:468: sed < stderr '
> /ovs_numa|INFO|Discovered /d
> /vlog|INFO|opened log file/d
> /vswitchd|INFO|ovs-vswitchd (Open vSwitch)/d
> /reconnect|INFO|/d
> /dpif_netlink|INFO|Generic Netlink family .ovs_datapath. does not exist/d
> /ofproto|INFO|using datapath ID/d
> /netdev_linux|INFO|.*device has unknown hardware address family/d
> /ofproto|INFO|datapath ID changed to fedcba9876543210/d
> /dpdk|INFO|DPDK Disabled - Use other_config:dpdk-init to enable/d
> /netlink_socket|INFO|netlink: could not enable listening to all nsid/d
> /probe tc:/d
> /tc: Using policy/d'
> ./stp.at:468: add_of_br 0
> ./stp.at:471:
>      ovs-vsctl -- \
>      set port br0 other_config:stp-enable=false -- \
>      set bridge br0 datapath-type=dummy -- \
>      set bridge br0 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:00 -- \
>      add-br br1 -- \
>      set port br1 other_config:stp-enable=false -- \
>      set bridge br1 datapath-type=dummy -- \
>      set bridge br1 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:01 -- \
>      add-br br2 -- \
>      set port br2 other_config:stp-enable=false -- \
>      set bridge br2 datapath-type=dummy -- \
>      set bridge br2 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:02
>
> ./stp.at:489: ovs-appctl vlog/set ofproto_dpif:dbg
> ./stp.at:490: ovs-appctl vlog/set ofproto_dpif_xlate:dbg
> ./stp.at:492: ovs-ofctl add-flow br0 action=normal
> ./stp.at:493: ovs-ofctl add-flow br1 action=normal
> ./stp.at:494: ovs-ofctl add-flow br2 action=normal
> ./stp.at:496:
>      ovs-vsctl add-port br0 p1 -- \
>          set interface p1 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1
>      ovs-vsctl add-port br0 p2 -- \
>          set interface p2 type=dummy
> options:stream=unix:$OVS_RUNDIR/p6.sock ofport_request=2
>      ovs-vsctl add-port br1 p3 -- \
>          set interface p3 type=dummy
> options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3
>      ovs-vsctl add-port br1 p4 -- \
>          set interface p4 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p4.sock ofport_request=4
>      ovs-vsctl add-port br2 p5 -- \
>          set interface p5 type=dummy
> options:stream=unix:$OVS_RUNDIR/p4.sock ofport_request=5
>      ovs-vsctl add-port br2 p6 -- \
>          set interface p6 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p6.sock ofport_request=6
>
> OK
> warped
> ./stp.at:517: cat ovs-vswitchd.log |
> grep 'disabled to listening' | sed '
>    s/.*ofproto_dpif|.*|port .*:/port <>:/
> '
> warped
> ./stp.at:529: cat ovs-vswitchd.log |
> grep 'learning to forwarding' | sed '
>    s/.*ofproto_dpif|.*|port .*:/port <>:/
> '
> --- - 2020-02-24 13:56:14.337746686 +0100
> +++ /home/dceara/git-repos/ovs/tests/testsuite.dir/at-groups/2185/stdout
> 2020-02-24 13:56:14.000000000 +0100
> @@ -3,4 +3,5 @@
>   port <>: STP state changed from learning to forwarding
>   port <>: STP state changed from learning to forwarding
>   port <>: STP state changed from learning to forwarding
> +port <>: STP state changed from learning to forwarding
>
> ovsdb-server.log:
>> 2020-02-24T12:56:14.033Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
>> 2020-02-24T12:56:14.038Z|00002|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.13.90
> ovs-vswitchd.log:
>> 2020-02-24T12:56:14.059Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
>> 2020-02-24T12:56:14.061Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
>> 2020-02-24T12:56:14.061Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
>> 2020-02-24T12:56:14.061Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connecting...
>> 2020-02-24T12:56:14.061Z|00005|netlink_socket|INFO|netlink: could not enable listening to all nsid (Operation not permitted)
>> 2020-02-24T12:56:14.061Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connected
>> 2020-02-24T12:56:14.065Z|00007|bridge|INFO|ovs-vswitchd (Open vSwitch) 2.13.90
>> 2020-02-24T12:56:14.091Z|00008|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports recirculation
>> 2020-02-24T12:56:14.091Z|00009|ofproto_dpif|INFO|dummy@ovs-dummy: VLAN header stack length probed as 1
>> 2020-02-24T12:56:14.091Z|00010|ofproto_dpif|INFO|dummy@ovs-dummy: MPLS label stack length probed as 3
>> 2020-02-24T12:56:14.091Z|00011|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports truncate action
>> 2020-02-24T12:56:14.091Z|00012|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports unique flow ids
>> 2020-02-24T12:56:14.091Z|00013|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports clone action
>> 2020-02-24T12:56:14.091Z|00014|ofproto_dpif|INFO|dummy@ovs-dummy: Max sample nesting level probed as 10
>> 2020-02-24T12:56:14.091Z|00015|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports eventmask in conntrack action
>> 2020-02-24T12:56:14.091Z|00016|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_clear action
>> 2020-02-24T12:56:14.091Z|00017|ofproto_dpif|INFO|dummy@ovs-dummy: Max dp_hash algorithm probed to be 1
>> 2020-02-24T12:56:14.091Z|00018|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports check_pkt_len action
>> 2020-02-24T12:56:14.091Z|00019|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports timeout policy in conntrack action
>> 2020-02-24T12:56:14.091Z|00020|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state
>> 2020-02-24T12:56:14.091Z|00021|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_zone
>> 2020-02-24T12:56:14.091Z|00022|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_mark
>> 2020-02-24T12:56:14.091Z|00023|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_label
>> 2020-02-24T12:56:14.091Z|00024|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state_nat
>> 2020-02-24T12:56:14.091Z|00025|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple
>> 2020-02-24T12:56:14.091Z|00026|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple6
>> 2020-02-24T12:56:14.091Z|00027|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports IPv6 ND Extensions
>> 2020-02-24T12:56:14.100Z|00028|bridge|INFO|bridge br0: added interface br0 on port 65534
>> 2020-02-24T12:56:14.101Z|00029|bridge|INFO|bridge br0: using datapath ID fedcba9876543210
>> 2020-02-24T12:56:14.101Z|00030|connmgr|INFO|br0: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br0.mgmt"
>> 2020-02-24T12:56:14.113Z|00031|bridge|INFO|bridge br2: added interface br2 on port 65534
>> 2020-02-24T12:56:14.113Z|00032|bridge|INFO|bridge br1: added interface br1 on port 65534
>> 2020-02-24T12:56:14.113Z|00033|bridge|INFO|bridge br2: using datapath ID 0000aa66aa660002
>> 2020-02-24T12:56:14.113Z|00034|connmgr|INFO|br2: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br2.mgmt"
>> 2020-02-24T12:56:14.113Z|00035|bridge|INFO|bridge br1: using datapath ID 0000aa66aa660001
>> 2020-02-24T12:56:14.113Z|00036|connmgr|INFO|br1: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br1.mgmt"
>> 2020-02-24T12:56:14.120Z|00037|unixctl|DBG|received request vlog/set["ofproto_dpif:dbg"], id=0
>> 2020-02-24T12:56:14.120Z|00038|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.124Z|00039|unixctl|DBG|received request vlog/set["ofproto_dpif_xlate:dbg"], id=0
>> 2020-02-24T12:56:14.124Z|00040|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.129Z|00041|vconn|DBG|unix#2: sent (Success): OFPT_HELLO (OF1.5) (xid=0x1):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.129Z|00042|vconn|DBG|unix#2: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.129Z|00043|vconn|DBG|unix#2: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.129Z|00044|vconn|DBG|unix#2: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.130Z|00045|vconn|DBG|unix#2: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=0, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    tables 1...253: ditto
>> 2020-02-24T12:56:14.130Z|00046|vconn|DBG|unix#3: sent (Success): OFPT_HELLO (OF1.5) (xid=0x2):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.130Z|00047|vconn|DBG|unix#3: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.130Z|00048|vconn|DBG|unix#3: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.130Z|00049|vconn|DBG|unix#3: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.130Z|00050|vconn|DBG|unix#3: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:fedcba9876543210
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br0): addr:aa:66:aa:66:00:00
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.131Z|00051|vconn|DBG|unix#4: sent (Success): OFPT_HELLO (OF1.5) (xid=0x3):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.131Z|00052|vconn|DBG|unix#4: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.131Z|00053|vconn|DBG|unix#4: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.131Z|00054|vconn|DBG|unix#4: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.131Z|00055|vconn|DBG|unix#4: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.131Z|00056|vconn|DBG|unix#4: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.131Z|00057|connmgr|INFO|br0<->unix#4: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.136Z|00058|vconn|DBG|unix#5: sent (Success): OFPT_HELLO (OF1.5) (xid=0x4):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.136Z|00059|vconn|DBG|unix#5: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.136Z|00060|vconn|DBG|unix#5: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.136Z|00061|vconn|DBG|unix#5: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.136Z|00062|vconn|DBG|unix#5: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=1, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    table 1:
>>      active=0, lookup=0, matched=0
>>      (same features)
>>
>>    tables 2...253: ditto
>> 2020-02-24T12:56:14.137Z|00063|vconn|DBG|unix#6: sent (Success): OFPT_HELLO (OF1.5) (xid=0x5):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.137Z|00064|vconn|DBG|unix#6: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.137Z|00065|vconn|DBG|unix#6: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.137Z|00066|vconn|DBG|unix#6: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.137Z|00067|vconn|DBG|unix#6: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660001
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br1): addr:aa:66:aa:66:00:01
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.137Z|00068|vconn|DBG|unix#7: sent (Success): OFPT_HELLO (OF1.5) (xid=0x6):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.137Z|00069|vconn|DBG|unix#7: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.137Z|00070|vconn|DBG|unix#7: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.138Z|00071|vconn|DBG|unix#7: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.138Z|00072|vconn|DBG|unix#7: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.138Z|00073|vconn|DBG|unix#7: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.138Z|00074|connmgr|INFO|br1<->unix#7: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.143Z|00075|vconn|DBG|unix#8: sent (Success): OFPT_HELLO (OF1.5) (xid=0x7):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.143Z|00076|vconn|DBG|unix#8: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.143Z|00077|vconn|DBG|unix#8: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.143Z|00078|vconn|DBG|unix#8: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.143Z|00079|vconn|DBG|unix#8: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=1, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    table 1:
>>      active=0, lookup=0, matched=0
>>      (same features)
>>
>>    tables 2...253: ditto
>> 2020-02-24T12:56:14.144Z|00080|vconn|DBG|unix#9: sent (Success): OFPT_HELLO (OF1.5) (xid=0x8):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.144Z|00081|vconn|DBG|unix#9: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.144Z|00082|vconn|DBG|unix#9: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.144Z|00083|vconn|DBG|unix#9: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.144Z|00084|vconn|DBG|unix#9: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660002
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br2): addr:aa:66:aa:66:00:02
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.144Z|00085|vconn|DBG|unix#10: sent (Success): OFPT_HELLO (OF1.5) (xid=0x9):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.144Z|00086|vconn|DBG|unix#10: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.144Z|00087|vconn|DBG|unix#10: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.145Z|00088|vconn|DBG|unix#10: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.145Z|00089|vconn|DBG|unix#10: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.145Z|00090|vconn|DBG|unix#10: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.145Z|00091|connmgr|INFO|br2<->unix#10: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.154Z|00092|bridge|INFO|bridge br0: added interface p1 on port 1
>> 2020-02-24T12:56:14.154Z|00093|ofproto_dpif|DBG|port p1: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.164Z|00094|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.164Z|00095|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.164Z|00096|bridge|INFO|bridge br0: added interface p2 on port 2
>> 2020-02-24T12:56:14.164Z|00097|ofproto_dpif|DBG|port p2: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.174Z|00098|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.174Z|00099|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.174Z|00100|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connecting...
>> 2020-02-24T12:56:14.174Z|00101|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connected
>> 2020-02-24T12:56:14.174Z|00102|bridge|INFO|bridge br1: added interface p3 on port 3
>> 2020-02-24T12:56:14.174Z|00103|ofproto_dpif|DBG|port p3: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.183Z|00104|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.183Z|00105|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.184Z|00106|bridge|INFO|bridge br1: added interface p4 on port 4
>> 2020-02-24T12:56:14.184Z|00107|ofproto_dpif|DBG|port p4: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.193Z|00108|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.193Z|00109|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.193Z|00110|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connecting...
>> 2020-02-24T12:56:14.193Z|00111|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connected
>> 2020-02-24T12:56:14.193Z|00112|bridge|INFO|bridge br2: added interface p5 on port 5
>> 2020-02-24T12:56:14.193Z|00113|ofproto_dpif|DBG|port p5: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.203Z|00114|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.203Z|00115|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.203Z|00116|bridge|INFO|bridge br2: added interface p6 on port 6
>> 2020-02-24T12:56:14.203Z|00117|ofproto_dpif|DBG|port p6: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.208Z|00118|unixctl|DBG|received request netdev-dummy/set-admin-state["up"], id=0
>> 2020-02-24T12:56:14.208Z|00119|unixctl|DBG|replying with success, id=0: "OK"
>> 2020-02-24T12:56:14.212Z|00120|unixctl|DBG|received request time/stop[], id=0
>> 2020-02-24T12:56:14.212Z|00121|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.215Z|00122|unixctl|DBG|received request time/warp["6000","3000"], id=0
>> 2020-02-24T12:56:14.225Z|00123|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.225Z|00124|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connected
>> 2020-02-24T12:56:14.225Z|00125|unixctl|DBG|replying with success, id=0: "warped"
>> 2020-02-24T12:56:14.236Z|00126|ofproto_dpif_xlate|DBG|/proc/sys/net/core/netdev_max_backlog: using 1000 max_backlog
>> 2020-02-24T12:56:14.236Z|00127|unixctl|DBG|received request time/warp["30000","3000"], id=0
>> 2020-02-24T12:56:14.257Z|00128|memory|INFO|14512 kB peak resident set size after 12.2 seconds
>> 2020-02-24T12:56:14.257Z|00129|memory|INFO|handlers:5 ports:9 revalidators:3 rules:17
>> 2020-02-24T12:56:14.268Z|00130|ofproto_dpif|DBG|port p2: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00131|ofproto_dpif|DBG|port p1: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00132|ofproto_dpif|DBG|port p6: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00133|ofproto_dpif|DBG|port p5: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00134|ofproto_dpif|DBG|port p3: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00135|ofproto_dpif|DBG|port p4: STP state changed from listening to learning
>> 2020-02-24T12:56:14.280Z|00001|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>> 2020-02-24T12:56:14.280Z|00002|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>> 2020-02-24T12:56:14.323Z|00136|stp|INFO|br0: detected topology change.
>> 2020-02-24T12:56:14.323Z|00137|stp|INFO|br0: detected topology change.
>> 2020-02-24T12:56:14.323Z|00138|ofproto_dpif|DBG|port p2: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00139|ofproto_dpif|DBG|port p1: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00140|stp|INFO|br2: detected topology change.
>> 2020-02-24T12:56:14.323Z|00141|stp|INFO|br2: detected topology change.
>> 2020-02-24T12:56:14.323Z|00142|ofproto_dpif|DBG|port p6: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00143|ofproto_dpif|DBG|port p5: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00144|stp|INFO|br1: detected topology change.
>> 2020-02-24T12:56:14.323Z|00145|ofproto_dpif|DBG|port p3: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00146|ofproto_dpif|DBG|port p4: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.334Z|00147|unixctl|DBG|replying with success, id=0: "warped"
> 2185. stp.at:467: 2185. STP - flush the fdb and mdb when topology
> changed (stp.at:467): FAILED (stp.at:529)
>
>
Anton Ivanov Feb. 24, 2020, 2:17 p.m. UTC | #11
I have it reproduced now.

It is a race of some sort which is caused (or unmasked) by the epoll changes.

A (relatively) slow CPU (e.g. a 4-core A10 at 3.5 GHz) has zero failures.

A fast CPU (e.g. a 6-core/12-thread Ryzen at 4 GHz) fails nearly every time.

If it is (e)poll related, I will figure it out and update the patches. If it is something inherent that is presently being masked, I will send the fixes separately.

A.


On 24/02/2020 12:58, Dumitru Ceara wrote:
> On Mon, Feb 24, 2020 at 11:47 AM Anton Ivanov
> <anton.ivanov@cambridgegreys.com> wrote:
>>
>>
>> On 19/02/2020 14:20, Dumitru Ceara wrote:
>>> On 2/18/20 7:12 AM, Anton Ivanov wrote:
>>>>
>>>> On 17/02/2020 14:48, Dumitru Ceara wrote:
>>>>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>>>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>>>
>>>>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>>>>> These are waited upon in the same thread where they are created. This
>>>>>> allows them to be registered persistently with the OS (if possible)
>>>>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>>>>> etc.
>>>>>>
>>>>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>>>>> which is not ready if that fd has been registered as "private" to the
>>>>>> thread which waits upon it.
>>>>>>
>>>>>> 3. No longer breaks other parts of OVS which create the fd in one
>>>>>> thread and waits upon it in others.
>>>>>>
>>>>>> 4. Adds support for EPOLL on Linux and can be expanded to cover similar
>>>>>> poll++ frameworks in other OSes.
>>>>>>
>>>>>> 5. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>>>>> using a "central (e)poll dispatcher + IO threads" pattern
>>>>>>
>>>>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>> Hi Anton,
>>>>>
>>>>> A couple of issues inline. Except for that:
>>>>>
>>>>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>>>>> failing with your patches applied:
>>>>>
>>>>> make check TESTSUITEFLAGS='-k "flush the fdb"'
>>>> I will have a look.
>>
>>
>> I cannot reproduce that. It succeeds every time - 20 out of 20 runs. Can you send me some logs please?
>>
>>
> 
> Hi Anton,
> 
> After rebasing to latest upstream master I see the failure less often.
> But it's still there.
> 
> I pasted the testsuite below.
> 
> Regards,
> Dumitru
> 
> $ cat tests/testsuite.dir/2185/testsuite.log
> #                             -*- compilation -*-
> 2185. stp.at:467: testing STP - flush the fdb and mdb when topology changed ...
> ./stp.at:468: ovsdb-tool create conf.db
> $abs_top_srcdir/vswitchd/vswitch.ovsschema
> ./stp.at:468: ovsdb-server --detach --no-chdir --pidfile --log-file
> --remote=punix:$OVS_RUNDIR/db.sock
> stderr:
> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
> ./stp.at:468: sed < stderr '
> /vlog|INFO|opened log file/d
> /ovsdb_server|INFO|ovsdb-server (Open vSwitch)/d'
> ./stp.at:468: ovs-vsctl --no-wait init
> ./stp.at:468: ovs-vswitchd --enable-dummy --disable-system
> --disable-system-route  --detach --no-chdir --pidfile --log-file
> -vvconn -vofproto_dpif -vunixctl
> stderr:
> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
> 2020-02-24T12:56:14Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
> 2020-02-24T12:56:14Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
> 2020-02-24T12:56:14Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
> connecting...
> 2020-02-24T12:56:14Z|00005|netlink_socket|INFO|netlink: could not
> enable listening to all nsid (Operation not permitted)
> 2020-02-24T12:56:14Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
> connected
> ./stp.at:468: sed < stderr '
> /ovs_numa|INFO|Discovered /d
> /vlog|INFO|opened log file/d
> /vswitchd|INFO|ovs-vswitchd (Open vSwitch)/d
> /reconnect|INFO|/d
> /dpif_netlink|INFO|Generic Netlink family .ovs_datapath. does not exist/d
> /ofproto|INFO|using datapath ID/d
> /netdev_linux|INFO|.*device has unknown hardware address family/d
> /ofproto|INFO|datapath ID changed to fedcba9876543210/d
> /dpdk|INFO|DPDK Disabled - Use other_config:dpdk-init to enable/d
> /netlink_socket|INFO|netlink: could not enable listening to all nsid/d
> /probe tc:/d
> /tc: Using policy/d'
> ./stp.at:468: add_of_br 0
> ./stp.at:471:
>      ovs-vsctl -- \
>      set port br0 other_config:stp-enable=false -- \
>      set bridge br0 datapath-type=dummy -- \
>      set bridge br0 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:00 -- \
>      add-br br1 -- \
>      set port br1 other_config:stp-enable=false -- \
>      set bridge br1 datapath-type=dummy -- \
>      set bridge br1 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:01 -- \
>      add-br br2 -- \
>      set port br2 other_config:stp-enable=false -- \
>      set bridge br2 datapath-type=dummy -- \
>      set bridge br2 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:02
> 
> ./stp.at:489: ovs-appctl vlog/set ofproto_dpif:dbg
> ./stp.at:490: ovs-appctl vlog/set ofproto_dpif_xlate:dbg
> ./stp.at:492: ovs-ofctl add-flow br0 action=normal
> ./stp.at:493: ovs-ofctl add-flow br1 action=normal
> ./stp.at:494: ovs-ofctl add-flow br2 action=normal
> ./stp.at:496:
>      ovs-vsctl add-port br0 p1 -- \
>          set interface p1 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1
>      ovs-vsctl add-port br0 p2 -- \
>          set interface p2 type=dummy
> options:stream=unix:$OVS_RUNDIR/p6.sock ofport_request=2
>      ovs-vsctl add-port br1 p3 -- \
>          set interface p3 type=dummy
> options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3
>      ovs-vsctl add-port br1 p4 -- \
>          set interface p4 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p4.sock ofport_request=4
>      ovs-vsctl add-port br2 p5 -- \
>          set interface p5 type=dummy
> options:stream=unix:$OVS_RUNDIR/p4.sock ofport_request=5
>      ovs-vsctl add-port br2 p6 -- \
>          set interface p6 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p6.sock ofport_request=6
> 
> OK
> warped
> ./stp.at:517: cat ovs-vswitchd.log |
> grep 'disabled to listening' | sed '
>    s/.*ofproto_dpif|.*|port .*:/port <>:/
> '
> warped
> ./stp.at:529: cat ovs-vswitchd.log |
> grep 'learning to forwarding' | sed '
>    s/.*ofproto_dpif|.*|port .*:/port <>:/
> '
> --- - 2020-02-24 13:56:14.337746686 +0100
> +++ /home/dceara/git-repos/ovs/tests/testsuite.dir/at-groups/2185/stdout
> 2020-02-24 13:56:14.000000000 +0100
> @@ -3,4 +3,5 @@
>   port <>: STP state changed from learning to forwarding
>   port <>: STP state changed from learning to forwarding
>   port <>: STP state changed from learning to forwarding
> +port <>: STP state changed from learning to forwarding
> 
> ovsdb-server.log:
>> 2020-02-24T12:56:14.033Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
>> 2020-02-24T12:56:14.038Z|00002|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.13.90
> ovs-vswitchd.log:
>> 2020-02-24T12:56:14.059Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
>> 2020-02-24T12:56:14.061Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
>> 2020-02-24T12:56:14.061Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
>> 2020-02-24T12:56:14.061Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connecting...
>> 2020-02-24T12:56:14.061Z|00005|netlink_socket|INFO|netlink: could not enable listening to all nsid (Operation not permitted)
>> 2020-02-24T12:56:14.061Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connected
>> 2020-02-24T12:56:14.065Z|00007|bridge|INFO|ovs-vswitchd (Open vSwitch) 2.13.90
>> 2020-02-24T12:56:14.091Z|00008|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports recirculation
>> 2020-02-24T12:56:14.091Z|00009|ofproto_dpif|INFO|dummy@ovs-dummy: VLAN header stack length probed as 1
>> 2020-02-24T12:56:14.091Z|00010|ofproto_dpif|INFO|dummy@ovs-dummy: MPLS label stack length probed as 3
>> 2020-02-24T12:56:14.091Z|00011|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports truncate action
>> 2020-02-24T12:56:14.091Z|00012|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports unique flow ids
>> 2020-02-24T12:56:14.091Z|00013|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports clone action
>> 2020-02-24T12:56:14.091Z|00014|ofproto_dpif|INFO|dummy@ovs-dummy: Max sample nesting level probed as 10
>> 2020-02-24T12:56:14.091Z|00015|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports eventmask in conntrack action
>> 2020-02-24T12:56:14.091Z|00016|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_clear action
>> 2020-02-24T12:56:14.091Z|00017|ofproto_dpif|INFO|dummy@ovs-dummy: Max dp_hash algorithm probed to be 1
>> 2020-02-24T12:56:14.091Z|00018|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports check_pkt_len action
>> 2020-02-24T12:56:14.091Z|00019|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports timeout policy in conntrack action
>> 2020-02-24T12:56:14.091Z|00020|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state
>> 2020-02-24T12:56:14.091Z|00021|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_zone
>> 2020-02-24T12:56:14.091Z|00022|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_mark
>> 2020-02-24T12:56:14.091Z|00023|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_label
>> 2020-02-24T12:56:14.091Z|00024|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state_nat
>> 2020-02-24T12:56:14.091Z|00025|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple
>> 2020-02-24T12:56:14.091Z|00026|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple6
>> 2020-02-24T12:56:14.091Z|00027|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports IPv6 ND Extensions
>> 2020-02-24T12:56:14.100Z|00028|bridge|INFO|bridge br0: added interface br0 on port 65534
>> 2020-02-24T12:56:14.101Z|00029|bridge|INFO|bridge br0: using datapath ID fedcba9876543210
>> 2020-02-24T12:56:14.101Z|00030|connmgr|INFO|br0: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br0.mgmt"
>> 2020-02-24T12:56:14.113Z|00031|bridge|INFO|bridge br2: added interface br2 on port 65534
>> 2020-02-24T12:56:14.113Z|00032|bridge|INFO|bridge br1: added interface br1 on port 65534
>> 2020-02-24T12:56:14.113Z|00033|bridge|INFO|bridge br2: using datapath ID 0000aa66aa660002
>> 2020-02-24T12:56:14.113Z|00034|connmgr|INFO|br2: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br2.mgmt"
>> 2020-02-24T12:56:14.113Z|00035|bridge|INFO|bridge br1: using datapath ID 0000aa66aa660001
>> 2020-02-24T12:56:14.113Z|00036|connmgr|INFO|br1: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br1.mgmt"
>> 2020-02-24T12:56:14.120Z|00037|unixctl|DBG|received request vlog/set["ofproto_dpif:dbg"], id=0
>> 2020-02-24T12:56:14.120Z|00038|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.124Z|00039|unixctl|DBG|received request vlog/set["ofproto_dpif_xlate:dbg"], id=0
>> 2020-02-24T12:56:14.124Z|00040|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.129Z|00041|vconn|DBG|unix#2: sent (Success): OFPT_HELLO (OF1.5) (xid=0x1):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.129Z|00042|vconn|DBG|unix#2: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.129Z|00043|vconn|DBG|unix#2: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.129Z|00044|vconn|DBG|unix#2: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.130Z|00045|vconn|DBG|unix#2: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=0, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    tables 1...253: ditto
>> 2020-02-24T12:56:14.130Z|00046|vconn|DBG|unix#3: sent (Success): OFPT_HELLO (OF1.5) (xid=0x2):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.130Z|00047|vconn|DBG|unix#3: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.130Z|00048|vconn|DBG|unix#3: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.130Z|00049|vconn|DBG|unix#3: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.130Z|00050|vconn|DBG|unix#3: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:fedcba9876543210
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br0): addr:aa:66:aa:66:00:00
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.131Z|00051|vconn|DBG|unix#4: sent (Success): OFPT_HELLO (OF1.5) (xid=0x3):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.131Z|00052|vconn|DBG|unix#4: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.131Z|00053|vconn|DBG|unix#4: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.131Z|00054|vconn|DBG|unix#4: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.131Z|00055|vconn|DBG|unix#4: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.131Z|00056|vconn|DBG|unix#4: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.131Z|00057|connmgr|INFO|br0<->unix#4: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.136Z|00058|vconn|DBG|unix#5: sent (Success): OFPT_HELLO (OF1.5) (xid=0x4):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.136Z|00059|vconn|DBG|unix#5: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.136Z|00060|vconn|DBG|unix#5: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.136Z|00061|vconn|DBG|unix#5: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.136Z|00062|vconn|DBG|unix#5: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=1, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    table 1:
>>      active=0, lookup=0, matched=0
>>      (same features)
>>
>>    tables 2...253: ditto
>> 2020-02-24T12:56:14.137Z|00063|vconn|DBG|unix#6: sent (Success): OFPT_HELLO (OF1.5) (xid=0x5):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.137Z|00064|vconn|DBG|unix#6: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.137Z|00065|vconn|DBG|unix#6: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.137Z|00066|vconn|DBG|unix#6: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.137Z|00067|vconn|DBG|unix#6: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660001
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br1): addr:aa:66:aa:66:00:01
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.137Z|00068|vconn|DBG|unix#7: sent (Success): OFPT_HELLO (OF1.5) (xid=0x6):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.137Z|00069|vconn|DBG|unix#7: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.137Z|00070|vconn|DBG|unix#7: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.138Z|00071|vconn|DBG|unix#7: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.138Z|00072|vconn|DBG|unix#7: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.138Z|00073|vconn|DBG|unix#7: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.138Z|00074|connmgr|INFO|br1<->unix#7: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.143Z|00075|vconn|DBG|unix#8: sent (Success): OFPT_HELLO (OF1.5) (xid=0x7):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.143Z|00076|vconn|DBG|unix#8: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.143Z|00077|vconn|DBG|unix#8: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.143Z|00078|vconn|DBG|unix#8: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.143Z|00079|vconn|DBG|unix#8: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=1, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    table 1:
>>      active=0, lookup=0, matched=0
>>      (same features)
>>
>>    tables 2...253: ditto
>> 2020-02-24T12:56:14.144Z|00080|vconn|DBG|unix#9: sent (Success): OFPT_HELLO (OF1.5) (xid=0x8):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.144Z|00081|vconn|DBG|unix#9: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.144Z|00082|vconn|DBG|unix#9: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.144Z|00083|vconn|DBG|unix#9: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.144Z|00084|vconn|DBG|unix#9: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660002
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br2): addr:aa:66:aa:66:00:02
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.144Z|00085|vconn|DBG|unix#10: sent (Success): OFPT_HELLO (OF1.5) (xid=0x9):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.144Z|00086|vconn|DBG|unix#10: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.144Z|00087|vconn|DBG|unix#10: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.145Z|00088|vconn|DBG|unix#10: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.145Z|00089|vconn|DBG|unix#10: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.145Z|00090|vconn|DBG|unix#10: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.145Z|00091|connmgr|INFO|br2<->unix#10: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.154Z|00092|bridge|INFO|bridge br0: added interface p1 on port 1
>> 2020-02-24T12:56:14.154Z|00093|ofproto_dpif|DBG|port p1: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.164Z|00094|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.164Z|00095|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.164Z|00096|bridge|INFO|bridge br0: added interface p2 on port 2
>> 2020-02-24T12:56:14.164Z|00097|ofproto_dpif|DBG|port p2: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.174Z|00098|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.174Z|00099|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.174Z|00100|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connecting...
>> 2020-02-24T12:56:14.174Z|00101|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connected
>> 2020-02-24T12:56:14.174Z|00102|bridge|INFO|bridge br1: added interface p3 on port 3
>> 2020-02-24T12:56:14.174Z|00103|ofproto_dpif|DBG|port p3: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.183Z|00104|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.183Z|00105|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.184Z|00106|bridge|INFO|bridge br1: added interface p4 on port 4
>> 2020-02-24T12:56:14.184Z|00107|ofproto_dpif|DBG|port p4: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.193Z|00108|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.193Z|00109|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.193Z|00110|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connecting...
>> 2020-02-24T12:56:14.193Z|00111|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connected
>> 2020-02-24T12:56:14.193Z|00112|bridge|INFO|bridge br2: added interface p5 on port 5
>> 2020-02-24T12:56:14.193Z|00113|ofproto_dpif|DBG|port p5: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.203Z|00114|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.203Z|00115|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.203Z|00116|bridge|INFO|bridge br2: added interface p6 on port 6
>> 2020-02-24T12:56:14.203Z|00117|ofproto_dpif|DBG|port p6: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.208Z|00118|unixctl|DBG|received request netdev-dummy/set-admin-state["up"], id=0
>> 2020-02-24T12:56:14.208Z|00119|unixctl|DBG|replying with success, id=0: "OK"
>> 2020-02-24T12:56:14.212Z|00120|unixctl|DBG|received request time/stop[], id=0
>> 2020-02-24T12:56:14.212Z|00121|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.215Z|00122|unixctl|DBG|received request time/warp["6000","3000"], id=0
>> 2020-02-24T12:56:14.225Z|00123|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.225Z|00124|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connected
>> 2020-02-24T12:56:14.225Z|00125|unixctl|DBG|replying with success, id=0: "warped"
>> 2020-02-24T12:56:14.236Z|00126|ofproto_dpif_xlate|DBG|/proc/sys/net/core/netdev_max_backlog: using 1000 max_backlog
>> 2020-02-24T12:56:14.236Z|00127|unixctl|DBG|received request time/warp["30000","3000"], id=0
>> 2020-02-24T12:56:14.257Z|00128|memory|INFO|14512 kB peak resident set size after 12.2 seconds
>> 2020-02-24T12:56:14.257Z|00129|memory|INFO|handlers:5 ports:9 revalidators:3 rules:17
>> 2020-02-24T12:56:14.268Z|00130|ofproto_dpif|DBG|port p2: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00131|ofproto_dpif|DBG|port p1: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00132|ofproto_dpif|DBG|port p6: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00133|ofproto_dpif|DBG|port p5: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00134|ofproto_dpif|DBG|port p3: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00135|ofproto_dpif|DBG|port p4: STP state changed from listening to learning
>> 2020-02-24T12:56:14.280Z|00001|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>> 2020-02-24T12:56:14.280Z|00002|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>> 2020-02-24T12:56:14.323Z|00136|stp|INFO|br0: detected topology change.
>> 2020-02-24T12:56:14.323Z|00137|stp|INFO|br0: detected topology change.
>> 2020-02-24T12:56:14.323Z|00138|ofproto_dpif|DBG|port p2: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00139|ofproto_dpif|DBG|port p1: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00140|stp|INFO|br2: detected topology change.
>> 2020-02-24T12:56:14.323Z|00141|stp|INFO|br2: detected topology change.
>> 2020-02-24T12:56:14.323Z|00142|ofproto_dpif|DBG|port p6: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00143|ofproto_dpif|DBG|port p5: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00144|stp|INFO|br1: detected topology change.
>> 2020-02-24T12:56:14.323Z|00145|ofproto_dpif|DBG|port p3: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00146|ofproto_dpif|DBG|port p4: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.334Z|00147|unixctl|DBG|replying with success, id=0: "warped"
> 2185. stp.at:467: 2185. STP - flush the fdb and mdb when topology
> changed (stp.at:467): FAILED (stp.at:529)
> 
>
Anton Ivanov Feb. 24, 2020, 2:59 p.m. UTC | #12
This is not an stp failure, it is stp showing an underlying netdev-dummy issue similar to the netlink one earlier on.

Brgds,

A.

On 24/02/2020 12:58, Dumitru Ceara wrote:
> On Mon, Feb 24, 2020 at 11:47 AM Anton Ivanov
> <anton.ivanov@cambridgegreys.com> wrote:
>>
>>
>> On 19/02/2020 14:20, Dumitru Ceara wrote:
>>> On 2/18/20 7:12 AM, Anton Ivanov wrote:
>>>>
>>>> On 17/02/2020 14:48, Dumitru Ceara wrote:
>>>>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>>>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>>>
>>>>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>>>>> These are waited upon in the same thread where they are created. This
>>>>>> allows them to be registered persistently with the OS (if possible)
>>>>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>>>>> etc.
>>>>>>
>>>>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>>>>> which is not ready if that fd has been registered as "private" to the
>>>>>> thread which waits upon it.
>>>>>>
>>>>>> 3. No longer breaks other parts of OVS which create the fd in one
>>>>>> thread and waits upon it in others.
>>>>>>
>>>>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>>>>>> poll++ frameworks in other OSes.
>>>>>>
>>>>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>>>>> using a "centeral (e)poll dispatcher + IO threads" pattern
>>>>>>
>>>>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>> Hi Anton,
>>>>>
>>>>> A couple of issues inline. Except for that:
>>>>>
>>>>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>>>>> failing with your patches applied:
>>>>>
>>>>> make check TESTSUITEFLAGS='-k "flush the fdb"'
>>>> I will have a look.
>>
>>
>> I cannot reproduce that. It succeeds every time - 20 out of 20 runs. Can you send me some logs please?
>>
>>
> 
> Hi Anton,
> 
> After rebasing to latest upstream master I see the failure less often.
> But it's still there.
> 
> I pasted the testsuite below.
> 
> Regards,
> Dumitru
> 
> $ cat tests/testsuite.dir/2185/testsuite.log
> #                             -*- compilation -*-
> 2185. stp.at:467: testing STP - flush the fdb and mdb when topology changed ...
> ./stp.at:468: ovsdb-tool create conf.db
> $abs_top_srcdir/vswitchd/vswitch.ovsschema
> ./stp.at:468: ovsdb-server --detach --no-chdir --pidfile --log-file
> --remote=punix:$OVS_RUNDIR/db.sock
> stderr:
> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
> ./stp.at:468: sed < stderr '
> /vlog|INFO|opened log file/d
> /ovsdb_server|INFO|ovsdb-server (Open vSwitch)/d'
> ./stp.at:468: ovs-vsctl --no-wait init
> ./stp.at:468: ovs-vswitchd --enable-dummy --disable-system
> --disable-system-route  --detach --no-chdir --pidfile --log-file
> -vvconn -vofproto_dpif -vunixctl
> stderr:
> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
> 2020-02-24T12:56:14Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
> 2020-02-24T12:56:14Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
> 2020-02-24T12:56:14Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
> connecting...
> 2020-02-24T12:56:14Z|00005|netlink_socket|INFO|netlink: could not
> enable listening to all nsid (Operation not permitted)
> 2020-02-24T12:56:14Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
> connected
> ./stp.at:468: sed < stderr '
> /ovs_numa|INFO|Discovered /d
> /vlog|INFO|opened log file/d
> /vswitchd|INFO|ovs-vswitchd (Open vSwitch)/d
> /reconnect|INFO|/d
> /dpif_netlink|INFO|Generic Netlink family .ovs_datapath. does not exist/d
> /ofproto|INFO|using datapath ID/d
> /netdev_linux|INFO|.*device has unknown hardware address family/d
> /ofproto|INFO|datapath ID changed to fedcba9876543210/d
> /dpdk|INFO|DPDK Disabled - Use other_config:dpdk-init to enable/d
> /netlink_socket|INFO|netlink: could not enable listening to all nsid/d
> /probe tc:/d
> /tc: Using policy/d'
> ./stp.at:468: add_of_br 0
> ./stp.at:471:
>      ovs-vsctl -- \
>      set port br0 other_config:stp-enable=false -- \
>      set bridge br0 datapath-type=dummy -- \
>      set bridge br0 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:00 -- \
>      add-br br1 -- \
>      set port br1 other_config:stp-enable=false -- \
>      set bridge br1 datapath-type=dummy -- \
>      set bridge br1 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:01 -- \
>      add-br br2 -- \
>      set port br2 other_config:stp-enable=false -- \
>      set bridge br2 datapath-type=dummy -- \
>      set bridge br2 stp_enable=true mcast_snooping_enable=true \
>      other-config:hwaddr=aa:66:aa:66:00:02
> 
> ./stp.at:489: ovs-appctl vlog/set ofproto_dpif:dbg
> ./stp.at:490: ovs-appctl vlog/set ofproto_dpif_xlate:dbg
> ./stp.at:492: ovs-ofctl add-flow br0 action=normal
> ./stp.at:493: ovs-ofctl add-flow br1 action=normal
> ./stp.at:494: ovs-ofctl add-flow br2 action=normal
> ./stp.at:496:
>      ovs-vsctl add-port br0 p1 -- \
>          set interface p1 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1
>      ovs-vsctl add-port br0 p2 -- \
>          set interface p2 type=dummy
> options:stream=unix:$OVS_RUNDIR/p6.sock ofport_request=2
>      ovs-vsctl add-port br1 p3 -- \
>          set interface p3 type=dummy
> options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3
>      ovs-vsctl add-port br1 p4 -- \
>          set interface p4 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p4.sock ofport_request=4
>      ovs-vsctl add-port br2 p5 -- \
>          set interface p5 type=dummy
> options:stream=unix:$OVS_RUNDIR/p4.sock ofport_request=5
>      ovs-vsctl add-port br2 p6 -- \
>          set interface p6 type=dummy
> options:pstream=punix:$OVS_RUNDIR/p6.sock ofport_request=6
> 
> OK
> warped
> ./stp.at:517: cat ovs-vswitchd.log |
> grep 'disabled to listening' | sed '
>    s/.*ofproto_dpif|.*|port .*:/port <>:/
> '
> warped
> ./stp.at:529: cat ovs-vswitchd.log |
> grep 'learning to forwarding' | sed '
>    s/.*ofproto_dpif|.*|port .*:/port <>:/
> '
> --- - 2020-02-24 13:56:14.337746686 +0100
> +++ /home/dceara/git-repos/ovs/tests/testsuite.dir/at-groups/2185/stdout
> 2020-02-24 13:56:14.000000000 +0100
> @@ -3,4 +3,5 @@
>   port <>: STP state changed from learning to forwarding
>   port <>: STP state changed from learning to forwarding
>   port <>: STP state changed from learning to forwarding
> +port <>: STP state changed from learning to forwarding
> 
> ovsdb-server.log:
>> 2020-02-24T12:56:14.033Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
>> 2020-02-24T12:56:14.038Z|00002|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.13.90
> ovs-vswitchd.log:
>> 2020-02-24T12:56:14.059Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
>> 2020-02-24T12:56:14.061Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
>> 2020-02-24T12:56:14.061Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
>> 2020-02-24T12:56:14.061Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connecting...
>> 2020-02-24T12:56:14.061Z|00005|netlink_socket|INFO|netlink: could not enable listening to all nsid (Operation not permitted)
>> 2020-02-24T12:56:14.061Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connected
>> 2020-02-24T12:56:14.065Z|00007|bridge|INFO|ovs-vswitchd (Open vSwitch) 2.13.90
>> 2020-02-24T12:56:14.091Z|00008|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports recirculation
>> 2020-02-24T12:56:14.091Z|00009|ofproto_dpif|INFO|dummy@ovs-dummy: VLAN header stack length probed as 1
>> 2020-02-24T12:56:14.091Z|00010|ofproto_dpif|INFO|dummy@ovs-dummy: MPLS label stack length probed as 3
>> 2020-02-24T12:56:14.091Z|00011|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports truncate action
>> 2020-02-24T12:56:14.091Z|00012|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports unique flow ids
>> 2020-02-24T12:56:14.091Z|00013|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports clone action
>> 2020-02-24T12:56:14.091Z|00014|ofproto_dpif|INFO|dummy@ovs-dummy: Max sample nesting level probed as 10
>> 2020-02-24T12:56:14.091Z|00015|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports eventmask in conntrack action
>> 2020-02-24T12:56:14.091Z|00016|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_clear action
>> 2020-02-24T12:56:14.091Z|00017|ofproto_dpif|INFO|dummy@ovs-dummy: Max dp_hash algorithm probed to be 1
>> 2020-02-24T12:56:14.091Z|00018|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports check_pkt_len action
>> 2020-02-24T12:56:14.091Z|00019|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports timeout policy in conntrack action
>> 2020-02-24T12:56:14.091Z|00020|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state
>> 2020-02-24T12:56:14.091Z|00021|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_zone
>> 2020-02-24T12:56:14.091Z|00022|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_mark
>> 2020-02-24T12:56:14.091Z|00023|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_label
>> 2020-02-24T12:56:14.091Z|00024|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state_nat
>> 2020-02-24T12:56:14.091Z|00025|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple
>> 2020-02-24T12:56:14.091Z|00026|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple6
>> 2020-02-24T12:56:14.091Z|00027|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports IPv6 ND Extensions
>> 2020-02-24T12:56:14.100Z|00028|bridge|INFO|bridge br0: added interface br0 on port 65534
>> 2020-02-24T12:56:14.101Z|00029|bridge|INFO|bridge br0: using datapath ID fedcba9876543210
>> 2020-02-24T12:56:14.101Z|00030|connmgr|INFO|br0: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br0.mgmt"
>> 2020-02-24T12:56:14.113Z|00031|bridge|INFO|bridge br2: added interface br2 on port 65534
>> 2020-02-24T12:56:14.113Z|00032|bridge|INFO|bridge br1: added interface br1 on port 65534
>> 2020-02-24T12:56:14.113Z|00033|bridge|INFO|bridge br2: using datapath ID 0000aa66aa660002
>> 2020-02-24T12:56:14.113Z|00034|connmgr|INFO|br2: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br2.mgmt"
>> 2020-02-24T12:56:14.113Z|00035|bridge|INFO|bridge br1: using datapath ID 0000aa66aa660001
>> 2020-02-24T12:56:14.113Z|00036|connmgr|INFO|br1: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br1.mgmt"
>> 2020-02-24T12:56:14.120Z|00037|unixctl|DBG|received request vlog/set["ofproto_dpif:dbg"], id=0
>> 2020-02-24T12:56:14.120Z|00038|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.124Z|00039|unixctl|DBG|received request vlog/set["ofproto_dpif_xlate:dbg"], id=0
>> 2020-02-24T12:56:14.124Z|00040|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.129Z|00041|vconn|DBG|unix#2: sent (Success): OFPT_HELLO (OF1.5) (xid=0x1):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.129Z|00042|vconn|DBG|unix#2: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.129Z|00043|vconn|DBG|unix#2: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.129Z|00044|vconn|DBG|unix#2: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.130Z|00045|vconn|DBG|unix#2: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=0, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    tables 1...253: ditto
>> 2020-02-24T12:56:14.130Z|00046|vconn|DBG|unix#3: sent (Success): OFPT_HELLO (OF1.5) (xid=0x2):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.130Z|00047|vconn|DBG|unix#3: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.130Z|00048|vconn|DBG|unix#3: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.130Z|00049|vconn|DBG|unix#3: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.130Z|00050|vconn|DBG|unix#3: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:fedcba9876543210
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br0): addr:aa:66:aa:66:00:00
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.131Z|00051|vconn|DBG|unix#4: sent (Success): OFPT_HELLO (OF1.5) (xid=0x3):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.131Z|00052|vconn|DBG|unix#4: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.131Z|00053|vconn|DBG|unix#4: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.131Z|00054|vconn|DBG|unix#4: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.131Z|00055|vconn|DBG|unix#4: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.131Z|00056|vconn|DBG|unix#4: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.131Z|00057|connmgr|INFO|br0<->unix#4: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.136Z|00058|vconn|DBG|unix#5: sent (Success): OFPT_HELLO (OF1.5) (xid=0x4):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.136Z|00059|vconn|DBG|unix#5: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.136Z|00060|vconn|DBG|unix#5: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.136Z|00061|vconn|DBG|unix#5: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.136Z|00062|vconn|DBG|unix#5: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=1, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    table 1:
>>      active=0, lookup=0, matched=0
>>      (same features)
>>
>>    tables 2...253: ditto
>> 2020-02-24T12:56:14.137Z|00063|vconn|DBG|unix#6: sent (Success): OFPT_HELLO (OF1.5) (xid=0x5):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.137Z|00064|vconn|DBG|unix#6: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.137Z|00065|vconn|DBG|unix#6: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.137Z|00066|vconn|DBG|unix#6: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.137Z|00067|vconn|DBG|unix#6: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660001
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br1): addr:aa:66:aa:66:00:01
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.137Z|00068|vconn|DBG|unix#7: sent (Success): OFPT_HELLO (OF1.5) (xid=0x6):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.137Z|00069|vconn|DBG|unix#7: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.137Z|00070|vconn|DBG|unix#7: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.138Z|00071|vconn|DBG|unix#7: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.138Z|00072|vconn|DBG|unix#7: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.138Z|00073|vconn|DBG|unix#7: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.138Z|00074|connmgr|INFO|br1<->unix#7: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.143Z|00075|vconn|DBG|unix#8: sent (Success): OFPT_HELLO (OF1.5) (xid=0x7):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.143Z|00076|vconn|DBG|unix#8: received: OFPT_HELLO (xid=0x1):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.143Z|00077|vconn|DBG|unix#8: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.143Z|00078|vconn|DBG|unix#8: received: OFPST_TABLE request (xid=0x2):
>> 2020-02-24T12:56:14.143Z|00079|vconn|DBG|unix#8: sent (Success): OFPST_TABLE reply (xid=0x2):
>>    table 0:
>>      active=1, lookup=0, matched=0
>>      max_entries=1000000
>>      matching:
>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>
>>    table 1:
>>      active=0, lookup=0, matched=0
>>      (same features)
>>
>>    tables 2...253: ditto
>> 2020-02-24T12:56:14.144Z|00080|vconn|DBG|unix#9: sent (Success): OFPT_HELLO (OF1.5) (xid=0x8):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.144Z|00081|vconn|DBG|unix#9: received: OFPT_HELLO (xid=0x3):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.144Z|00082|vconn|DBG|unix#9: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.144Z|00083|vconn|DBG|unix#9: received: OFPT_FEATURES_REQUEST (xid=0x4):
>> 2020-02-24T12:56:14.144Z|00084|vconn|DBG|unix#9: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660002
>> n_tables:254, n_buffers:0
>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>   LOCAL(br2): addr:aa:66:aa:66:00:02
>>       config:     0
>>       state:      0
>>       speed: 0 Mbps now, 0 Mbps max
>> 2020-02-24T12:56:14.144Z|00085|vconn|DBG|unix#10: sent (Success): OFPT_HELLO (OF1.5) (xid=0x9):
>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>> 2020-02-24T12:56:14.144Z|00086|vconn|DBG|unix#10: received: OFPT_HELLO (xid=0x5):
>>   version bitmap: 0x01
>> 2020-02-24T12:56:14.144Z|00087|vconn|DBG|unix#10: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>> 2020-02-24T12:56:14.145Z|00088|vconn|DBG|unix#10: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>> 2020-02-24T12:56:14.145Z|00089|vconn|DBG|unix#10: received: OFPT_BARRIER_REQUEST (xid=0x7):
>> 2020-02-24T12:56:14.145Z|00090|vconn|DBG|unix#10: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>> 2020-02-24T12:56:14.145Z|00091|connmgr|INFO|br2<->unix#10: 1 flow_mods in the last 0 s (1 adds)
>> 2020-02-24T12:56:14.154Z|00092|bridge|INFO|bridge br0: added interface p1 on port 1
>> 2020-02-24T12:56:14.154Z|00093|ofproto_dpif|DBG|port p1: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.164Z|00094|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.164Z|00095|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.164Z|00096|bridge|INFO|bridge br0: added interface p2 on port 2
>> 2020-02-24T12:56:14.164Z|00097|ofproto_dpif|DBG|port p2: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.174Z|00098|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.174Z|00099|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.174Z|00100|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connecting...
>> 2020-02-24T12:56:14.174Z|00101|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connected
>> 2020-02-24T12:56:14.174Z|00102|bridge|INFO|bridge br1: added interface p3 on port 3
>> 2020-02-24T12:56:14.174Z|00103|ofproto_dpif|DBG|port p3: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.183Z|00104|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.183Z|00105|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.184Z|00106|bridge|INFO|bridge br1: added interface p4 on port 4
>> 2020-02-24T12:56:14.184Z|00107|ofproto_dpif|DBG|port p4: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.193Z|00108|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.193Z|00109|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.193Z|00110|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connecting...
>> 2020-02-24T12:56:14.193Z|00111|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connected
>> 2020-02-24T12:56:14.193Z|00112|bridge|INFO|bridge br2: added interface p5 on port 5
>> 2020-02-24T12:56:14.193Z|00113|ofproto_dpif|DBG|port p5: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.203Z|00114|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.203Z|00115|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>> 2020-02-24T12:56:14.203Z|00116|bridge|INFO|bridge br2: added interface p6 on port 6
>> 2020-02-24T12:56:14.203Z|00117|ofproto_dpif|DBG|port p6: STP state changed from disabled to listening
>> 2020-02-24T12:56:14.208Z|00118|unixctl|DBG|received request netdev-dummy/set-admin-state["up"], id=0
>> 2020-02-24T12:56:14.208Z|00119|unixctl|DBG|replying with success, id=0: "OK"
>> 2020-02-24T12:56:14.212Z|00120|unixctl|DBG|received request time/stop[], id=0
>> 2020-02-24T12:56:14.212Z|00121|unixctl|DBG|replying with success, id=0: ""
>> 2020-02-24T12:56:14.215Z|00122|unixctl|DBG|received request time/warp["6000","3000"], id=0
>> 2020-02-24T12:56:14.225Z|00123|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>> 2020-02-24T12:56:14.225Z|00124|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connected
>> 2020-02-24T12:56:14.225Z|00125|unixctl|DBG|replying with success, id=0: "warped"
>> 2020-02-24T12:56:14.236Z|00126|ofproto_dpif_xlate|DBG|/proc/sys/net/core/netdev_max_backlog: using 1000 max_backlog
>> 2020-02-24T12:56:14.236Z|00127|unixctl|DBG|received request time/warp["30000","3000"], id=0
>> 2020-02-24T12:56:14.257Z|00128|memory|INFO|14512 kB peak resident set size after 12.2 seconds
>> 2020-02-24T12:56:14.257Z|00129|memory|INFO|handlers:5 ports:9 revalidators:3 rules:17
>> 2020-02-24T12:56:14.268Z|00130|ofproto_dpif|DBG|port p2: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00131|ofproto_dpif|DBG|port p1: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00132|ofproto_dpif|DBG|port p6: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00133|ofproto_dpif|DBG|port p5: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00134|ofproto_dpif|DBG|port p3: STP state changed from listening to learning
>> 2020-02-24T12:56:14.268Z|00135|ofproto_dpif|DBG|port p4: STP state changed from listening to learning
>> 2020-02-24T12:56:14.280Z|00001|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>> 2020-02-24T12:56:14.280Z|00002|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>> 2020-02-24T12:56:14.323Z|00136|stp|INFO|br0: detected topology change.
>> 2020-02-24T12:56:14.323Z|00137|stp|INFO|br0: detected topology change.
>> 2020-02-24T12:56:14.323Z|00138|ofproto_dpif|DBG|port p2: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00139|ofproto_dpif|DBG|port p1: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00140|stp|INFO|br2: detected topology change.
>> 2020-02-24T12:56:14.323Z|00141|stp|INFO|br2: detected topology change.
>> 2020-02-24T12:56:14.323Z|00142|ofproto_dpif|DBG|port p6: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00143|ofproto_dpif|DBG|port p5: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00144|stp|INFO|br1: detected topology change.
>> 2020-02-24T12:56:14.323Z|00145|ofproto_dpif|DBG|port p3: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.323Z|00146|ofproto_dpif|DBG|port p4: STP state changed from learning to forwarding
>> 2020-02-24T12:56:14.334Z|00147|unixctl|DBG|replying with success, id=0: "warped"
> 2185. stp.at:467: 2185. STP - flush the fdb and mdb when topology
> changed (stp.at:467): FAILED (stp.at:529)
> 
>
Anton Ivanov Feb. 25, 2020, 8:02 a.m. UTC | #13
I have worked around it. Looking at the source I can guarantee that it is a work-around and not a 100% fix, but it has survived an overnight loop run on a 6 core/12 thread 3.5 GHz Ryzen.

The root cause is that instead of using a proper packet-oriented interface (e.g. SOCK_SEQPACKET or SOCK_DGRAM), the dummy network device used in tests uses:

1. Unix Domain socket via the stream-unix/stream-fd which provides a stream abstraction, not a packet one.

2. Has no packet framing

3. Writes and waits on the same stream fd out of different threads.

None of these is the right thing to do. Unfortunately, fixing it properly will require both fixing netdev-dummy and a large number of tests which use it, so I have "taped over" it for the time being.

I will send the updated patchset as well as (hopefully) the first patches on top which multi-thread IO/SSL/JSON/(hopefully some processing) sometime later this week.

Brgds,

A.

On 24/02/2020 14:59, Anton Ivanov wrote:
> This is not an stp failure, it is stp showing an underlying netdev-dummy issue similar to the netlink one earlier on.
>
> Brgds,
>
> A.
>
> On 24/02/2020 12:58, Dumitru Ceara wrote:
>> On Mon, Feb 24, 2020 at 11:47 AM Anton Ivanov
>> <anton.ivanov@cambridgegreys.com> wrote:
>>>
>>>
>>> On 19/02/2020 14:20, Dumitru Ceara wrote:
>>>> On 2/18/20 7:12 AM, Anton Ivanov wrote:
>>>>>
>>>>> On 17/02/2020 14:48, Dumitru Ceara wrote:
>>>>>> On 2/14/20 6:54 PM, anton.ivanov@cambridgegreys.com wrote:
>>>>>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>>>>
>>>>>>> 1. Adds "persistent" behaviour where feasible (streams and signals).
>>>>>>> These are waited upon in the same thread where they are created. This
>>>>>>> allows them to be registered persistently with the OS (if possible)
>>>>>>> as well as the OS to provide hints - is the FD ready, is it closed,
>>>>>>> etc.
>>>>>>>
>>>>>>> 2. Removes unnecessary attempts to perform a read vs EAGAIN on a fd
>>>>>>> which is not ready if that fd has been registered as "private" to the
>>>>>>> thread which waits upon it.
>>>>>>>
>>>>>>> 3. No longer breaks other parts of OVS which create the fd in one
>>>>>>> thread and waits upon it in others.
>>>>>>>
>>>>>>> 3. Adds support for EPOLL on Linux and can be expanded to cover similar
>>>>>>> poll++ frameworks in other OSes.
>>>>>>>
>>>>>>> 4. Sets up the necessary infrastructure to make IO/SSL multi-threaded
>>>>>>> using a "centeral (e)poll dispatcher + IO threads" pattern
>>>>>>>
>>>>>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>>>> Hi Anton,
>>>>>>
>>>>>> A couple of issues inline. Except for that:
>>>>>>
>>>>>> 1. The "STP - flush the fdb and mdb when topology changed" OVS test is
>>>>>> failing with your patches applied:
>>>>>>
>>>>>> make check TESTSUITEFLAGS='-k "flush the fdb"'
>>>>> I will have a look.
>>>
>>>
>>> I cannot reproduce that. It succeeds every time - 20 out of 20 runs. Can you send me some logs please?
>>>
>>>
>>
>> Hi Anton,
>>
>> After rebasing to latest upstream master I see the failure less often.
>> But it's still there.
>>
>> I pasted the testsuite below.
>>
>> Regards,
>> Dumitru
>>
>> $ cat tests/testsuite.dir/2185/testsuite.log
>> #                             -*- compilation -*-
>> 2185. stp.at:467: testing STP - flush the fdb and mdb when topology changed ...
>> ./stp.at:468: ovsdb-tool create conf.db
>> $abs_top_srcdir/vswitchd/vswitch.ovsschema
>> ./stp.at:468: ovsdb-server --detach --no-chdir --pidfile --log-file
>> --remote=punix:$OVS_RUNDIR/db.sock
>> stderr:
>> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
>> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
>> ./stp.at:468: sed < stderr '
>> /vlog|INFO|opened log file/d
>> /ovsdb_server|INFO|ovsdb-server (Open vSwitch)/d'
>> ./stp.at:468: ovs-vsctl --no-wait init
>> ./stp.at:468: ovs-vswitchd --enable-dummy --disable-system
>> --disable-system-route  --detach --no-chdir --pidfile --log-file
>> -vvconn -vofproto_dpif -vunixctl
>> stderr:
>> 2020-02-24T12:56:14Z|00001|vlog|INFO|opened log file
>> /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
>> 2020-02-24T12:56:14Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
>> 2020-02-24T12:56:14Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
>> 2020-02-24T12:56:14Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
>> connecting...
>> 2020-02-24T12:56:14Z|00005|netlink_socket|INFO|netlink: could not
>> enable listening to all nsid (Operation not permitted)
>> 2020-02-24T12:56:14Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock:
>> connected
>> ./stp.at:468: sed < stderr '
>> /ovs_numa|INFO|Discovered /d
>> /vlog|INFO|opened log file/d
>> /vswitchd|INFO|ovs-vswitchd (Open vSwitch)/d
>> /reconnect|INFO|/d
>> /dpif_netlink|INFO|Generic Netlink family .ovs_datapath. does not exist/d
>> /ofproto|INFO|using datapath ID/d
>> /netdev_linux|INFO|.*device has unknown hardware address family/d
>> /ofproto|INFO|datapath ID changed to fedcba9876543210/d
>> /dpdk|INFO|DPDK Disabled - Use other_config:dpdk-init to enable/d
>> /netlink_socket|INFO|netlink: could not enable listening to all nsid/d
>> /probe tc:/d
>> /tc: Using policy/d'
>> ./stp.at:468: add_of_br 0
>> ./stp.at:471:
>>      ovs-vsctl -- \
>>      set port br0 other_config:stp-enable=false -- \
>>      set bridge br0 datapath-type=dummy -- \
>>      set bridge br0 stp_enable=true mcast_snooping_enable=true \
>>      other-config:hwaddr=aa:66:aa:66:00:00 -- \
>>      add-br br1 -- \
>>      set port br1 other_config:stp-enable=false -- \
>>      set bridge br1 datapath-type=dummy -- \
>>      set bridge br1 stp_enable=true mcast_snooping_enable=true \
>>      other-config:hwaddr=aa:66:aa:66:00:01 -- \
>>      add-br br2 -- \
>>      set port br2 other_config:stp-enable=false -- \
>>      set bridge br2 datapath-type=dummy -- \
>>      set bridge br2 stp_enable=true mcast_snooping_enable=true \
>>      other-config:hwaddr=aa:66:aa:66:00:02
>>
>> ./stp.at:489: ovs-appctl vlog/set ofproto_dpif:dbg
>> ./stp.at:490: ovs-appctl vlog/set ofproto_dpif_xlate:dbg
>> ./stp.at:492: ovs-ofctl add-flow br0 action=normal
>> ./stp.at:493: ovs-ofctl add-flow br1 action=normal
>> ./stp.at:494: ovs-ofctl add-flow br2 action=normal
>> ./stp.at:496:
>>      ovs-vsctl add-port br0 p1 -- \
>>          set interface p1 type=dummy
>> options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1
>>      ovs-vsctl add-port br0 p2 -- \
>>          set interface p2 type=dummy
>> options:stream=unix:$OVS_RUNDIR/p6.sock ofport_request=2
>>      ovs-vsctl add-port br1 p3 -- \
>>          set interface p3 type=dummy
>> options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3
>>      ovs-vsctl add-port br1 p4 -- \
>>          set interface p4 type=dummy
>> options:pstream=punix:$OVS_RUNDIR/p4.sock ofport_request=4
>>      ovs-vsctl add-port br2 p5 -- \
>>          set interface p5 type=dummy
>> options:stream=unix:$OVS_RUNDIR/p4.sock ofport_request=5
>>      ovs-vsctl add-port br2 p6 -- \
>>          set interface p6 type=dummy
>> options:pstream=punix:$OVS_RUNDIR/p6.sock ofport_request=6
>>
>> OK
>> warped
>> ./stp.at:517: cat ovs-vswitchd.log |
>> grep 'disabled to listening' | sed '
>>    s/.*ofproto_dpif|.*|port .*:/port <>:/
>> '
>> warped
>> ./stp.at:529: cat ovs-vswitchd.log |
>> grep 'learning to forwarding' | sed '
>>    s/.*ofproto_dpif|.*|port .*:/port <>:/
>> '
>> --- - 2020-02-24 13:56:14.337746686 +0100
>> +++ /home/dceara/git-repos/ovs/tests/testsuite.dir/at-groups/2185/stdout
>> 2020-02-24 13:56:14.000000000 +0100
>> @@ -3,4 +3,5 @@
>>   port <>: STP state changed from learning to forwarding
>>   port <>: STP state changed from learning to forwarding
>>   port <>: STP state changed from learning to forwarding
>> +port <>: STP state changed from learning to forwarding
>>
>> ovsdb-server.log:
>>> 2020-02-24T12:56:14.033Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovsdb-server.log
>>> 2020-02-24T12:56:14.038Z|00002|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.13.90
>> ovs-vswitchd.log:
>>> 2020-02-24T12:56:14.059Z|00001|vlog|INFO|opened log file /home/dceara/git-repos/ovs/tests/testsuite.dir/2185/ovs-vswitchd.log
>>> 2020-02-24T12:56:14.061Z|00002|ovs_numa|INFO|Discovered 8 CPU cores on NUMA node 0
>>> 2020-02-24T12:56:14.061Z|00003|ovs_numa|INFO|Discovered 1 NUMA nodes and 8 CPU cores
>>> 2020-02-24T12:56:14.061Z|00004|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connecting...
>>> 2020-02-24T12:56:14.061Z|00005|netlink_socket|INFO|netlink: could not enable listening to all nsid (Operation not permitted)
>>> 2020-02-24T12:56:14.061Z|00006|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/db.sock: connected
>>> 2020-02-24T12:56:14.065Z|00007|bridge|INFO|ovs-vswitchd (Open vSwitch) 2.13.90
>>> 2020-02-24T12:56:14.091Z|00008|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports recirculation
>>> 2020-02-24T12:56:14.091Z|00009|ofproto_dpif|INFO|dummy@ovs-dummy: VLAN header stack length probed as 1
>>> 2020-02-24T12:56:14.091Z|00010|ofproto_dpif|INFO|dummy@ovs-dummy: MPLS label stack length probed as 3
>>> 2020-02-24T12:56:14.091Z|00011|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports truncate action
>>> 2020-02-24T12:56:14.091Z|00012|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports unique flow ids
>>> 2020-02-24T12:56:14.091Z|00013|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports clone action
>>> 2020-02-24T12:56:14.091Z|00014|ofproto_dpif|INFO|dummy@ovs-dummy: Max sample nesting level probed as 10
>>> 2020-02-24T12:56:14.091Z|00015|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports eventmask in conntrack action
>>> 2020-02-24T12:56:14.091Z|00016|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_clear action
>>> 2020-02-24T12:56:14.091Z|00017|ofproto_dpif|INFO|dummy@ovs-dummy: Max dp_hash algorithm probed to be 1
>>> 2020-02-24T12:56:14.091Z|00018|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports check_pkt_len action
>>> 2020-02-24T12:56:14.091Z|00019|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports timeout policy in conntrack action
>>> 2020-02-24T12:56:14.091Z|00020|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state
>>> 2020-02-24T12:56:14.091Z|00021|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_zone
>>> 2020-02-24T12:56:14.091Z|00022|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_mark
>>> 2020-02-24T12:56:14.091Z|00023|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_label
>>> 2020-02-24T12:56:14.091Z|00024|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_state_nat
>>> 2020-02-24T12:56:14.091Z|00025|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple
>>> 2020-02-24T12:56:14.091Z|00026|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports ct_orig_tuple6
>>> 2020-02-24T12:56:14.091Z|00027|ofproto_dpif|INFO|dummy@ovs-dummy: Datapath supports IPv6 ND Extensions
>>> 2020-02-24T12:56:14.100Z|00028|bridge|INFO|bridge br0: added interface br0 on port 65534
>>> 2020-02-24T12:56:14.101Z|00029|bridge|INFO|bridge br0: using datapath ID fedcba9876543210
>>> 2020-02-24T12:56:14.101Z|00030|connmgr|INFO|br0: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br0.mgmt"
>>> 2020-02-24T12:56:14.113Z|00031|bridge|INFO|bridge br2: added interface br2 on port 65534
>>> 2020-02-24T12:56:14.113Z|00032|bridge|INFO|bridge br1: added interface br1 on port 65534
>>> 2020-02-24T12:56:14.113Z|00033|bridge|INFO|bridge br2: using datapath ID 0000aa66aa660002
>>> 2020-02-24T12:56:14.113Z|00034|connmgr|INFO|br2: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br2.mgmt"
>>> 2020-02-24T12:56:14.113Z|00035|bridge|INFO|bridge br1: using datapath ID 0000aa66aa660001
>>> 2020-02-24T12:56:14.113Z|00036|connmgr|INFO|br1: added service controller "punix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/br1.mgmt"
>>> 2020-02-24T12:56:14.120Z|00037|unixctl|DBG|received request vlog/set["ofproto_dpif:dbg"], id=0
>>> 2020-02-24T12:56:14.120Z|00038|unixctl|DBG|replying with success, id=0: ""
>>> 2020-02-24T12:56:14.124Z|00039|unixctl|DBG|received request vlog/set["ofproto_dpif_xlate:dbg"], id=0
>>> 2020-02-24T12:56:14.124Z|00040|unixctl|DBG|replying with success, id=0: ""
>>> 2020-02-24T12:56:14.129Z|00041|vconn|DBG|unix#2: sent (Success): OFPT_HELLO (OF1.5) (xid=0x1):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.129Z|00042|vconn|DBG|unix#2: received: OFPT_HELLO (xid=0x1):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.129Z|00043|vconn|DBG|unix#2: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.129Z|00044|vconn|DBG|unix#2: received: OFPST_TABLE request (xid=0x2):
>>> 2020-02-24T12:56:14.130Z|00045|vconn|DBG|unix#2: sent (Success): OFPST_TABLE reply (xid=0x2):
>>>    table 0:
>>>      active=0, lookup=0, matched=0
>>>      max_entries=1000000
>>>      matching:
>>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>>
>>>    tables 1...253: ditto
>>> 2020-02-24T12:56:14.130Z|00046|vconn|DBG|unix#3: sent (Success): OFPT_HELLO (OF1.5) (xid=0x2):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.130Z|00047|vconn|DBG|unix#3: received: OFPT_HELLO (xid=0x3):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.130Z|00048|vconn|DBG|unix#3: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.130Z|00049|vconn|DBG|unix#3: received: OFPT_FEATURES_REQUEST (xid=0x4):
>>> 2020-02-24T12:56:14.130Z|00050|vconn|DBG|unix#3: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:fedcba9876543210
>>> n_tables:254, n_buffers:0
>>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>>   LOCAL(br0): addr:aa:66:aa:66:00:00
>>>       config:     0
>>>       state:      0
>>>       speed: 0 Mbps now, 0 Mbps max
>>> 2020-02-24T12:56:14.131Z|00051|vconn|DBG|unix#4: sent (Success): OFPT_HELLO (OF1.5) (xid=0x3):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.131Z|00052|vconn|DBG|unix#4: received: OFPT_HELLO (xid=0x5):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.131Z|00053|vconn|DBG|unix#4: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.131Z|00054|vconn|DBG|unix#4: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>>> 2020-02-24T12:56:14.131Z|00055|vconn|DBG|unix#4: received: OFPT_BARRIER_REQUEST (xid=0x7):
>>> 2020-02-24T12:56:14.131Z|00056|vconn|DBG|unix#4: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>>> 2020-02-24T12:56:14.131Z|00057|connmgr|INFO|br0<->unix#4: 1 flow_mods in the last 0 s (1 adds)
>>> 2020-02-24T12:56:14.136Z|00058|vconn|DBG|unix#5: sent (Success): OFPT_HELLO (OF1.5) (xid=0x4):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.136Z|00059|vconn|DBG|unix#5: received: OFPT_HELLO (xid=0x1):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.136Z|00060|vconn|DBG|unix#5: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.136Z|00061|vconn|DBG|unix#5: received: OFPST_TABLE request (xid=0x2):
>>> 2020-02-24T12:56:14.136Z|00062|vconn|DBG|unix#5: sent (Success): OFPST_TABLE reply (xid=0x2):
>>>    table 0:
>>>      active=1, lookup=0, matched=0
>>>      max_entries=1000000
>>>      matching:
>>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>>
>>>    table 1:
>>>      active=0, lookup=0, matched=0
>>>      (same features)
>>>
>>>    tables 2...253: ditto
>>> 2020-02-24T12:56:14.137Z|00063|vconn|DBG|unix#6: sent (Success): OFPT_HELLO (OF1.5) (xid=0x5):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.137Z|00064|vconn|DBG|unix#6: received: OFPT_HELLO (xid=0x3):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.137Z|00065|vconn|DBG|unix#6: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.137Z|00066|vconn|DBG|unix#6: received: OFPT_FEATURES_REQUEST (xid=0x4):
>>> 2020-02-24T12:56:14.137Z|00067|vconn|DBG|unix#6: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660001
>>> n_tables:254, n_buffers:0
>>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>>   LOCAL(br1): addr:aa:66:aa:66:00:01
>>>       config:     0
>>>       state:      0
>>>       speed: 0 Mbps now, 0 Mbps max
>>> 2020-02-24T12:56:14.137Z|00068|vconn|DBG|unix#7: sent (Success): OFPT_HELLO (OF1.5) (xid=0x6):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.137Z|00069|vconn|DBG|unix#7: received: OFPT_HELLO (xid=0x5):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.137Z|00070|vconn|DBG|unix#7: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.138Z|00071|vconn|DBG|unix#7: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>>> 2020-02-24T12:56:14.138Z|00072|vconn|DBG|unix#7: received: OFPT_BARRIER_REQUEST (xid=0x7):
>>> 2020-02-24T12:56:14.138Z|00073|vconn|DBG|unix#7: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>>> 2020-02-24T12:56:14.138Z|00074|connmgr|INFO|br1<->unix#7: 1 flow_mods in the last 0 s (1 adds)
>>> 2020-02-24T12:56:14.143Z|00075|vconn|DBG|unix#8: sent (Success): OFPT_HELLO (OF1.5) (xid=0x7):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.143Z|00076|vconn|DBG|unix#8: received: OFPT_HELLO (xid=0x1):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.143Z|00077|vconn|DBG|unix#8: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.143Z|00078|vconn|DBG|unix#8: received: OFPST_TABLE request (xid=0x2):
>>> 2020-02-24T12:56:14.143Z|00079|vconn|DBG|unix#8: sent (Success): OFPST_TABLE reply (xid=0x2):
>>>    table 0:
>>>      active=1, lookup=0, matched=0
>>>      max_entries=1000000
>>>      matching:
>>>        exact match or wildcard: in_port eth_{src,dst,type} vlan_{vid,pcp} ip_{src,dst} nw_{proto,tos} tcp_{src,dst}
>>>
>>>    table 1:
>>>      active=0, lookup=0, matched=0
>>>      (same features)
>>>
>>>    tables 2...253: ditto
>>> 2020-02-24T12:56:14.144Z|00080|vconn|DBG|unix#9: sent (Success): OFPT_HELLO (OF1.5) (xid=0x8):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.144Z|00081|vconn|DBG|unix#9: received: OFPT_HELLO (xid=0x3):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.144Z|00082|vconn|DBG|unix#9: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.144Z|00083|vconn|DBG|unix#9: received: OFPT_FEATURES_REQUEST (xid=0x4):
>>> 2020-02-24T12:56:14.144Z|00084|vconn|DBG|unix#9: sent (Success): OFPT_FEATURES_REPLY (xid=0x4): dpid:0000aa66aa660002
>>> n_tables:254, n_buffers:0
>>> capabilities: FLOW_STATS TABLE_STATS PORT_STATS QUEUE_STATS ARP_MATCH_IP
>>> actions: output enqueue set_vlan_vid set_vlan_pcp strip_vlan mod_dl_src mod_dl_dst mod_nw_src mod_nw_dst mod_nw_tos mod_tp_src mod_tp_dst
>>>   LOCAL(br2): addr:aa:66:aa:66:00:02
>>>       config:     0
>>>       state:      0
>>>       speed: 0 Mbps now, 0 Mbps max
>>> 2020-02-24T12:56:14.144Z|00085|vconn|DBG|unix#10: sent (Success): OFPT_HELLO (OF1.5) (xid=0x9):
>>>   version bitmap: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06
>>> 2020-02-24T12:56:14.144Z|00086|vconn|DBG|unix#10: received: OFPT_HELLO (xid=0x5):
>>>   version bitmap: 0x01
>>> 2020-02-24T12:56:14.144Z|00087|vconn|DBG|unix#10: negotiated OpenFlow version 0x01 (we support version 0x06 and earlier, peer supports version 0x01)
>>> 2020-02-24T12:56:14.145Z|00088|vconn|DBG|unix#10: received: OFPT_FLOW_MOD (xid=0x6): ADD actions=NORMAL
>>> 2020-02-24T12:56:14.145Z|00089|vconn|DBG|unix#10: received: OFPT_BARRIER_REQUEST (xid=0x7):
>>> 2020-02-24T12:56:14.145Z|00090|vconn|DBG|unix#10: sent (Success): OFPT_BARRIER_REPLY (xid=0x7):
>>> 2020-02-24T12:56:14.145Z|00091|connmgr|INFO|br2<->unix#10: 1 flow_mods in the last 0 s (1 adds)
>>> 2020-02-24T12:56:14.154Z|00092|bridge|INFO|bridge br0: added interface p1 on port 1
>>> 2020-02-24T12:56:14.154Z|00093|ofproto_dpif|DBG|port p1: STP state changed from disabled to listening
>>> 2020-02-24T12:56:14.164Z|00094|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>>> 2020-02-24T12:56:14.164Z|00095|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>>> 2020-02-24T12:56:14.164Z|00096|bridge|INFO|bridge br0: added interface p2 on port 2
>>> 2020-02-24T12:56:14.164Z|00097|ofproto_dpif|DBG|port p2: STP state changed from disabled to listening
>>> 2020-02-24T12:56:14.174Z|00098|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>>> 2020-02-24T12:56:14.174Z|00099|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>>> 2020-02-24T12:56:14.174Z|00100|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connecting...
>>> 2020-02-24T12:56:14.174Z|00101|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p1.sock: connected
>>> 2020-02-24T12:56:14.174Z|00102|bridge|INFO|bridge br1: added interface p3 on port 3
>>> 2020-02-24T12:56:14.174Z|00103|ofproto_dpif|DBG|port p3: STP state changed from disabled to listening
>>> 2020-02-24T12:56:14.183Z|00104|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>>> 2020-02-24T12:56:14.183Z|00105|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>>> 2020-02-24T12:56:14.184Z|00106|bridge|INFO|bridge br1: added interface p4 on port 4
>>> 2020-02-24T12:56:14.184Z|00107|ofproto_dpif|DBG|port p4: STP state changed from disabled to listening
>>> 2020-02-24T12:56:14.193Z|00108|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>>> 2020-02-24T12:56:14.193Z|00109|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>>> 2020-02-24T12:56:14.193Z|00110|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connecting...
>>> 2020-02-24T12:56:14.193Z|00111|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p4.sock: connected
>>> 2020-02-24T12:56:14.193Z|00112|bridge|INFO|bridge br2: added interface p5 on port 5
>>> 2020-02-24T12:56:14.193Z|00113|ofproto_dpif|DBG|port p5: STP state changed from disabled to listening
>>> 2020-02-24T12:56:14.203Z|00114|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>>> 2020-02-24T12:56:14.203Z|00115|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connection attempt failed (No such file or directory)
>>> 2020-02-24T12:56:14.203Z|00116|bridge|INFO|bridge br2: added interface p6 on port 6
>>> 2020-02-24T12:56:14.203Z|00117|ofproto_dpif|DBG|port p6: STP state changed from disabled to listening
>>> 2020-02-24T12:56:14.208Z|00118|unixctl|DBG|received request netdev-dummy/set-admin-state["up"], id=0
>>> 2020-02-24T12:56:14.208Z|00119|unixctl|DBG|replying with success, id=0: "OK"
>>> 2020-02-24T12:56:14.212Z|00120|unixctl|DBG|received request time/stop[], id=0
>>> 2020-02-24T12:56:14.212Z|00121|unixctl|DBG|replying with success, id=0: ""
>>> 2020-02-24T12:56:14.215Z|00122|unixctl|DBG|received request time/warp["6000","3000"], id=0
>>> 2020-02-24T12:56:14.225Z|00123|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connecting...
>>> 2020-02-24T12:56:14.225Z|00124|reconnect|INFO|unix:/home/dceara/git-repos/ovs/tests/testsuite.dir/2185/p6.sock: connected
>>> 2020-02-24T12:56:14.225Z|00125|unixctl|DBG|replying with success, id=0: "warped"
>>> 2020-02-24T12:56:14.236Z|00126|ofproto_dpif_xlate|DBG|/proc/sys/net/core/netdev_max_backlog: using 1000 max_backlog
>>> 2020-02-24T12:56:14.236Z|00127|unixctl|DBG|received request time/warp["30000","3000"], id=0
>>> 2020-02-24T12:56:14.257Z|00128|memory|INFO|14512 kB peak resident set size after 12.2 seconds
>>> 2020-02-24T12:56:14.257Z|00129|memory|INFO|handlers:5 ports:9 revalidators:3 rules:17
>>> 2020-02-24T12:56:14.268Z|00130|ofproto_dpif|DBG|port p2: STP state changed from listening to learning
>>> 2020-02-24T12:56:14.268Z|00131|ofproto_dpif|DBG|port p1: STP state changed from listening to learning
>>> 2020-02-24T12:56:14.268Z|00132|ofproto_dpif|DBG|port p6: STP state changed from listening to learning
>>> 2020-02-24T12:56:14.268Z|00133|ofproto_dpif|DBG|port p5: STP state changed from listening to learning
>>> 2020-02-24T12:56:14.268Z|00134|ofproto_dpif|DBG|port p3: STP state changed from listening to learning
>>> 2020-02-24T12:56:14.268Z|00135|ofproto_dpif|DBG|port p4: STP state changed from listening to learning
>>> 2020-02-24T12:56:14.280Z|00001|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>>> 2020-02-24T12:56:14.280Z|00002|ovs_rcu(urcu3)|WARN|blocked 3000 ms waiting for main to quiesce
>>> 2020-02-24T12:56:14.323Z|00136|stp|INFO|br0: detected topology change.
>>> 2020-02-24T12:56:14.323Z|00137|stp|INFO|br0: detected topology change.
>>> 2020-02-24T12:56:14.323Z|00138|ofproto_dpif|DBG|port p2: STP state changed from learning to forwarding
>>> 2020-02-24T12:56:14.323Z|00139|ofproto_dpif|DBG|port p1: STP state changed from learning to forwarding
>>> 2020-02-24T12:56:14.323Z|00140|stp|INFO|br2: detected topology change.
>>> 2020-02-24T12:56:14.323Z|00141|stp|INFO|br2: detected topology change.
>>> 2020-02-24T12:56:14.323Z|00142|ofproto_dpif|DBG|port p6: STP state changed from learning to forwarding
>>> 2020-02-24T12:56:14.323Z|00143|ofproto_dpif|DBG|port p5: STP state changed from learning to forwarding
>>> 2020-02-24T12:56:14.323Z|00144|stp|INFO|br1: detected topology change.
>>> 2020-02-24T12:56:14.323Z|00145|ofproto_dpif|DBG|port p3: STP state changed from learning to forwarding
>>> 2020-02-24T12:56:14.323Z|00146|ofproto_dpif|DBG|port p4: STP state changed from learning to forwarding
>>> 2020-02-24T12:56:14.334Z|00147|unixctl|DBG|replying with success, id=0: "warped"
>> 2185. stp.at:467: 2185. STP - flush the fdb and mdb when topology
>> changed (stp.at:467): FAILED (stp.at:529)
>>
>>
>

Patch
diff mbox series

diff --git a/include/openvswitch/poll-loop.h b/include/openvswitch/poll-loop.h
index 532d9caa6..6d0331f6d 100644
--- a/include/openvswitch/poll-loop.h
+++ b/include/openvswitch/poll-loop.h
@@ -41,11 +41,30 @@ 
 #include <windows.h>
 #endif
 
+#ifdef __linux__
+#define OVS_USE_EPOLL
+#endif
+
+#ifdef OVS_USE_EPOLL
+#include <sys/epoll.h>
+
+#define OVS_POLLIN EPOLLIN
+#define OVS_POLLOUT EPOLLOUT
+#define OVS_POLLERR EPOLLERR
+#define OVS_POLLHUP EPOLLHUP
+#define OVS_ONESHOT EPOLLONESHOT
+#define OVS_POLLNVAL 0
+
+#else
+
 #define OVS_POLLIN POLLIN
 #define OVS_POLLOUT POLLOUT
 #define OVS_POLLERR POLLERR
 #define OVS_POLLNVAL POLLNVAL
 #define OVS_POLLHUP POLLHUP
+#define OVS_ONESHOT (1U << 30)
+
+#endif 
 
 #ifdef  __cplusplus
 extern "C" {
@@ -60,10 +79,43 @@  extern "C" {
  * the source code location of the caller.  The function version allows the
  * caller to supply a location explicitly, which is useful if the caller's own
  * caller would be more useful in log output.  See timer_wait_at() for an
- * example. */
-void poll_fd_wait_at(int fd, short int events, const char *where);
+ * example.
+ * Note - using this on fds registered using poll_fd_register() will generate a
+ * warning as this is not an intended use.
+ */
+void poll_fd_wait_at(int fd, int events, const char *where);
 #define poll_fd_wait(fd, events) poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
 
+/* Register a fd with a persistence framework if available so it can be served
+ * "faster" and the caller can be provided with "hints" on what caused the IO
+ * event.
+ * If the "hint" argument is supplied it set to point to the pollfd structure
+ * containing the events passed by the OS in .revents. 
+ * Note - as the frameworks are OS dependent, the events are limited to what
+ * can be passed in a .revents which is a short int.
+ * Limitations - MUST BE registered from the same thread as the one where 
+ * it will be waited upon.
+ */
+
+void poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where);
+#define poll_fd_register(fd, events, hint) poll_fd_register_at(fd, events, hint, OVS_SOURCE_LOCATOR)
+
+/* De-register a fd which was registered as "private" with the persistence
+ * framework
+ */
+
+void poll_fd_deregister_at(int fd, const char *where);
+#define poll_fd_deregister(fd) poll_fd_deregister_at(fd, OVS_SOURCE_LOCATOR)
+
+/* Schedule events to wake up the following poll_block() - "private fds"
+ * Same as poll_fd_wait, but for fds which have been registered and are
+ * expected to persist. If a "fast" OS fd notification framework is used
+ * this version of wait may be a NOOP (e.g. for (E)POLLIN events).
+ */
+void private_poll_fd_wait_at(int fd, int events, const char *where);
+#define private_poll_fd_wait(fd, events) private_poll_fd_wait_at(fd, events, OVS_SOURCE_LOCATOR)
+
+
 #ifdef _WIN32
 void poll_wevent_wait_at(HANDLE wevent, const char *where);
 #define poll_wevent_wait(wevent) poll_wevent_wait_at(wevent, OVS_SOURCE_LOCATOR)
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 5b5c96d72..ad5db9452 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1289,7 +1289,7 @@  dpif_netlink_port_poll_wait(const struct dpif *dpif_)
     const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
 
     if (dpif->port_notifier) {
-        nl_sock_wait(dpif->port_notifier, POLLIN);
+        nl_sock_wait(dpif->port_notifier, OVS_POLLIN);
     } else {
         poll_immediate_wake();
     }
@@ -2756,13 +2756,13 @@  dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
     }
 
     for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
-        nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
+        nl_sock_wait(sock_pool[i].nl_sock, OVS_POLLIN);
     }
 #else
     if (dpif->handlers && handler_id < dpif->n_handlers) {
         struct dpif_handler *handler = &dpif->handlers[handler_id];
 
-        poll_fd_wait(handler->epoll_fd, POLLIN);
+        poll_fd_wait(handler->epoll_fd, OVS_POLLIN);
     }
 #endif
 }
diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
index 97d8d1dab..424636e07 100644
--- a/lib/fatal-signal.c
+++ b/lib/fatal-signal.c
@@ -96,6 +96,7 @@  fatal_signal_init(void)
         ovs_mutex_init_recursive(&mutex);
 #ifndef _WIN32
         xpipe_nonblocking(signal_fds);
+        poll_fd_register(signal_fds[0], OVS_POLLIN, NULL);
 #else
         wevent = CreateEvent(NULL, TRUE, FALSE, NULL);
         if (!wevent) {
@@ -236,9 +237,12 @@  void
 fatal_signal_run(void)
 {
     sig_atomic_t sig_nr;
+    char sigbuffer[_POSIX_PIPE_BUF];
 
     fatal_signal_init();
 
+    ignore(read(signal_fds[0], sigbuffer, sizeof(sigbuffer)));
+
     sig_nr = stored_sig_nr;
     if (sig_nr != SIG_ATOMIC_MAX) {
         char namebuf[SIGNAL_NAME_BUFSIZE];
@@ -271,7 +275,8 @@  fatal_signal_wait(void)
 #ifdef _WIN32
     poll_wevent_wait(wevent);
 #else
-    poll_fd_wait(signal_fds[0], OVS_POLLIN);
+    /* a noop - schedule for removal */
+    private_poll_fd_wait(signal_fds[0], OVS_POLLIN);
 #endif
 }
 
diff --git a/lib/latch-unix.c b/lib/latch-unix.c
index fea61ab28..5f15b59fe 100644
--- a/lib/latch-unix.c
+++ b/lib/latch-unix.c
@@ -83,5 +83,6 @@  latch_is_set(const struct latch *latch)
 void
 latch_wait_at(const struct latch *latch, const char *where)
 {
-    poll_fd_wait_at(latch->fds[0], OVS_POLLIN, where);
+    /* Ask for wait and make it one-shot if persistence is in play */
+    poll_fd_wait_at(latch->fds[0], OVS_POLLIN | OVS_ONESHOT, where);
 }
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index ef367e5ea..482400d8d 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -184,7 +184,7 @@  xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
 
     if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
         pfd.fd = fd;
-        pfd.events = OVS_POLLIN;
+        pfd.events = POLLIN;
 
         ret = poll(&pfd, 1, 0);
         if (OVS_UNLIKELY(ret < 0)) {
diff --git a/lib/poll-loop.c b/lib/poll-loop.c
index 3902d6c1f..10a5b0c01 100644
--- a/lib/poll-loop.c
+++ b/lib/poll-loop.c
@@ -18,6 +18,12 @@ 
 #include "openvswitch/poll-loop.h"
 #include <errno.h>
 #include <inttypes.h>
+#ifdef OVS_USE_EPOLL
+#include <sys/epoll.h>
+#endif
+#ifndef _WIN32
+#include <unistd.h>
+#endif
 #include <poll.h>
 #include <stdlib.h>
 #include <string.h>
@@ -31,7 +37,9 @@ 
 #include "timeval.h"
 #include "openvswitch/vlog.h"
 #include "openvswitch/hmap.h"
+#include "openvswitch/list.h"
 #include "hash.h"
+#include "ovs-atomic.h"
 
 VLOG_DEFINE_THIS_MODULE(poll_loop);
 
@@ -43,21 +51,32 @@  struct poll_node {
     struct pollfd pollfd;       /* Events to pass to time_poll(). */
     HANDLE wevent;              /* Events for WaitForMultipleObjects(). */
     const char *where;          /* Where poll_node was created. */
+    bool valid;                 /* Can it be used? */
+    bool private;               /* Can we assume that it is only in this thread poll loop? */
 };
 
+#define MAX_EPOLL_EVENTS 64
+
 struct poll_loop {
-    /* All active poll waiters. */
+    /* List of all poll loops in the system */
+    struct ovs_mutex loop_mutex;
+    /* All poll waiters for this poll loop */
     struct hmap poll_nodes;
 
     /* Time at which to wake up the next call to poll_block(), LLONG_MIN to
      * wake up immediately, or LLONG_MAX to wait forever. */
     long long int timeout_when; /* In msecs as returned by time_msec(). */
     const char *timeout_where;  /* Where 'timeout_when' was set. */
+#ifdef OVS_USE_EPOLL
+    int epoll_fd;
+    struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+#endif
 };
 
+
 static struct poll_loop *poll_loop(void);
 
-/* Look up the node with same fd or wevent. */
+/* Look up the node with same fd or wevent - should be accessed under &loop->loop_mutex. */
 static struct poll_node *
 find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
 {
@@ -76,79 +95,142 @@  find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
     }
     return NULL;
 }
-
-/* On Unix based systems:
- *
- *     Registers 'fd' as waiting for the specified 'events' (which should be
- *     OVS_POLLIN or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to
- *     poll_block() will wake up when 'fd' becomes ready for one or more of the
- *     requested events. The 'fd's are given to poll() function later.
- *
- * On Windows system:
+/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
+ * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
+ * wake up when 'fd' becomes ready for one or more of the requested events.
  *
- *     If 'fd' is specified, create a new 'wevent'. Association of 'fd' and
- *     'wevent' for 'events' happens in poll_block(). If 'wevent' is specified,
- *     it is assumed that it is unrelated to any sockets and poll_block()
- *     will wake up on any event on that 'wevent'. It is an error to pass
- *     both 'wevent' and 'fd'.
+ * The event registration is PERSISTENT. This is intended for OSes which have a persistent
+ * event framework. For now it is implemented only for epoll and Linux, other
+ * implementations such as BSD kqueue and Solaris /dev/poll may follow.
  *
- * The event registration is one-shot: only the following call to
- * poll_block() is affected.  The event will need to be re-registered after
- * poll_block() is called if it is to persist.
+ * If the OS has no persistent event framework, this does nothing.
  *
  * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
  * automatically provide the caller's source file and line number for
  * 'where'.) */
+
 static void
-poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
+poll_fd_subscribe_at(int fd, HANDLE wevent, int events, struct pollfd **hint, const char *where, bool private)
 {
     struct poll_loop *loop = poll_loop();
     struct poll_node *node;
+#ifdef OVS_USE_EPOLL
+    struct epoll_event event;
+#endif
 
-    COVERAGE_INC(poll_create_node);
-
-    /* Both 'fd' and 'wevent' cannot be set. */
     ovs_assert(!fd != !wevent);
 
+    /* This is mostly uncontended, so the thread should grab it straight away.
+     * We will reuse it later to introduce threading for IO and SSL
+     */
+    ovs_mutex_lock(&loop->loop_mutex);
+
     /* Check for duplicate.  If found, "or" the events. */
     node = find_poll_node(loop, fd, wevent);
-    if (node) {
-        node->pollfd.events |= events;
-    } else {
-        node = xzalloc(sizeof *node);
-        hmap_insert(&loop->poll_nodes, &node->hmap_node,
-                    hash_2words(fd, (uint32_t)wevent));
-        node->pollfd.fd = fd;
-        node->pollfd.events = events;
-#ifdef _WIN32
-        if (!wevent) {
-            wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
+
+    if (node && node->valid) {
+#ifdef OVS_USE_EPOLL
+        int old_event_mask = node->pollfd.events;
+#endif
+        /* If there is an existing event mask we do not need to inc - this will be waited upon */
+        node->pollfd.events |= (events & 0x0000FFFF); /* or without epoll specific bits */
+
+#ifdef OVS_USE_EPOLL
+        /* modify existing epoll entry if there is an epoll specific ask or if the
+         * mask has changed
+         */
+        if ((events & 0xFFFF0000) || (old_event_mask != node->pollfd.events)) {
+            event.events = node->pollfd.events | events | EPOLLHUP | EPOLLRDHUP;
+            event.data.ptr = node;
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
         }
 #endif
+    } else {
+        if (!node) {
+            node = xzalloc(sizeof *node);
+            hmap_insert(&loop->poll_nodes, &node->hmap_node,
+                        hash_2words(fd, 0));
+        } else {
+            /* node marked for reaping, OS has reused the fd number, valid is set to false */
+#ifdef OVS_USE_EPOLL
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, fd, NULL);
+#endif
+        }
+        node->pollfd.fd = fd;
+        node->pollfd.events = (events & 0x0000FFFF);
         node->wevent = wevent;
         node->where = where;
+        node->valid = true;
+        node->private = private;
+#ifdef OVS_USE_EPOLL
+        event.events = node->pollfd.events | EPOLLHUP | EPOLLRDHUP; /* we always listen for fd close */
+        event.data.ptr = node;
+        epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
+#endif
+    }
+    if (hint) {
+        *hint = &node->pollfd;
     }
+    ovs_mutex_unlock(&loop->loop_mutex);
+}
+
+void
+poll_fd_register_at(int fd, int events, struct pollfd **hint, const char *where) {
+    poll_fd_subscribe_at(fd, 0, events, hint, where , true);
+}
+
+/* Deregisters a fd. Note - this looks like a memory leak (deallocating only private fds)
+ * but it is not.
+ * In order to be compatible with existing calling conventions while using fd persistence
+ * where supported we have to keep "legacy" fds around for the duration of the life of
+ * the thread because we have no idea if they have been reaped properly or not.
+ * The reason for this is that for some of them the close() is in a thread different from the
+ * poll loop.
+ * Thus, the only thing we can do in this case is mark them "invalid". Once the OS reuses the
+ * same fd number, we will reuse the existing hash entry.
+ */
+
+void
+poll_fd_deregister_at(int fd, const char *where) {
+    struct poll_loop *loop = poll_loop();
+
+    VLOG(VLL_DBG, "Deregister %d from %s", fd, where);
+    struct poll_node *node;
+
+    ovs_mutex_lock(&loop->loop_mutex);
+    node = find_poll_node(loop, fd, 0);
+    if (node) {
+        if (node->private) {
+#ifdef OVS_USE_EPOLL
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
+#endif
+            hmap_remove(&loop->poll_nodes, &node->hmap_node);
+        } else {
+            VLOG(VLL_WARN, "Trying to deregister a non-private %d from %s", fd, where);
+            node->valid = false;
+        }
+    }
+    ovs_mutex_unlock(&loop->loop_mutex);
+}
+
+void
+poll_fd_wait_at(int fd, int events, const char *where)
+{
+    poll_fd_subscribe_at(fd, 0, events, NULL, where, false);
 }
 
-/* Registers 'fd' as waiting for the specified 'events' (which should be OVS_POLLIN
- * or OVS_POLLOUT or OVS_POLLIN | OVS_POLLOUT).  The following call to poll_block() will
- * wake up when 'fd' becomes ready for one or more of the requested events.
- *
- * On Windows, 'fd' must be a socket.
- *
- * The event registration is one-shot: only the following call to poll_block()
- * is affected.  The event will need to be re-registered after poll_block() is
- * called if it is to persist.
- *
- * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
- * automatically provide the caller's source file and line number for
- * 'where'.) */
 void
-poll_fd_wait_at(int fd, short int events, const char *where)
+private_poll_fd_wait_at(int fd, int events, const char *where)
 {
-    poll_create_node(fd, 0, events, where);
+    /* POLLIN persists on "private" fds - either emulated or at epoll
+     * or other persistence framework level
+     */
+    if (events & (~OVS_POLLIN)) {
+        poll_fd_subscribe_at(fd, 0, events, NULL, where, true);
+    }
 }
 
+
 #ifdef _WIN32
 /* Registers for the next call to poll_block() to wake up when 'wevent' is
  * signaled.
@@ -163,7 +245,7 @@  poll_fd_wait_at(int fd, short int events, const char *where)
 void
 poll_wevent_wait_at(HANDLE wevent, const char *where)
 {
-    poll_create_node(0, wevent, 0, where);
+    poll_fd_subscribe_at(0, wevent, 0, NULL, where);
 }
 #endif /* _WIN32 */
 
@@ -277,9 +359,12 @@  log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
         if (pollfd->revents & OVS_POLLHUP) {
             ds_put_cstr(&s, "[OVS_POLLHUP]");
         }
+#ifndef OVS_USE_EPOLL
+        /* epoll does not have NVAL - it uses RDHUP and HUP which we cannot actually get to here*/
         if (pollfd->revents & OVS_POLLNVAL) {
             ds_put_cstr(&s, "[OVS_POLLNVAL]");
         }
+#endif
         ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
         free(description);
     } else {
@@ -295,12 +380,17 @@  log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
     ds_destroy(&s);
 }
 
+
 static void
 free_poll_nodes(struct poll_loop *loop)
 {
     struct poll_node *node, *next;
 
+    ovs_mutex_lock(&loop->loop_mutex);
     HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
+#ifdef OVS_USE_EPOLL
+        epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
+#endif
         hmap_remove(&loop->poll_nodes, &node->hmap_node);
 #ifdef _WIN32
         if (node->wevent && node->pollfd.fd) {
@@ -310,6 +400,7 @@  free_poll_nodes(struct poll_loop *loop)
 #endif
         free(node);
     }
+    ovs_mutex_unlock(&loop->loop_mutex);
 }
 
 /* Blocks until one or more of the events registered with poll_fd_wait()
@@ -320,8 +411,13 @@  poll_block(void)
 {
     struct poll_loop *loop = poll_loop();
     struct poll_node *node;
+#ifndef OVS_USE_EPOLL
     struct pollfd *pollfds;
+#endif
+#ifndef OVS_USE_EPOLL
     HANDLE *wevents = NULL;
+    int counter;
+#endif
     int elapsed;
     int retval;
     int i;
@@ -335,54 +431,126 @@  poll_block(void)
     }
 
     timewarp_run();
-    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
 
+#ifdef OVS_USE_EPOLL
+    retval = time_epoll_wait(loop->epoll_fd,
+        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);
+    if (retval < 0) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+        VLOG_ERR_RL(&rl, "epoll: %s", ovs_strerror(-retval));
+    } else if (!retval) {
+        log_wakeup(loop->timeout_where, NULL, elapsed);
+    } else {
+        ovs_mutex_lock(&loop->loop_mutex);
+        if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
+            for (i = 0; i < retval; i++) {
+                node = (struct poll_node *) loop->epoll_events[i].data.ptr;
+                if (loop->epoll_events[i].events) {
+                    node->pollfd.revents = loop->epoll_events[i].events;
+                    log_wakeup(node->where, &node->pollfd, 0);
+                }
+            }
+        }
+        for (i = 0; i < retval; i++) {
+            node = (struct poll_node *) loop->epoll_events[i].data.ptr;
+            if (loop->epoll_events[i].events & EPOLLHUP) {
+                /* File descriptor closed already elsewhere
+                 * We have to make the assumption that whoever closed it has
+                 * ensured that anything which refers to IO event hints will not run
+                 * on this fd after we free it.
+                 */
+                node->valid = false;
+            }
+            if (loop->epoll_events[i].events) {
+                node->pollfd.revents |= (loop->epoll_events[i].events & 0x0000FFFF);
+            }
+            if (loop->epoll_events[i].events & OVS_POLLOUT) {
+                struct epoll_event event;
+                node->pollfd.events = OVS_POLLIN; /* reset back to defaults - write needs one shot */
+                event.events = node->pollfd.events;
+                event.data.ptr = node;
+                epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, node->pollfd.fd, &event);
+            }
+        }
+        ovs_mutex_unlock(&loop->loop_mutex);
+    }
+#else
+    pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
 #ifdef _WIN32
     wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
 #endif
 
+
     /* Populate with all the fds and events. */
-    i = 0;
+    counter = 0;
     HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
-        pollfds[i] = node->pollfd;
+        if ((node->valid) && (node->pollfd.events)) {
+            pollfds[counter] = node->pollfd;
 #ifdef _WIN32
-        wevents[i] = node->wevent;
-        if (node->pollfd.fd && node->wevent) {
-            short int wsa_events = 0;
-            if (node->pollfd.events & OVS_POLLIN) {
-                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
+            wevents[counter] = node->wevent;
+            if (node->pollfd.fd && node->wevent) {
+                short int wsa_events = 0;
+                if (node->pollfd.events & OVS_POLLIN) {
+                    wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
+                }
+                if (node->pollfd.events & OVS_POLLOUT) {
+                    wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
+                }
+                WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
             }
-            if (node->pollfd.events & OVS_POLLOUT) {
-                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
-            }
-            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
-        }
 #endif
-        i++;
+            counter++;
+        }
     }
 
-    retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
+    retval = time_poll(pollfds, counter, wevents,
                        loop->timeout_when, &elapsed);
     if (retval < 0) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
         VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
-    } else if (!retval) {
+    } else if (retval == 0) {
         log_wakeup(loop->timeout_where, NULL, elapsed);
-    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
-        i = 0;
-        HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
+    } else {
+        for (i = 0; i < counter; i++) {
             if (pollfds[i].revents) {
-                log_wakeup(node->where, &pollfds[i], 0);
+
+                node = find_poll_node(loop, pollfds[i].fd, 0);
+
+                if (!node) {
+                    VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);
+                }
+                if (pollfds[i].revents & (OVS_POLLHUP | OVS_POLLNVAL)) {
+                    node->valid = false;
+                }
+
+                if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
+                    log_wakeup(node->where, &pollfds[i], 0);
+                }
+                /* update "requested" events. 
+                 * Note - "private" fds always want POLLIN - that emulates EPOLL, /dev/poll, etc
+                 * behaviour which they should be using in real life instead of using poll()
+                 */
+                if (node->private) {
+                    node->pollfd.events &= ~(pollfds[i].revents & (~OVS_POLLIN));
+                } else {
+                    node->pollfd.events &= ~pollfds[i].revents;
+                }
+                /* update "occurred" events for use by streams and handlers. In case there
+                 * is an existing (but not consumed yet) event, we OR the events in the
+                 * stored record with the new ones - it is the job of the stream to clear
+                 * that.
+                 */
+                node->pollfd.revents |= pollfds[i].revents;
             }
-            i++;
         }
     }
 
-    free_poll_nodes(loop);
+    free(pollfds);
+    if (wevents)
+        free(wevents);
+#endif
     loop->timeout_when = LLONG_MAX;
     loop->timeout_where = NULL;
-    free(pollfds);
-    free(wevents);
 
     /* Handle any pending signals before doing anything else. */
     fatal_signal_run();
@@ -416,8 +584,12 @@  poll_loop(void)
     if (!loop) {
         loop = xzalloc(sizeof *loop);
         loop->timeout_when = LLONG_MAX;
+        ovs_mutex_init(&loop->loop_mutex);
         hmap_init(&loop->poll_nodes);
         xpthread_setspecific(key, loop);
+#ifdef OVS_USE_EPOLL
+        loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS);
+#endif
     }
     return loop;
 }
diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
index 3dfa80c7f..16d155989 100644
--- a/lib/route-table-bsd.c
+++ b/lib/route-table-bsd.c
@@ -34,6 +34,7 @@ 
 #include "ovs-router.h"
 #include "packets.h"
 #include "openvswitch/vlog.h"
+#include "openvswitch/poll-loop.h"
 #include "util.h"
 
 VLOG_DEFINE_THIS_MODULE(route_table_bsd);
diff --git a/lib/stream-fd.c b/lib/stream-fd.c
index 62f768d45..6a80d6e05 100644
--- a/lib/stream-fd.c
+++ b/lib/stream-fd.c
@@ -40,6 +40,8 @@  struct stream_fd
     struct stream stream;
     int fd;
     int fd_type;
+    bool rx_ready, tx_ready;
+    struct pollfd *hint;
 };
 
 static const struct stream_class stream_fd_class;
@@ -67,7 +69,14 @@  new_fd_stream(char *name, int fd, int connect_status, int fd_type,
     stream_init(&s->stream, &stream_fd_class, connect_status, name);
     s->fd = fd;
     s->fd_type = fd_type;
+    s->rx_ready = true;
+    s->tx_ready = true;
+    s->hint = NULL;
     *streamp = &s->stream;
+    /* Persistent registration - we always get POLLINs from now on,
+     * POLLOUTs when we ask for them
+     */
+    poll_fd_register(s->fd, OVS_POLLIN, &s->hint);
     return 0;
 }
 
@@ -82,6 +91,8 @@  static void
 fd_close(struct stream *stream)
 {
     struct stream_fd *s = stream_fd_cast(stream);
+    /* Deregister the FD from any persistent registrations if supported */
+    poll_fd_deregister(s->fd);
     closesocket(s->fd);
     free(s);
 }
@@ -104,6 +115,24 @@  fd_recv(struct stream *stream, void *buffer, size_t n)
     ssize_t retval;
     int error;
 
+    if (s->hint) {
+        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
+         * to the read which should return 0 if the HUP is a real one, if not we clear it
+         * for all other cases we believe what (e)poll has fed us.
+         */
+        if ((!(s->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (!s->rx_ready)) {
+            if (!(s->hint->revents & OVS_POLLIN)) {
+                return -EAGAIN;
+            } else {
+                /* POLLIN event from poll loop, mark us as ready */
+                s->rx_ready = true;
+                s->hint->revents &= ~OVS_POLLIN;
+            }
+        } else {
+            s->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
+        }
+    }
+
     retval = recv(s->fd, buffer, n, 0);
     if (retval < 0) {
         error = sock_errno();
@@ -114,6 +143,8 @@  fd_recv(struct stream *stream, void *buffer, size_t n)
 #endif
         if (error != EAGAIN) {
             VLOG_DBG_RL(&rl, "recv: %s", sock_strerror(error));
+        } else {
+            s->rx_ready = false;
         }
         return -error;
     }
@@ -127,9 +158,29 @@  fd_send(struct stream *stream, const void *buffer, size_t n)
     ssize_t retval;
     int error;
 
+    if (s->hint) {
+        /* poll-loop is providing us with hints for IO */
+        if (!s->tx_ready) {
+            if (!(s->hint->revents & OVS_POLLOUT)) {
+                return -EAGAIN;
+            } else {
+                /* POLLOUT event from poll loop, mark us as ready */
+                s->tx_ready = true;
+                s->hint->revents &= ~OVS_POLLOUT;
+            }
+        }
+    }
     retval = send(s->fd, buffer, n, 0);
     if (retval < 0) {
         error = sock_errno();
+#ifdef __linux__
+        /* Linux will sometimes return ENOBUFS on sockets instead of EAGAIN. Usually seen
+         *  on unix domain sockets 
+         */
+        if (error == ENOBUFS) {
+           error = EAGAIN;
+        }
+#endif
 #ifdef _WIN32
         if (error == WSAEWOULDBLOCK) {
            error = EAGAIN;
@@ -137,6 +188,8 @@  fd_send(struct stream *stream, const void *buffer, size_t n)
 #endif
         if (error != EAGAIN) {
             VLOG_DBG_RL(&rl, "send: %s", sock_strerror(error));
+        } else {
+            s->tx_ready = false;
         }
         return -error;
     }
@@ -150,11 +203,11 @@  fd_wait(struct stream *stream, enum stream_wait_type wait)
     switch (wait) {
     case STREAM_CONNECT:
     case STREAM_SEND:
-        poll_fd_wait(s->fd, OVS_POLLOUT);
+        private_poll_fd_wait(s->fd, OVS_POLLOUT);
         break;
 
     case STREAM_RECV:
-        poll_fd_wait(s->fd, OVS_POLLIN);
+        private_poll_fd_wait(s->fd, OVS_POLLIN);
         break;
 
     default:
@@ -223,6 +276,8 @@  new_fd_pstream(char *name, int fd,
     ps->accept_cb = accept_cb;
     ps->unlink_path = unlink_path;
     *pstreamp = &ps->pstream;
+    /* persistent registration */
+    poll_fd_register(ps->fd, OVS_POLLIN, NULL);
     return 0;
 }
 
@@ -230,6 +285,7 @@  static void
 pfd_close(struct pstream *pstream)
 {
     struct fd_pstream *ps = fd_pstream_cast(pstream);
+    poll_fd_deregister(ps->fd);
     closesocket(ps->fd);
     maybe_unlink_and_free(ps->unlink_path);
     free(ps);
@@ -271,7 +327,7 @@  static void
 pfd_wait(struct pstream *pstream)
 {
     struct fd_pstream *ps = fd_pstream_cast(pstream);
-    poll_fd_wait(ps->fd, OVS_POLLIN);
+    private_poll_fd_wait(ps->fd, OVS_POLLIN);
 }
 
 static const struct pstream_class fd_pstream_class = {
diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c
index 3b7f9865e..53ae51c1b 100644
--- a/lib/stream-ssl.c
+++ b/lib/stream-ssl.c
@@ -147,6 +147,7 @@  struct ssl_stream
     /* A few bytes of header data in case SSL negotiation fails. */
     uint8_t head[2];
     short int n_head;
+    struct pollfd *hint;
 };
 
 /* SSL context created by ssl_init(). */
@@ -310,6 +311,8 @@  new_ssl_stream(char *name, char *server_name, int fd, enum session_type type,
         SSL_set_msg_callback_arg(ssl, sslv);
     }
 
+
+    poll_fd_register(sslv->fd, OVS_POLLIN, &sslv->hint);
     *streamp = &sslv->stream;
     free(server_name);
     return 0;
@@ -604,6 +607,7 @@  ssl_close(struct stream *stream)
     ERR_clear_error();
 
     SSL_free(sslv->ssl);
+    poll_fd_deregister(sslv->fd);
     closesocket(sslv->fd);
     free(sslv);
 }
@@ -697,6 +701,27 @@  ssl_recv(struct stream *stream, void *buffer, size_t n)
     /* Behavior of zero-byte SSL_read is poorly defined. */
     ovs_assert(n > 0);
 
+     if (sslv->hint) {
+        /* poll-loop is providing us with hints for IO. If we got a HUP/NVAL we skip straight
+         * to the read which should return 0 if the HUP is a real one, if not we clear it
+         * for all other cases we believe what (e)poll has fed us.
+         */
+        if ((!(sslv->hint->revents & (OVS_POLLHUP|OVS_POLLNVAL))) && (sslv->rx_want == SSL_READING)) {
+            if (!(sslv->hint->revents & OVS_POLLIN)) {
+                return -EAGAIN;
+            } else {
+                /* POLLIN event from poll loop, mark us as ready 
+                 * rx_want is cleared further down by reading ssl fsm
+                 */
+                sslv->hint->revents &= ~OVS_POLLIN;
+            }
+        } else {
+            sslv->hint->revents &= ~(OVS_POLLHUP|OVS_POLLNVAL);
+        }
+    }
+
+
+
     old_state = SSL_get_state(sslv->ssl);
     ret = SSL_read(sslv->ssl, buffer, n);
     if (old_state != SSL_get_state(sslv->ssl)) {
@@ -729,6 +754,19 @@  ssl_do_tx(struct stream *stream)
 {
     struct ssl_stream *sslv = ssl_stream_cast(stream);
 
+     if (sslv->hint) {
+        /* poll-loop is providing us with hints for IO */
+        if (sslv->tx_want == SSL_WRITING) {
+            if (!(sslv->hint->revents & OVS_POLLOUT)) {
+                return EAGAIN;
+            } else {
+                /* POLLOUT event from poll loop, mark us as ready
+                 * tx_want is cleared further down by the SSL write fsm
+                 */
+                sslv->hint->revents &= ~OVS_POLLOUT;
+            }
+        }
+    }
     for (;;) {
         int old_state = SSL_get_state(sslv->ssl);
         int ret = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size);
@@ -771,6 +809,8 @@  ssl_send(struct stream *stream, const void *buffer, size_t n)
             ssl_clear_txbuf(sslv);
             return n;
         case EAGAIN:
+            /* we want to know when this fd will become available again */
+            stream_send_wait(stream);
             return n;
         default:
             ssl_clear_txbuf(sslv);
@@ -795,7 +835,7 @@  ssl_run_wait(struct stream *stream)
     struct ssl_stream *sslv = ssl_stream_cast(stream);
 
     if (sslv->tx_want != SSL_NOTHING) {
-        poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
+        private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
     }
 }
 
@@ -811,13 +851,13 @@  ssl_wait(struct stream *stream, enum stream_wait_type wait)
         } else {
             switch (sslv->state) {
             case STATE_TCP_CONNECTING:
-                poll_fd_wait(sslv->fd, OVS_POLLOUT);
+                private_poll_fd_wait(sslv->fd, OVS_POLLOUT);
                 break;
 
             case STATE_SSL_CONNECTING:
                 /* ssl_connect() called SSL_accept() or SSL_connect(), which
                  * set up the status that we test here. */
-                poll_fd_wait(sslv->fd,
+                private_poll_fd_wait(sslv->fd,
                                want_to_poll_events(SSL_want(sslv->ssl)));
                 break;
 
@@ -829,7 +869,7 @@  ssl_wait(struct stream *stream, enum stream_wait_type wait)
 
     case STREAM_RECV:
         if (sslv->rx_want != SSL_NOTHING) {
-            poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
+            private_poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
         } else {
             poll_immediate_wake();
         }
@@ -911,6 +951,7 @@  pssl_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
                  ds_steal_cstr(&bound_name));
     pstream_set_bound_port(&pssl->pstream, htons(port));
     pssl->fd = fd;
+    poll_fd_register(fd, OVS_POLLIN, NULL);
     *pstreamp = &pssl->pstream;
 
     return 0;
@@ -920,6 +961,7 @@  static void
 pssl_close(struct pstream *pstream)
 {
     struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
+    poll_fd_deregister(pssl->fd);
     closesocket(pssl->fd);
     free(pssl);
 }
diff --git a/lib/timeval.c b/lib/timeval.c
index 193c7bab1..59a12414f 100644
--- a/lib/timeval.c
+++ b/lib/timeval.c
@@ -38,6 +38,7 @@ 
 #include "unixctl.h"
 #include "util.h"
 #include "openvswitch/vlog.h"
+#include "openvswitch/poll-loop.h"
 
 VLOG_DEFINE_THIS_MODULE(timeval);
 
@@ -369,6 +370,88 @@  time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED,
     return retval;
 }
 
+#ifdef OVS_USE_EPOLL
+
+/* Like epoll_wait(), except:
+ *
+ *      - The timeout is specified as an absolute time, as defined by
+ *        time_msec(), instead of a duration.
+ *
+ *      - On error, returns a negative error code (instead of setting errno).
+ *
+ *      - If interrupted by a signal, retries automatically until the original
+ *        timeout is reached.  (Because of this property, this function will
+ *        never return -EINTR.)
+ *
+ * Stores the number of milliseconds elapsed during poll in '*elapsed'. */
+int
+time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
+          long long int timeout_when, int *elapsed)
+{
+    long long int *last_wakeup = last_wakeup_get();
+    long long int start;
+    bool quiescent;
+    int retval = 0;
+
+    time_init();
+    coverage_clear();
+    coverage_run();
+    if (*last_wakeup && !thread_is_pmd()) {
+        log_poll_interval(*last_wakeup);
+    }
+    start = time_msec();
+
+    timeout_when = MIN(timeout_when, deadline);
+    quiescent = ovsrcu_is_quiescent();
+
+    for (;;) {
+        long long int now = time_msec();
+        int time_left;
+
+        if (now >= timeout_when) {
+            time_left = 0;
+        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
+            time_left = INT_MAX;
+        } else {
+            time_left = timeout_when - now;
+        }
+
+        if (!quiescent) {
+            if (!time_left) {
+                ovsrcu_quiesce();
+            } else {
+                ovsrcu_quiesce_start();
+            }
+        }
+
+        retval = epoll_wait(epoll_fd, events, max, time_left);
+        if (retval < 0) {
+            retval = -errno;
+        }
+
+        if (!quiescent && time_left) {
+            ovsrcu_quiesce_end();
+        }
+
+        if (deadline <= time_msec()) {
+            fatal_signal_handler(SIGALRM);
+            if (retval < 0) {
+                retval = 0;
+            }
+            break;
+        }
+
+        if (retval != -EINTR) {
+            break;
+        }
+    }
+    *last_wakeup = time_msec();
+    refresh_rusage();
+    *elapsed = *last_wakeup - start;
+    return retval;
+}
+#endif
+
 long long int
 timespec_to_msec(const struct timespec *ts)
 {
diff --git a/lib/timeval.h b/lib/timeval.h
index 502f703d4..347a09d63 100644
--- a/lib/timeval.h
+++ b/lib/timeval.h
@@ -20,6 +20,9 @@ 
 #include <time.h>
 #include "openvswitch/type-props.h"
 #include "util.h"
+#ifdef __linux__
+#include <sys/epoll.h>
+#endif
 
 #ifdef  __cplusplus
 extern "C" {
@@ -61,6 +64,10 @@  void time_wall_timespec(struct timespec *);
 void time_alarm(unsigned int secs);
 int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
               long long int timeout_when, int *elapsed);
+#ifdef __linux__
+int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
+          long long int timeout_when, int *elapsed);
+#endif
 
 long long int timespec_to_msec(const struct timespec *);
 long long int timespec_to_usec(const struct timespec *);