Message ID | 20200908172748.10761-1-mark.d.gray@redhat.com |
---|---|
State | Changes Requested |
Headers | show |
Series | [ovs-dev,v7] dpif-netlink: distribute polling to discrete handlers | expand |
Hi Mark, Found one issue, and then since another spin is needed I am suggesting a stylish improvement. On Tue, Sep 08, 2020 at 06:27:48PM +0100, Mark Gray wrote: > From: Aaron Conole <aconole@redhat.com> > > Currently, the channel handlers are polled globally. On some > systems, this causes a thundering herd issue where multiple > handler threads become active, only to do no work and immediately > sleep. > > The approach here is to push the netlink socket channels to discreet > handler threads to process, rather than polling on every thread. > This will eliminate the need to wake multiple threads. > > To check: > > ip netns add left > ip netns add right > ip link add center-left type veth peer name left0 netns left > ip link add center-right type veth peer name right0 netns right > ip link set center-left up > ip link set center-right up > ip -n left ip link set left0 up > ip -n left ip addr add 172.31.110.10/24 dev left0 > ip -n right ip link set right0 up > ip -n right ip addr add 172.31.110.11/24 dev right0 > > ovs-vsctl add-br br0 > ovs-vsctl add-port br0 center-right > ovs-vsctl add-port br0 center-left > > # in one terminal > perf record -e sched:sched_wakeup,irq:softirq_entry -ag > > # in a separate terminal > ip netns exec left arping -I left0 -c 1 172.31.110.11 > > # in the perf terminal after exiting > perf script > > Look for the number of 'handler' threads which were made active. > > Suggested-by: Ben Pfaff <blp@ovn.org> > Co-authored-by: Mark Gray <mark.d.gray@redhat.com> > Reported-by: David Ahern <dsahern@gmail.com> > Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html > Cc: Matteo Croce <technoboy85@gmail.com> > Cc: Flavio Leitner <fbl@sysclose.org> > Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets") > Signed-off-by: Aaron Conole <aconole@redhat.com> > Signed-off-by: Mark Gray <mark.d.gray@redhat.com> > --- > > > v2: Oops - forgot to commit my whitespace cleanups. 
> v3: one port latency results as per Matteo's comments > > Stock: > min/avg/max/mdev = 21.5/36.5/96.5/1.0 us > With Patch: > min/avg/max/mdev = 5.3/9.7/98.4/0.5 us > v4: Oops - first email did not send > v5: min/avg/max/mdev = 9.3/10.4/33.6/2.2 us > v6: Split out the AUTHORS patch and added a cover letter as > per Ilya's suggestion. > Fixed 0-day issues. > v7: Merged patches as per Flavio's suggestion. This is > no longer a series. Fixed some other small issues. > > lib/dpif-netlink.c | 325 +++++++++++++++++++++++++-------------------- > lib/id-pool.c | 10 ++ > lib/id-pool.h | 1 + > 3 files changed, 195 insertions(+), 141 deletions(-) > > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c > index 7da4fb54d..4ecf0c51c 100644 > --- a/lib/dpif-netlink.c > +++ b/lib/dpif-netlink.c > @@ -37,6 +37,7 @@ > #include "dpif-provider.h" > #include "fat-rwlock.h" > #include "flow.h" > +#include "id-pool.h" > #include "netdev-linux.h" > #include "netdev-offload.h" > #include "netdev-provider.h" > @@ -160,6 +161,9 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *, > > /* One of the dpif channels between the kernel and userspace. */ > struct dpif_channel { > + struct hmap_node hmap_node; > + struct dpif_handler *handler; > + uint32_t port_idx; > struct nl_sock *sock; /* Netlink socket. */ > long long int last_poll; /* Last time this channel was polled. */ > }; > @@ -183,6 +187,8 @@ struct dpif_handler { > int n_events; /* Num events returned by epoll_wait(). */ > int event_offset; /* Offset into 'epoll_events'. */ > > + struct hmap channels; /* map of channels for each port. */ > + > #ifdef _WIN32 > /* Pool of sockets. */ > struct dpif_windows_vport_sock *vport_sock_pool; > @@ -201,9 +207,8 @@ struct dpif_netlink { > struct fat_rwlock upcall_lock; > struct dpif_handler *handlers; > uint32_t n_handlers; /* Num of upcall handlers. */ > - struct dpif_channel *channels; /* Array of channels for each port. 
*/ > - int uc_array_size; /* Size of 'handler->channels' and */ > - /* 'handler->epoll_events'. */ > + > + struct id_pool *port_ids; /* Set of added port ids */ > > /* Change notification. */ > struct nl_sock *port_notifier; /* vport multicast group subscriber. */ > @@ -287,6 +292,44 @@ close_nl_sock(struct nl_sock *sock) > #endif > } > > +static size_t > +dpif_handler_channels_count(const struct dpif_handler *handler) > +{ > + return hmap_count(&handler->channels); > +} > + > +static struct dpif_channel * > +dpif_handler_channel_find(struct dpif_handler *handler, uint32_t idx) > +{ > + struct dpif_channel *channel; > + > + HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0), > + &handler->channels) { > + if (channel->port_idx == idx) { > + return channel; > + } > + } > + > + return NULL; > +} > + > +static void > +dpif_channel_del(struct dpif_netlink *dpif, > + struct dpif_channel *channel) > +{ > + struct dpif_handler *handler = channel->handler; > + > +#ifndef _WIN32 > + epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, > + nl_sock_fd(channel->sock), NULL); > + nl_sock_destroy(channel->sock); > +#endif > + hmap_remove(&handler->channels, &channel->hmap_node); > + id_pool_free_id(dpif->port_ids, channel->port_idx); > + handler->event_offset = handler->n_events = 0; > + free(channel); > +} > + > static struct dpif_netlink * > dpif_netlink_cast(const struct dpif *dpif) > { > @@ -385,6 +428,7 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp) > > dpif->dp_ifindex = dp->dp_ifindex; > dpif->user_features = dp->user_features; > + dpif->port_ids = id_pool_create(0, MAX_PORTS); > *dpifp = &dpif->dpif; > > return 0; > @@ -452,161 +496,145 @@ static bool > vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx, > uint32_t *upcall_pid) > { > - /* Since the nl_sock can only be assigned in either all > - * or none "dpif" channels, the following check > - * would suffice. 
*/ > - if (!dpif->channels[port_idx].sock) { > + size_t i; > + > + ovs_assert(!WINDOWS || dpif->n_handlers <= 1); > + > + if (!id_pool_has_id(dpif->port_ids, port_idx)) { > return false; > } > - ovs_assert(!WINDOWS || dpif->n_handlers <= 1); > > - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock); > + /* The 'port_idx' should only be valid for a single handler. */ > + for (i = 0; i < dpif->n_handlers; i++) { > + struct dpif_channel *channel; > + > + channel = dpif_handler_channel_find(&dpif->handlers[i], port_idx); > + if (channel) { > + *upcall_pid = nl_sock_pid(channel->sock); > + return true; > + } > + } > > - return true; > + return false; > } > > static int > vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no, > struct nl_sock *sock) > { > - struct epoll_event event; > uint32_t port_idx = odp_to_u32(port_no); > + struct dpif_channel *channel; > + struct dpif_handler *handler; > + struct epoll_event event; > size_t i; > - int error; > > - if (dpif->handlers == NULL) { > + if (dpif->handlers == NULL || > + id_pool_has_id(dpif->port_ids, port_idx)) { > close_nl_sock(sock); > - return 0; > + return EINVAL; > } > > - /* We assume that the datapath densely chooses port numbers, which can > - * therefore be used as an index into 'channels' and 'epoll_events' of > - * 'dpif'. 
*/ > - if (port_idx >= dpif->uc_array_size) { > - uint32_t new_size = port_idx + 1; > - > - if (new_size > MAX_PORTS) { > - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", > - dpif_name(&dpif->dpif), port_no); > - return EFBIG; > - } > + if (port_idx >= MAX_PORTS) { > + VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", > + dpif_name(&dpif->dpif), port_no); > + return EFBIG; > + } > > - dpif->channels = xrealloc(dpif->channels, > - new_size * sizeof *dpif->channels); > + handler = &dpif->handlers[0]; > > - for (i = dpif->uc_array_size; i < new_size; i++) { > - dpif->channels[i].sock = NULL; > + for (i = 0; i < dpif->n_handlers; i++) { > + if (dpif_handler_channels_count(&dpif->handlers[i]) < > + dpif_handler_channels_count(handler)) { > + handler = &dpif->handlers[i]; > } > + } > > - for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > + channel = xmalloc(sizeof *channel); > + channel->port_idx = port_idx; > + channel->sock = sock; > + channel->last_poll = LLONG_MIN; > + channel->handler = handler; > + hmap_insert(&handler->channels, &channel->hmap_node, > + hash_int(port_idx, 0)); > > - handler->epoll_events = xrealloc(handler->epoll_events, > - new_size * sizeof *handler->epoll_events); > - > - } > - dpif->uc_array_size = new_size; > - } > + handler->epoll_events = xrealloc(handler->epoll_events, > + dpif_handler_channels_count(handler) * > + sizeof *handler->epoll_events); > > memset(&event, 0, sizeof event); > event.events = EPOLLIN | EPOLLEXCLUSIVE; > event.data.u32 = port_idx; > > - for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > - > #ifndef _WIN32 > - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), > - &event) < 0) { > - error = errno; > - goto error; > - } > -#endif > - } > - dpif->channels[port_idx].sock = sock; > - dpif->channels[port_idx].last_poll = LLONG_MIN; > - > - return 0; > - > -error: > -#ifndef _WIN32 > - 
while (i--) { > - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL, > - nl_sock_fd(sock), NULL); > + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), > + &event) < 0) { > + hmap_remove(&handler->channels, &channel->hmap_node); > + free(channel); > + id_pool_free_id(dpif->port_ids, port_idx); > + return errno; > } > #endif > - dpif->channels[port_idx].sock = NULL; > > - return error; > + id_pool_add(dpif->port_ids, port_idx); > + return 0; > } > > -static void > -vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) > +static bool > +vport_del_channel(struct dpif_netlink *dpif, odp_port_t port_no) > { > - uint32_t port_idx = odp_to_u32(port_no); > + struct dpif_channel *channel = NULL; > + bool found_channel = false; > size_t i; > > - if (!dpif->handlers || port_idx >= dpif->uc_array_size > - || !dpif->channels[port_idx].sock) { > - return; > - } > - > for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > -#ifndef _WIN32 > - epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, > - nl_sock_fd(dpif->channels[port_idx].sock), NULL); > -#endif > - handler->event_offset = handler->n_events = 0; > + channel = dpif_handler_channel_find(&dpif->handlers[i], > + odp_to_u32(port_no)); > + if (channel) { > + dpif_channel_del(dpif, channel); > + found_channel = true; > + break; > + } My suggestion is to move channel declaration there to inside the loop without initialization and take out the found_channel like it is done at vport_get_pid(): for (i = 0; i < dpif->n_handlers; i++) { struct dpif_channel *channel; channel = dpif_handler_channel_find(&dpif->handlers[i], odp_to_u32(port_no)); if (channel) { dpif_channel_del(dpif, channel); return true; } } return false; > } > -#ifndef _WIN32 > - nl_sock_destroy(dpif->channels[port_idx].sock); > -#endif > - dpif->channels[port_idx].sock = NULL; > + > + return found_channel; > } > > static void > destroy_all_channels(struct dpif_netlink *dpif) > 
OVS_REQ_WRLOCK(dpif->upcall_lock) > { [...] > @@ -2475,25 +2520,15 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) > } > } > > - if (port_no < keep_channels_nbits) { > - bitmap_set1(keep_channels, port_no); > - } > continue; > > error: > - vport_del_channels(dpif, vport.port_no); > + vport_del_channel(dpif, port_no); The port_no here is uint32_t but vport_del_channel() receives odp_port_t. That breaks build: https://travis-ci.org/github/fleitner/ovs/jobs/725370855 Thanks!
On 09/09/2020 14:49, Flavio Leitner wrote: > > > Hi Mark, > > Found one issue, and then since another spin is needed I am > suggesting a stylish improvement. > > > On Tue, Sep 08, 2020 at 06:27:48PM +0100, Mark Gray wrote: >> From: Aaron Conole <aconole@redhat.com> >> >> Currently, the channel handlers are polled globally. On some >> systems, this causes a thundering herd issue where multiple >> handler threads become active, only to do no work and immediately >> sleep. >> >> The approach here is to push the netlink socket channels to discreet >> handler threads to process, rather than polling on every thread. >> This will eliminate the need to wake multiple threads. >> >> To check: >> >> ip netns add left >> ip netns add right >> ip link add center-left type veth peer name left0 netns left >> ip link add center-right type veth peer name right0 netns right >> ip link set center-left up >> ip link set center-right up >> ip -n left ip link set left0 up >> ip -n left ip addr add 172.31.110.10/24 dev left0 >> ip -n right ip link set right0 up >> ip -n right ip addr add 172.31.110.11/24 dev right0 >> >> ovs-vsctl add-br br0 >> ovs-vsctl add-port br0 center-right >> ovs-vsctl add-port br0 center-left >> >> # in one terminal >> perf record -e sched:sched_wakeup,irq:softirq_entry -ag >> >> # in a separate terminal >> ip netns exec left arping -I left0 -c 1 172.31.110.11 >> >> # in the perf terminal after exiting >> perf script >> >> Look for the number of 'handler' threads which were made active. 
>> >> Suggested-by: Ben Pfaff <blp@ovn.org> >> Co-authored-by: Mark Gray <mark.d.gray@redhat.com> >> Reported-by: David Ahern <dsahern@gmail.com> >> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html >> Cc: Matteo Croce <technoboy85@gmail.com> >> Cc: Flavio Leitner <fbl@sysclose.org> >> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets") >> Signed-off-by: Aaron Conole <aconole@redhat.com> >> Signed-off-by: Mark Gray <mark.d.gray@redhat.com> >> --- >> >> >> v2: Oops - forgot to commit my whitespace cleanups. >> v3: one port latency results as per Matteo's comments >> >> Stock: >> min/avg/max/mdev = 21.5/36.5/96.5/1.0 us >> With Patch: >> min/avg/max/mdev = 5.3/9.7/98.4/0.5 us >> v4: Oops - first email did not send >> v5: min/avg/max/mdev = 9.3/10.4/33.6/2.2 us >> v6: Split out the AUTHORS patch and added a cover letter as >> per Ilya's suggestion. >> Fixed 0-day issues. >> v7: Merged patches as per Flavio's suggestion. This is >> no longer a series. Fixed some other small issues. >> >> lib/dpif-netlink.c | 325 +++++++++++++++++++++++++-------------------- >> lib/id-pool.c | 10 ++ >> lib/id-pool.h | 1 + >> 3 files changed, 195 insertions(+), 141 deletions(-) >> >> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c >> index 7da4fb54d..4ecf0c51c 100644 >> --- a/lib/dpif-netlink.c >> +++ b/lib/dpif-netlink.c >> @@ -37,6 +37,7 @@ >> #include "dpif-provider.h" >> #include "fat-rwlock.h" >> #include "flow.h" >> +#include "id-pool.h" >> #include "netdev-linux.h" >> #include "netdev-offload.h" >> #include "netdev-provider.h" >> @@ -160,6 +161,9 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *, >> >> /* One of the dpif channels between the kernel and userspace. */ >> struct dpif_channel { >> + struct hmap_node hmap_node; >> + struct dpif_handler *handler; >> + uint32_t port_idx; >> struct nl_sock *sock; /* Netlink socket. */ >> long long int last_poll; /* Last time this channel was polled. 
*/ >> }; >> @@ -183,6 +187,8 @@ struct dpif_handler { >> int n_events; /* Num events returned by epoll_wait(). */ >> int event_offset; /* Offset into 'epoll_events'. */ >> >> + struct hmap channels; /* map of channels for each port. */ >> + >> #ifdef _WIN32 >> /* Pool of sockets. */ >> struct dpif_windows_vport_sock *vport_sock_pool; >> @@ -201,9 +207,8 @@ struct dpif_netlink { >> struct fat_rwlock upcall_lock; >> struct dpif_handler *handlers; >> uint32_t n_handlers; /* Num of upcall handlers. */ >> - struct dpif_channel *channels; /* Array of channels for each port. */ >> - int uc_array_size; /* Size of 'handler->channels' and */ >> - /* 'handler->epoll_events'. */ >> + >> + struct id_pool *port_ids; /* Set of added port ids */ >> >> /* Change notification. */ >> struct nl_sock *port_notifier; /* vport multicast group subscriber. */ >> @@ -287,6 +292,44 @@ close_nl_sock(struct nl_sock *sock) >> #endif >> } >> >> +static size_t >> +dpif_handler_channels_count(const struct dpif_handler *handler) >> +{ >> + return hmap_count(&handler->channels); >> +} >> + >> +static struct dpif_channel * >> +dpif_handler_channel_find(struct dpif_handler *handler, uint32_t idx) >> +{ >> + struct dpif_channel *channel; >> + >> + HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0), >> + &handler->channels) { >> + if (channel->port_idx == idx) { >> + return channel; >> + } >> + } >> + >> + return NULL; >> +} >> + >> +static void >> +dpif_channel_del(struct dpif_netlink *dpif, >> + struct dpif_channel *channel) >> +{ >> + struct dpif_handler *handler = channel->handler; >> + >> +#ifndef _WIN32 >> + epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, >> + nl_sock_fd(channel->sock), NULL); >> + nl_sock_destroy(channel->sock); >> +#endif >> + hmap_remove(&handler->channels, &channel->hmap_node); >> + id_pool_free_id(dpif->port_ids, channel->port_idx); >> + handler->event_offset = handler->n_events = 0; >> + free(channel); >> +} >> + >> static struct dpif_netlink * >> 
dpif_netlink_cast(const struct dpif *dpif) >> { >> @@ -385,6 +428,7 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp) >> >> dpif->dp_ifindex = dp->dp_ifindex; >> dpif->user_features = dp->user_features; >> + dpif->port_ids = id_pool_create(0, MAX_PORTS); >> *dpifp = &dpif->dpif; >> >> return 0; >> @@ -452,161 +496,145 @@ static bool >> vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx, >> uint32_t *upcall_pid) >> { >> - /* Since the nl_sock can only be assigned in either all >> - * or none "dpif" channels, the following check >> - * would suffice. */ >> - if (!dpif->channels[port_idx].sock) { >> + size_t i; >> + >> + ovs_assert(!WINDOWS || dpif->n_handlers <= 1); >> + >> + if (!id_pool_has_id(dpif->port_ids, port_idx)) { >> return false; >> } >> - ovs_assert(!WINDOWS || dpif->n_handlers <= 1); >> >> - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock); >> + /* The 'port_idx' should only be valid for a single handler. */ >> + for (i = 0; i < dpif->n_handlers; i++) { >> + struct dpif_channel *channel; >> + >> + channel = dpif_handler_channel_find(&dpif->handlers[i], port_idx); >> + if (channel) { >> + *upcall_pid = nl_sock_pid(channel->sock); >> + return true; >> + } >> + } >> >> - return true; >> + return false; >> } >> >> static int >> vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no, >> struct nl_sock *sock) >> { >> - struct epoll_event event; >> uint32_t port_idx = odp_to_u32(port_no); >> + struct dpif_channel *channel; >> + struct dpif_handler *handler; >> + struct epoll_event event; >> size_t i; >> - int error; >> >> - if (dpif->handlers == NULL) { >> + if (dpif->handlers == NULL || >> + id_pool_has_id(dpif->port_ids, port_idx)) { >> close_nl_sock(sock); >> - return 0; >> + return EINVAL; >> } >> >> - /* We assume that the datapath densely chooses port numbers, which can >> - * therefore be used as an index into 'channels' and 'epoll_events' of >> - * 'dpif'. 
*/ >> - if (port_idx >= dpif->uc_array_size) { >> - uint32_t new_size = port_idx + 1; >> - >> - if (new_size > MAX_PORTS) { >> - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", >> - dpif_name(&dpif->dpif), port_no); >> - return EFBIG; >> - } >> + if (port_idx >= MAX_PORTS) { >> + VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", >> + dpif_name(&dpif->dpif), port_no); >> + return EFBIG; >> + } >> >> - dpif->channels = xrealloc(dpif->channels, >> - new_size * sizeof *dpif->channels); >> + handler = &dpif->handlers[0]; >> >> - for (i = dpif->uc_array_size; i < new_size; i++) { >> - dpif->channels[i].sock = NULL; >> + for (i = 0; i < dpif->n_handlers; i++) { >> + if (dpif_handler_channels_count(&dpif->handlers[i]) < >> + dpif_handler_channels_count(handler)) { >> + handler = &dpif->handlers[i]; >> } >> + } >> >> - for (i = 0; i < dpif->n_handlers; i++) { >> - struct dpif_handler *handler = &dpif->handlers[i]; >> + channel = xmalloc(sizeof *channel); >> + channel->port_idx = port_idx; >> + channel->sock = sock; >> + channel->last_poll = LLONG_MIN; >> + channel->handler = handler; >> + hmap_insert(&handler->channels, &channel->hmap_node, >> + hash_int(port_idx, 0)); >> >> - handler->epoll_events = xrealloc(handler->epoll_events, >> - new_size * sizeof *handler->epoll_events); >> - >> - } >> - dpif->uc_array_size = new_size; >> - } >> + handler->epoll_events = xrealloc(handler->epoll_events, >> + dpif_handler_channels_count(handler) * >> + sizeof *handler->epoll_events); >> >> memset(&event, 0, sizeof event); >> event.events = EPOLLIN | EPOLLEXCLUSIVE; >> event.data.u32 = port_idx; >> >> - for (i = 0; i < dpif->n_handlers; i++) { >> - struct dpif_handler *handler = &dpif->handlers[i]; >> - >> #ifndef _WIN32 >> - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), >> - &event) < 0) { >> - error = errno; >> - goto error; >> - } >> -#endif >> - } >> - dpif->channels[port_idx].sock = sock; >> - dpif->channels[port_idx].last_poll = 
LLONG_MIN; >> - >> - return 0; >> - >> -error: >> -#ifndef _WIN32 >> - while (i--) { >> - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL, >> - nl_sock_fd(sock), NULL); >> + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), >> + &event) < 0) { >> + hmap_remove(&handler->channels, &channel->hmap_node); >> + free(channel); >> + id_pool_free_id(dpif->port_ids, port_idx); >> + return errno; >> } >> #endif >> - dpif->channels[port_idx].sock = NULL; >> >> - return error; >> + id_pool_add(dpif->port_ids, port_idx); >> + return 0; >> } >> >> -static void >> -vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) >> +static bool >> +vport_del_channel(struct dpif_netlink *dpif, odp_port_t port_no) >> { >> - uint32_t port_idx = odp_to_u32(port_no); >> + struct dpif_channel *channel = NULL; >> + bool found_channel = false; >> size_t i; >> >> - if (!dpif->handlers || port_idx >= dpif->uc_array_size >> - || !dpif->channels[port_idx].sock) { >> - return; >> - } >> - >> for (i = 0; i < dpif->n_handlers; i++) { >> - struct dpif_handler *handler = &dpif->handlers[i]; >> -#ifndef _WIN32 >> - epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, >> - nl_sock_fd(dpif->channels[port_idx].sock), NULL); >> -#endif >> - handler->event_offset = handler->n_events = 0; >> + channel = dpif_handler_channel_find(&dpif->handlers[i], >> + odp_to_u32(port_no)); >> + if (channel) { >> + dpif_channel_del(dpif, channel); >> + found_channel = true; >> + break; >> + } > > My suggestion is to move channel declaration there to inside the loop > without initialization and take out the found_channel like it is > done at vport_get_pid(): > > for (i = 0; i < dpif->n_handlers; i++) { > struct dpif_channel *channel; > > channel = dpif_handler_channel_find(&dpif->handlers[i], > odp_to_u32(port_no)); > if (channel) { > dpif_channel_del(dpif, channel); > return true; > } > } > > return false; Makes sense, I dont like returning successfully mid-function but probably makes it a bit easier to 
read. > > >> } >> -#ifndef _WIN32 >> - nl_sock_destroy(dpif->channels[port_idx].sock); >> -#endif >> - dpif->channels[port_idx].sock = NULL; >> + >> + return found_channel; >> } >> >> static void >> destroy_all_channels(struct dpif_netlink *dpif) >> OVS_REQ_WRLOCK(dpif->upcall_lock) >> { > [...] >> @@ -2475,25 +2520,15 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) >> } >> } >> >> - if (port_no < keep_channels_nbits) { >> - bitmap_set1(keep_channels, port_no); >> - } >> continue; >> >> error: >> - vport_del_channels(dpif, vport.port_no); >> + vport_del_channel(dpif, port_no); > > The port_no here is uint32_t but vport_del_channel() receives odp_port_t. > That breaks build: > https://travis-ci.org/github/fleitner/ovs/jobs/725370855 I see you set this up on your own github account. Good idea. > > Thanks! >
On 09/09/2020 14:49, Flavio Leitner wrote: > >> error: >> - vport_del_channels(dpif, vport.port_no); >> + vport_del_channel(dpif, port_no); > > The port_no here is uint32_t but vport_del_channel() receives odp_port_t. > That breaks build: > https://travis-ci.org/github/fleitner/ovs/jobs/725370855 > I think I'll rename this variable to port_idx. The convention in the file is port_no is usually of type odp_port_t. This contributed to me missing it. > Thanks! >
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 7da4fb54d..4ecf0c51c 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -37,6 +37,7 @@ #include "dpif-provider.h" #include "fat-rwlock.h" #include "flow.h" +#include "id-pool.h" #include "netdev-linux.h" #include "netdev-offload.h" #include "netdev-provider.h" @@ -160,6 +161,9 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *, /* One of the dpif channels between the kernel and userspace. */ struct dpif_channel { + struct hmap_node hmap_node; + struct dpif_handler *handler; + uint32_t port_idx; struct nl_sock *sock; /* Netlink socket. */ long long int last_poll; /* Last time this channel was polled. */ }; @@ -183,6 +187,8 @@ struct dpif_handler { int n_events; /* Num events returned by epoll_wait(). */ int event_offset; /* Offset into 'epoll_events'. */ + struct hmap channels; /* map of channels for each port. */ + #ifdef _WIN32 /* Pool of sockets. */ struct dpif_windows_vport_sock *vport_sock_pool; @@ -201,9 +207,8 @@ struct dpif_netlink { struct fat_rwlock upcall_lock; struct dpif_handler *handlers; uint32_t n_handlers; /* Num of upcall handlers. */ - struct dpif_channel *channels; /* Array of channels for each port. */ - int uc_array_size; /* Size of 'handler->channels' and */ - /* 'handler->epoll_events'. */ + + struct id_pool *port_ids; /* Set of added port ids */ /* Change notification. */ struct nl_sock *port_notifier; /* vport multicast group subscriber. 
*/ @@ -287,6 +292,44 @@ close_nl_sock(struct nl_sock *sock) #endif } +static size_t +dpif_handler_channels_count(const struct dpif_handler *handler) +{ + return hmap_count(&handler->channels); +} + +static struct dpif_channel * +dpif_handler_channel_find(struct dpif_handler *handler, uint32_t idx) +{ + struct dpif_channel *channel; + + HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0), + &handler->channels) { + if (channel->port_idx == idx) { + return channel; + } + } + + return NULL; +} + +static void +dpif_channel_del(struct dpif_netlink *dpif, + struct dpif_channel *channel) +{ + struct dpif_handler *handler = channel->handler; + +#ifndef _WIN32 + epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, + nl_sock_fd(channel->sock), NULL); + nl_sock_destroy(channel->sock); +#endif + hmap_remove(&handler->channels, &channel->hmap_node); + id_pool_free_id(dpif->port_ids, channel->port_idx); + handler->event_offset = handler->n_events = 0; + free(channel); +} + static struct dpif_netlink * dpif_netlink_cast(const struct dpif *dpif) { @@ -385,6 +428,7 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp) dpif->dp_ifindex = dp->dp_ifindex; dpif->user_features = dp->user_features; + dpif->port_ids = id_pool_create(0, MAX_PORTS); *dpifp = &dpif->dpif; return 0; @@ -452,161 +496,145 @@ static bool vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx, uint32_t *upcall_pid) { - /* Since the nl_sock can only be assigned in either all - * or none "dpif" channels, the following check - * would suffice. */ - if (!dpif->channels[port_idx].sock) { + size_t i; + + ovs_assert(!WINDOWS || dpif->n_handlers <= 1); + + if (!id_pool_has_id(dpif->port_ids, port_idx)) { return false; } - ovs_assert(!WINDOWS || dpif->n_handlers <= 1); - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock); + /* The 'port_idx' should only be valid for a single handler. 
*/ + for (i = 0; i < dpif->n_handlers; i++) { + struct dpif_channel *channel; + + channel = dpif_handler_channel_find(&dpif->handlers[i], port_idx); + if (channel) { + *upcall_pid = nl_sock_pid(channel->sock); + return true; + } + } - return true; + return false; } static int vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no, struct nl_sock *sock) { - struct epoll_event event; uint32_t port_idx = odp_to_u32(port_no); + struct dpif_channel *channel; + struct dpif_handler *handler; + struct epoll_event event; size_t i; - int error; - if (dpif->handlers == NULL) { + if (dpif->handlers == NULL || + id_pool_has_id(dpif->port_ids, port_idx)) { close_nl_sock(sock); - return 0; + return EINVAL; } - /* We assume that the datapath densely chooses port numbers, which can - * therefore be used as an index into 'channels' and 'epoll_events' of - * 'dpif'. */ - if (port_idx >= dpif->uc_array_size) { - uint32_t new_size = port_idx + 1; - - if (new_size > MAX_PORTS) { - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", - dpif_name(&dpif->dpif), port_no); - return EFBIG; - } + if (port_idx >= MAX_PORTS) { + VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", + dpif_name(&dpif->dpif), port_no); + return EFBIG; + } - dpif->channels = xrealloc(dpif->channels, - new_size * sizeof *dpif->channels); + handler = &dpif->handlers[0]; - for (i = dpif->uc_array_size; i < new_size; i++) { - dpif->channels[i].sock = NULL; + for (i = 0; i < dpif->n_handlers; i++) { + if (dpif_handler_channels_count(&dpif->handlers[i]) < + dpif_handler_channels_count(handler)) { + handler = &dpif->handlers[i]; } + } - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; + channel = xmalloc(sizeof *channel); + channel->port_idx = port_idx; + channel->sock = sock; + channel->last_poll = LLONG_MIN; + channel->handler = handler; + hmap_insert(&handler->channels, &channel->hmap_node, + hash_int(port_idx, 0)); - handler->epoll_events = 
xrealloc(handler->epoll_events, - new_size * sizeof *handler->epoll_events); - - } - dpif->uc_array_size = new_size; - } + handler->epoll_events = xrealloc(handler->epoll_events, + dpif_handler_channels_count(handler) * + sizeof *handler->epoll_events); memset(&event, 0, sizeof event); event.events = EPOLLIN | EPOLLEXCLUSIVE; event.data.u32 = port_idx; - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; - #ifndef _WIN32 - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), - &event) < 0) { - error = errno; - goto error; - } -#endif - } - dpif->channels[port_idx].sock = sock; - dpif->channels[port_idx].last_poll = LLONG_MIN; - - return 0; - -error: -#ifndef _WIN32 - while (i--) { - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL, - nl_sock_fd(sock), NULL); + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), + &event) < 0) { + hmap_remove(&handler->channels, &channel->hmap_node); + free(channel); + id_pool_free_id(dpif->port_ids, port_idx); + return errno; } #endif - dpif->channels[port_idx].sock = NULL; - return error; + id_pool_add(dpif->port_ids, port_idx); + return 0; } -static void -vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) +static bool +vport_del_channel(struct dpif_netlink *dpif, odp_port_t port_no) { - uint32_t port_idx = odp_to_u32(port_no); + struct dpif_channel *channel = NULL; + bool found_channel = false; size_t i; - if (!dpif->handlers || port_idx >= dpif->uc_array_size - || !dpif->channels[port_idx].sock) { - return; - } - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; -#ifndef _WIN32 - epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, - nl_sock_fd(dpif->channels[port_idx].sock), NULL); -#endif - handler->event_offset = handler->n_events = 0; + channel = dpif_handler_channel_find(&dpif->handlers[i], + odp_to_u32(port_no)); + if (channel) { + dpif_channel_del(dpif, channel); + found_channel = true; + break; + } } 
-#ifndef _WIN32 - nl_sock_destroy(dpif->channels[port_idx].sock); -#endif - dpif->channels[port_idx].sock = NULL; + + return found_channel; } static void destroy_all_channels(struct dpif_netlink *dpif) OVS_REQ_WRLOCK(dpif->upcall_lock) { - unsigned int i; + size_t i; if (!dpif->handlers) { return; } - for (i = 0; i < dpif->uc_array_size; i++ ) { - struct dpif_netlink_vport vport_request; - uint32_t upcall_pids = 0; - - if (!dpif->channels[i].sock) { - continue; - } - - /* Turn off upcalls. */ - dpif_netlink_vport_init(&vport_request); - vport_request.cmd = OVS_VPORT_CMD_SET; - vport_request.dp_ifindex = dpif->dp_ifindex; - vport_request.port_no = u32_to_odp(i); - vport_request.n_upcall_pids = 1; - vport_request.upcall_pids = &upcall_pids; - dpif_netlink_vport_transact(&vport_request, NULL, NULL); - - vport_del_channels(dpif, u32_to_odp(i)); - } - for (i = 0; i < dpif->n_handlers; i++) { struct dpif_handler *handler = &dpif->handlers[i]; + struct dpif_channel *channel, *next; + + HMAP_FOR_EACH_SAFE (channel, next, hmap_node, &handler->channels) { + struct dpif_netlink_vport vport_request; + uint32_t upcall_pids = 0; + /* Turn off upcalls. 
*/ + dpif_netlink_vport_init(&vport_request); + vport_request.cmd = OVS_VPORT_CMD_SET; + vport_request.dp_ifindex = dpif->dp_ifindex; + vport_request.port_no = u32_to_odp(channel->port_idx); + vport_request.n_upcall_pids = 1; + vport_request.upcall_pids = &upcall_pids; + dpif_netlink_vport_transact(&vport_request, NULL, NULL); + ovs_assert(channel != NULL); + dpif_channel_del(dpif, channel); + } dpif_netlink_handler_uninit(handler); free(handler->epoll_events); } - free(dpif->channels); + free(dpif->handlers); dpif->handlers = NULL; - dpif->channels = NULL; dpif->n_handlers = 0; - dpif->uc_array_size = 0; } static void @@ -618,9 +646,11 @@ dpif_netlink_close(struct dpif *dpif_) fat_rwlock_wrlock(&dpif->upcall_lock); destroy_all_channels(dpif); + id_pool_destroy(dpif->port_ids); fat_rwlock_unlock(&dpif->upcall_lock); fat_rwlock_destroy(&dpif->upcall_lock); + free(dpif); } @@ -1003,7 +1033,9 @@ dpif_netlink_port_del__(struct dpif_netlink *dpif, odp_port_t port_no) #endif error = dpif_netlink_vport_transact(&vport, NULL, NULL); - vport_del_channels(dpif, port_no); + if (!vport_del_channel(dpif, port_no)) { + return EINVAL; + } if (!error && !ovs_tunnels_out_of_tree) { error = dpif_netlink_rtnl_port_destroy(dpif_port.name, dpif_port.type); @@ -1084,23 +1116,39 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif, odp_port_t port_no) OVS_REQ_RDLOCK(dpif->upcall_lock) { + struct dpif_channel *min_channel = NULL, *found_channel = NULL; uint32_t port_idx = odp_to_u32(port_no); uint32_t pid = 0; + size_t i; + + /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s + * channel, since it is not heavily loaded. */ + uint32_t idx = port_no == ODPP_NONE ? 0 : port_idx; + + if (!id_pool_has_id(dpif->port_ids, port_idx)) { + return 0; + } - if (dpif->handlers && dpif->uc_array_size > 0) { - /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s - * channel, since it is not heavily loaded. */ - uint32_t idx = port_idx >= dpif->uc_array_size ? 
0 : port_idx; - - /* Needs to check in case the socket pointer is changed in between - * the holding of upcall_lock. A known case happens when the main - * thread deletes the vport while the handler thread is handling - * the upcall from that port. */ - if (dpif->channels[idx].sock) { - pid = nl_sock_pid(dpif->channels[idx].sock); + if (dpif->handlers) { + for (i = 0; i < dpif->n_handlers; i++) { + if (dpif_handler_channel_find(&dpif->handlers[i], idx)) { + found_channel = dpif_handler_channel_find(&dpif->handlers[i], + idx); + break; + } + + if (dpif_handler_channel_find(&dpif->handlers[i], 0)) { + min_channel = dpif_handler_channel_find(&dpif->handlers[i], 0); + } } } + if (found_channel) { + pid = nl_sock_pid(found_channel->sock); + } else if (min_channel) { + pid = nl_sock_pid(min_channel->sock); + } + return pid; } @@ -2342,12 +2390,14 @@ dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops, static void dpif_netlink_handler_uninit(struct dpif_handler *handler) { + hmap_destroy(&handler->channels); vport_delete_sock_pool(handler); } static int dpif_netlink_handler_init(struct dpif_handler *handler) { + hmap_init(&handler->channels); return vport_create_sock_pool(handler); } #else @@ -2355,6 +2405,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler) static int dpif_netlink_handler_init(struct dpif_handler *handler) { + hmap_init(&handler->channels); handler->epoll_fd = epoll_create(10); return handler->epoll_fd < 0 ? 
errno : 0; } @@ -2362,6 +2413,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler) static void dpif_netlink_handler_uninit(struct dpif_handler *handler) { + hmap_destroy(&handler->channels); close(handler->epoll_fd); } #endif @@ -2374,9 +2426,7 @@ static int dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) OVS_REQ_WRLOCK(dpif->upcall_lock) { - unsigned long int *keep_channels; struct dpif_netlink_vport vport; - size_t keep_channels_nbits; struct nl_dump dump; uint64_t reply_stub[NL_DUMP_BUFSIZE / 8]; struct ofpbuf buf; @@ -2412,13 +2462,9 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) for (i = 0; i < n_handlers; i++) { struct dpif_handler *handler = &dpif->handlers[i]; - handler->event_offset = handler->n_events = 0; } - keep_channels_nbits = dpif->uc_array_size; - keep_channels = bitmap_allocate(keep_channels_nbits); - ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub); dpif_netlink_port_dump_start__(dpif, &dump); while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) { @@ -2426,8 +2472,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) uint32_t upcall_pid; int error; - if (port_no >= dpif->uc_array_size - || !vport_get_pid(dpif, port_no, &upcall_pid)) { + if (!vport_get_pid(dpif, port_no, &upcall_pid)) { struct nl_sock *sock; error = create_nl_sock(dpif, &sock); @@ -2475,25 +2520,15 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) } } - if (port_no < keep_channels_nbits) { - bitmap_set1(keep_channels, port_no); - } continue; error: - vport_del_channels(dpif, vport.port_no); + vport_del_channel(dpif, port_no); } + nl_dump_done(&dump); ofpbuf_uninit(&buf); - /* Discard any saved channels that we didn't reuse. 
*/ - for (i = 0; i < keep_channels_nbits; i++) { - if (!bitmap_is_set(keep_channels, i)) { - vport_del_channels(dpif, u32_to_odp(i)); - } - } - free(keep_channels); - return retval; } @@ -2714,6 +2749,8 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, OVS_REQ_RDLOCK(dpif->upcall_lock) { struct dpif_handler *handler; + size_t n_channels; + int read_tries = 0; if (!dpif->handlers || handler_id >= dpif->n_handlers) { @@ -2721,14 +2758,15 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, } handler = &dpif->handlers[handler_id]; - if (handler->event_offset >= handler->n_events) { + n_channels = dpif_handler_channels_count(handler); + if (handler->event_offset >= handler->n_events && n_channels) { int retval; handler->event_offset = handler->n_events = 0; do { retval = epoll_wait(handler->epoll_fd, handler->epoll_events, - dpif->uc_array_size, 0); + n_channels, 0); } while (retval < 0 && errno == EINTR); if (retval < 0) { @@ -2741,7 +2779,10 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, while (handler->event_offset < handler->n_events) { int idx = handler->epoll_events[handler->event_offset].data.u32; - struct dpif_channel *ch = &dpif->channels[idx]; + struct dpif_channel *ch = dpif_handler_channel_find(handler, idx); + if (!ch) { + return EAGAIN; + } handler->event_offset++; @@ -2845,12 +2886,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif) if (dpif->handlers) { size_t i; - if (!dpif->channels[0].sock) { - return; - } - for (i = 0; i < dpif->uc_array_size; i++ ) { + for (i = 0; i < dpif->n_handlers; i++) { + struct dpif_handler *handler = &dpif->handlers[i]; + struct dpif_channel *channel; + + HMAP_FOR_EACH (channel, hmap_node, &handler->channels) { + nl_sock_drain(channel->sock); + } - nl_sock_drain(dpif->channels[i].sock); } } } diff --git a/lib/id-pool.c b/lib/id-pool.c index 69910ad08..bef822f6b 100644 --- a/lib/id-pool.c +++ b/lib/id-pool.c @@ -93,6 +93,16 @@ id_pool_find(struct id_pool 
*pool, uint32_t id) return NULL; } +bool +id_pool_has_id(struct id_pool *pool, uint32_t id) +{ + if (!id_pool_find(pool, id)) { + return false; + } + + return true; +} + void id_pool_add(struct id_pool *pool, uint32_t id) { diff --git a/lib/id-pool.h b/lib/id-pool.h index 8721f8793..62876e2a5 100644 --- a/lib/id-pool.h +++ b/lib/id-pool.h @@ -29,6 +29,7 @@ void id_pool_destroy(struct id_pool *); bool id_pool_alloc_id(struct id_pool *, uint32_t *id); void id_pool_free_id(struct id_pool *, uint32_t id); void id_pool_add(struct id_pool *, uint32_t id); +bool id_pool_has_id(struct id_pool *, uint32_t id); /* * ID pool.