Message ID | a4de1a1ad6c5e70fcc32db354fb8dbdd13b8cbd2.1618236421.git.grive@u256.net |
---|---|
State | Changes Requested |
Headers | show |
Series | dpif-netdev: Parallel offload processing | expand |
On 4/12/21 5:20 PM, Gaetan Rivet wrote: > Similar to what was done for the PMD threads [1], reduce the performance > impact of quiescing too often in the offload thread. > > After each processed offload, the offload thread currently quiesce and > will sync with RCU. This synchronization can be lengthy and make the > thread unnecessary slow. > > Instead attempt to quiesce every 10 ms at most. While the queue is > empty, the offload thread remains quiescent. > > [1]: 81ac8b3b194c ("dpif-netdev: Do RCU synchronization at fixed interval > in PMD main loop.") > > Signed-off-by: Gaetan Rivet <grive@u256.net> > Reviewed-by: Eli Britstein <elibr@nvidia.com> > --- > lib/dpif-netdev.c | 18 ++++++++++++++++-- > 1 file changed, 16 insertions(+), 2 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index d458bcb12..4e403a9e5 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -2740,15 +2740,20 @@ err_free: > return -1; > } > > +#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */ > + > static void * > dp_netdev_flow_offload_main(void *data OVS_UNUSED) > { > struct dp_offload_thread_item *offload; > struct ovs_list *list; > long long int latency_us; > + long long int next_rcu; > + long long int now; > const char *op; > int ret; > > + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; > for (;;) { > ovs_mutex_lock(&dp_offload_thread.mutex); > if (ovs_list_is_empty(&dp_offload_thread.list)) { > @@ -2756,6 +2761,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) > ovs_mutex_cond_wait(&dp_offload_thread.cond, > &dp_offload_thread.mutex); > ovsrcu_quiesce_end(); > + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; > } > list = ovs_list_pop_front(&dp_offload_thread.list); > dp_offload_thread.enqueued_item--; > @@ -2779,7 +2785,9 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) > OVS_NOT_REACHED(); > } > > - latency_us = time_usec() - offload->timestamp; > + now = time_usec(); > + > + latency_us = now 
- offload->timestamp; > mov_avg_cma_update(&dp_offload_thread.cma, latency_us); > mov_avg_ema_update(&dp_offload_thread.ema, latency_us); > > @@ -2787,7 +2795,13 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) > ret == 0 ? "succeed" : "failed", op, > UUID_ARGS((struct uuid *) &offload->flow->mega_ufid)); > dp_netdev_free_flow_offload(offload); > - ovsrcu_quiesce(); > + > + /* Do RCU synchronization at fixed interval. */ > + if (now > next_rcu) { > + if (!ovsrcu_try_quiesce()) { > + next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; Commit 81ac8b3b194c does it a bit differently, by using current time as a base for calculating the next deadline while you use current deadline (which is in the past) to calculate the next one. Under heavy load, won't it just end up doing the RCU synchronization at every iteration, as next_rcu would always run behind? > + } > + } > } > > return NULL; >
On Wed, Apr 21, 2021, at 19:24, Maxime Coquelin wrote: > > > On 4/12/21 5:20 PM, Gaetan Rivet wrote: > > Similar to what was done for the PMD threads [1], reduce the performance > > impact of quiescing too often in the offload thread. > > > > After each processed offload, the offload thread currently quiesce and > > will sync with RCU. This synchronization can be lengthy and make the > > thread unnecessary slow. > > > > Instead attempt to quiesce every 10 ms at most. While the queue is > > empty, the offload thread remains quiescent. > > > > [1]: 81ac8b3b194c ("dpif-netdev: Do RCU synchronization at fixed interval > > in PMD main loop.") > > > > Signed-off-by: Gaetan Rivet <grive@u256.net> > > Reviewed-by: Eli Britstein <elibr@nvidia.com> > > --- > > lib/dpif-netdev.c | 18 ++++++++++++++++-- > > 1 file changed, 16 insertions(+), 2 deletions(-) > > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > > index d458bcb12..4e403a9e5 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -2740,15 +2740,20 @@ err_free: > > return -1; > > } > > > > +#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */ > > + > > static void * > > dp_netdev_flow_offload_main(void *data OVS_UNUSED) > > { > > struct dp_offload_thread_item *offload; > > struct ovs_list *list; > > long long int latency_us; > > + long long int next_rcu; > > + long long int now; > > const char *op; > > int ret; > > > > + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; > > for (;;) { > > ovs_mutex_lock(&dp_offload_thread.mutex); > > if (ovs_list_is_empty(&dp_offload_thread.list)) { > > @@ -2756,6 +2761,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) > > ovs_mutex_cond_wait(&dp_offload_thread.cond, > > &dp_offload_thread.mutex); > > ovsrcu_quiesce_end(); > > + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; > > } > > list = ovs_list_pop_front(&dp_offload_thread.list); > > dp_offload_thread.enqueued_item--; > > @@ -2779,7 +2785,9 @@ 
dp_netdev_flow_offload_main(void *data OVS_UNUSED) > > OVS_NOT_REACHED(); > > } > > > > - latency_us = time_usec() - offload->timestamp; > > + now = time_usec(); > > + > > + latency_us = now - offload->timestamp; > > mov_avg_cma_update(&dp_offload_thread.cma, latency_us); > > mov_avg_ema_update(&dp_offload_thread.ema, latency_us); > > > > @@ -2787,7 +2795,13 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) > > ret == 0 ? "succeed" : "failed", op, > > UUID_ARGS((struct uuid *) &offload->flow->mega_ufid)); > > dp_netdev_free_flow_offload(offload); > > - ovsrcu_quiesce(); > > + > > + /* Do RCU synchronization at fixed interval. */ > > + if (now > next_rcu) { > > + if (!ovsrcu_try_quiesce()) { > > + next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; > > Commit 81ac8b3b194c do it a bit differently, by using current time as a > base for calculating the next deadline while you use current deadline > (which is in the past) to calculate the next one. > > Under heavy load, won't it just end up doing the RCU synchronization at > every it iteration, as next_rcu would always run behind? > Hello Maxime, I think there is an issue, but I'm not sure it will be related to a specific load. The try-lock here will fail if the global seq locking fails. It would be extremely unlucky for multiple calls to fail, but it will be unrelated to a specific charge I believe. In the end seq locking happens for so many concurrency primitives that all threads are touching that pretty often. So even if we fail there, we will eventually succeed, and as the chance to succeed are always pretty high, even if we were lagging 10 ms behind schedule, we will eventually catch-up with a limited number of quiescing. So I've never had an issue with this, but still, I think it's badly written. My initial thought was that I wanted to force another attempt quickly in case this one failed, as the RCU should take priority over the offload thread. 
However, unlike the PMD thread, the offload thread is meant to yield, and waiting a little on the seq lock is fine. So I think instead of duct-taping a way to make sure we quiesce, I should just use ovsrcu_quiesce() and wait for the call to resume. Only the PMDs are meant to run-to-completion. So I propose to switch to ovsrcu_quiesce, and base next_rcu on 'now' instead of itself. But if anyone has another opinion feel free to let me know.
On 4/22/21 1:51 PM, Gaƫtan Rivet wrote: > On Wed, Apr 21, 2021, at 19:24, Maxime Coquelin wrote: >> >> >> On 4/12/21 5:20 PM, Gaetan Rivet wrote: >>> Similar to what was done for the PMD threads [1], reduce the performance >>> impact of quiescing too often in the offload thread. >>> >>> After each processed offload, the offload thread currently quiesce and >>> will sync with RCU. This synchronization can be lengthy and make the >>> thread unnecessary slow. >>> >>> Instead attempt to quiesce every 10 ms at most. While the queue is >>> empty, the offload thread remains quiescent. >>> >>> [1]: 81ac8b3b194c ("dpif-netdev: Do RCU synchronization at fixed interval >>> in PMD main loop.") >>> >>> Signed-off-by: Gaetan Rivet <grive@u256.net> >>> Reviewed-by: Eli Britstein <elibr@nvidia.com> >>> --- >>> lib/dpif-netdev.c | 18 ++++++++++++++++-- >>> 1 file changed, 16 insertions(+), 2 deletions(-) >>> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >>> index d458bcb12..4e403a9e5 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -2740,15 +2740,20 @@ err_free: >>> return -1; >>> } >>> >>> +#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */ >>> + >>> static void * >>> dp_netdev_flow_offload_main(void *data OVS_UNUSED) >>> { >>> struct dp_offload_thread_item *offload; >>> struct ovs_list *list; >>> long long int latency_us; >>> + long long int next_rcu; >>> + long long int now; >>> const char *op; >>> int ret; >>> >>> + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; >>> for (;;) { >>> ovs_mutex_lock(&dp_offload_thread.mutex); >>> if (ovs_list_is_empty(&dp_offload_thread.list)) { >>> @@ -2756,6 +2761,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) >>> ovs_mutex_cond_wait(&dp_offload_thread.cond, >>> &dp_offload_thread.mutex); >>> ovsrcu_quiesce_end(); >>> + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; >>> } >>> list = ovs_list_pop_front(&dp_offload_thread.list); >>> 
dp_offload_thread.enqueued_item--; >>> @@ -2779,7 +2785,9 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) >>> OVS_NOT_REACHED(); >>> } >>> >>> - latency_us = time_usec() - offload->timestamp; >>> + now = time_usec(); >>> + >>> + latency_us = now - offload->timestamp; >>> mov_avg_cma_update(&dp_offload_thread.cma, latency_us); >>> mov_avg_ema_update(&dp_offload_thread.ema, latency_us); >>> >>> @@ -2787,7 +2795,13 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) >>> ret == 0 ? "succeed" : "failed", op, >>> UUID_ARGS((struct uuid *) &offload->flow->mega_ufid)); >>> dp_netdev_free_flow_offload(offload); >>> - ovsrcu_quiesce(); >>> + >>> + /* Do RCU synchronization at fixed interval. */ >>> + if (now > next_rcu) { >>> + if (!ovsrcu_try_quiesce()) { >>> + next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; >> >> Commit 81ac8b3b194c do it a bit differently, by using current time as a >> base for calculating the next deadline while you use current deadline >> (which is in the past) to calculate the next one. >> >> Under heavy load, won't it just end up doing the RCU synchronization at >> every it iteration, as next_rcu would always run behind? >> > > Hello Maxime, > > I think there is an issue, but I'm not sure it will be related to a specific load. > The try-lock here will fail if the global seq locking fails. It would be extremely unlucky for multiple calls > to fail, but it will be unrelated to a specific charge I believe. In the end seq locking happens for so many > concurrency primitives that all threads are touching that pretty often. > > So even if we fail there, we will eventually succeed, and as the chance to succeed are always pretty high, > even if we were lagging 10 ms behind schedule, we will eventually catch-up with a limited number of quiescing. > > So I've never had an issue with this, but still, I think it's badly written. 
> My initial thought was that I wanted to force another attempt quickly in case this one failed, as the RCU should take priority > over the offload thread. > However, unlike the PMD thread, the offload thread is meant to yield, and waiting a little on the seq lock is fine. > So I think instead of duck-taping a way to make sure we quiesce, I should just use ovsrcu_quiesce() and wait for > the call to resume. Only the PMDs are meant to run-to-completion. Thanks for the detailed analysis. > So I propose to switch to ovsrcu_quiesce, and base next_rcu on 'now' instead of itself. > But if anyone has another opinion feel free to let me know. > I think your proposal makes sense. Maxime
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index d458bcb12..4e403a9e5 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -2740,15 +2740,20 @@ err_free: return -1; } +#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */ + static void * dp_netdev_flow_offload_main(void *data OVS_UNUSED) { struct dp_offload_thread_item *offload; struct ovs_list *list; long long int latency_us; + long long int next_rcu; + long long int now; const char *op; int ret; + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; for (;;) { ovs_mutex_lock(&dp_offload_thread.mutex); if (ovs_list_is_empty(&dp_offload_thread.list)) { @@ -2756,6 +2761,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) ovs_mutex_cond_wait(&dp_offload_thread.cond, &dp_offload_thread.mutex); ovsrcu_quiesce_end(); + next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; } list = ovs_list_pop_front(&dp_offload_thread.list); dp_offload_thread.enqueued_item--; @@ -2779,7 +2785,9 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) OVS_NOT_REACHED(); } - latency_us = time_usec() - offload->timestamp; + now = time_usec(); + + latency_us = now - offload->timestamp; mov_avg_cma_update(&dp_offload_thread.cma, latency_us); mov_avg_ema_update(&dp_offload_thread.ema, latency_us); @@ -2787,7 +2795,13 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) ret == 0 ? "succeed" : "failed", op, UUID_ARGS((struct uuid *) &offload->flow->mega_ufid)); dp_netdev_free_flow_offload(offload); - ovsrcu_quiesce(); + + /* Do RCU synchronization at fixed interval. */ + if (now > next_rcu) { + if (!ovsrcu_try_quiesce()) { + next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; + } + } } return NULL;