diff mbox series

[ovs-dev,1/1] ofproto-dpif-upcall: Avoid stale ukeys leaks.

Message ID 20240523104629.2936785-1-roid@nvidia.com
State Under Review, archived
Delegated to: Eelco Chaudron
Headers show
Series [ovs-dev,1/1] ofproto-dpif-upcall: Avoid stale ukeys leaks. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Roi Dayan May 23, 2024, 10:46 a.m. UTC
It is observed in some environments that there are much more ukeys than
actual DP flows. For example:

$ ovs-appctl upcall/show
system@ovs-system:
flows : (current 7) (avg 6) (max 117) (limit 2125)
offloaded flows : 525
dump duration : 1063ms
ufid enabled : true

23: (keys 3612)
24: (keys 3625)
25: (keys 3485)

The revalidator threads are busy revalidating the stale ukeys leading to
high CPU and long dump duration.

There are some possible situations that may result in stale ukeys that
have no corresponding DP flows.

In revalidator, push_dp_ops() doesn't check error if the op type is not
DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
case, where the old flow is deleted first and then the new one is
created. If the creation fails, the ukey will be stale (no corresponding
DP flow). This patch adds a warning in such case.

Another possible scenario is in handle_upcalls() if a PUT operation did
not succeed and op->error attribute was not set correctly it can lead to
stale ukey in operational state.

This patch adds checks in the sweep phase for such ukeys and move them
to DELETE so that they can be cleared eventually.

Co-authored-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Roi Dayan <roid@nvidia.com>
---
 ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

Comments

Mike Pattrick May 29, 2024, 2:35 p.m. UTC | #1
On Thu, May 23, 2024 at 6:47 AM Roi Dayan via dev
<ovs-dev@openvswitch.org> wrote:
>
> It is observed in some environments that there are much more ukeys than
> actual DP flows. For example:
>
> $ ovs-appctl upcall/show
> system@ovs-system:
> flows : (current 7) (avg 6) (max 117) (limit 2125)
> offloaded flows : 525
> dump duration : 1063ms
> ufid enabled : true
>
> 23: (keys 3612)
> 24: (keys 3625)
> 25: (keys 3485)
>
> The revalidator threads are busy revalidating the stale ukeys leading to
> high CPU and long dump duration.
>
> There are some possible situations that may result in stale ukeys that
> have no corresponding DP flows.
>
> In revalidator, push_dp_ops() doesn't check error if the op type is not
> DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
> case, where the old flow is deleted first and then the new one is
> created. If the creation fails, the ukey will be stale (no corresponding
> DP flow). This patch adds a warning in such case.
>
> Another possible scenario is in handle_upcalls() if a PUT operation did
> not succeed and op->error attribute was not set correctly it can lead to
> stale ukey in operational state.
>
> This patch adds checks in the sweep phase for such ukeys and move them
> to DELETE so that they can be cleared eventually.
>
> Co-authored-by: Han Zhou <hzhou@ovn.org>
> Signed-off-by: Han Zhou <hzhou@ovn.org>
> Signed-off-by: Roi Dayan <roid@nvidia.com>
> ---
>  ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
>
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index 83609ec62b63..e9520ebdf910 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>  COVERAGE_DEFINE(dumped_new_flow);
>  COVERAGE_DEFINE(handler_duplicate_upcall);
>  COVERAGE_DEFINE(revalidate_missed_dp_flow);
> +COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
>  COVERAGE_DEFINE(ukey_dp_change);
>  COVERAGE_DEFINE(ukey_invalid_stat_reset);
>  COVERAGE_DEFINE(ukey_replace_contention);
> @@ -278,6 +279,7 @@ enum flow_del_reason {
>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
> +    FDR_FLOW_STALE,         /* Flow stale detected. */
>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>      FDR_NO_OFPROTO,         /* Bridge not found. */
>      FDR_PURGE,              /* User requested flow deletion. */
> @@ -2557,6 +2559,10 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)
>
>          if (op->dop.type != DPIF_OP_FLOW_DEL) {
>              /* Only deleted flows need their stats pushed. */
> +            if (op->dop.error) {
> +                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
> +                             "%p", op->dop.error, op->dop.type, op->ukey);
> +            }
>              continue;
>          }
>
> @@ -3027,6 +3033,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>                      del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
>                  } else if (!seq_mismatch) {
>                      result = UKEY_KEEP;
> +                } else if (!ukey->stats.used &&

Would it be possible for stats.used to be set but the dp flow to be
deleted? For example, if a flow is offloaded to TC, but something
external to OVS clears it?


Thanks,
Mike

> +                           udpif_flow_time_delta(udpif, ukey) * 1000 >
> +                           ofproto_max_idle) {
> +                    COVERAGE_INC(revalidate_missed_dp_flow_del);
> +                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
> +                                 "%p delta %llus", ukey,
> +                                 udpif_flow_time_delta(udpif, ukey));
> +                    result = UKEY_DELETE;
> +                    del_reason = FDR_FLOW_STALE;
>                  } else {
>                      struct dpif_flow_stats stats;
>                      COVERAGE_INC(revalidate_missed_dp_flow);
> --
> 2.21.0
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Roi Dayan May 30, 2024, 6:21 a.m. UTC | #2
On 29/05/2024 17:35, Mike Pattrick wrote:
> On Thu, May 23, 2024 at 6:47 AM Roi Dayan via dev
> <ovs-dev@openvswitch.org> wrote:
>>
>> It is observed in some environments that there are much more ukeys than
>> actual DP flows. For example:
>>
>> $ ovs-appctl upcall/show
>> system@ovs-system:
>> flows : (current 7) (avg 6) (max 117) (limit 2125)
>> offloaded flows : 525
>> dump duration : 1063ms
>> ufid enabled : true
>>
>> 23: (keys 3612)
>> 24: (keys 3625)
>> 25: (keys 3485)
>>
>> The revalidator threads are busy revalidating the stale ukeys leading to
>> high CPU and long dump duration.
>>
>> There are some possible situations that may result in stale ukeys that
>> have no corresponding DP flows.
>>
>> In revalidator, push_dp_ops() doesn't check error if the op type is not
>> DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
>> case, where the old flow is deleted first and then the new one is
>> created. If the creation fails, the ukey will be stale (no corresponding
>> DP flow). This patch adds a warning in such case.
>>
>> Another possible scenario is in handle_upcalls() if a PUT operation did
>> not succeed and op->error attribute was not set correctly it can lead to
>> stale ukey in operational state.
>>
>> This patch adds checks in the sweep phase for such ukeys and move them
>> to DELETE so that they can be cleared eventually.
>>
>> Co-authored-by: Han Zhou <hzhou@ovn.org>
>> Signed-off-by: Han Zhou <hzhou@ovn.org>
>> Signed-off-by: Roi Dayan <roid@nvidia.com>
>> ---
>>  ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
>>  1 file changed, 15 insertions(+)
>>
>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>> index 83609ec62b63..e9520ebdf910 100644
>> --- a/ofproto/ofproto-dpif-upcall.c
>> +++ b/ofproto/ofproto-dpif-upcall.c
>> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>>  COVERAGE_DEFINE(dumped_new_flow);
>>  COVERAGE_DEFINE(handler_duplicate_upcall);
>>  COVERAGE_DEFINE(revalidate_missed_dp_flow);
>> +COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
>>  COVERAGE_DEFINE(ukey_dp_change);
>>  COVERAGE_DEFINE(ukey_invalid_stat_reset);
>>  COVERAGE_DEFINE(ukey_replace_contention);
>> @@ -278,6 +279,7 @@ enum flow_del_reason {
>>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
>> +    FDR_FLOW_STALE,         /* Flow stale detected. */
>>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>>      FDR_NO_OFPROTO,         /* Bridge not found. */
>>      FDR_PURGE,              /* User requested flow deletion. */
>> @@ -2557,6 +2559,10 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)
>>
>>          if (op->dop.type != DPIF_OP_FLOW_DEL) {
>>              /* Only deleted flows need their stats pushed. */
>> +            if (op->dop.error) {
>> +                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
>> +                             "%p", op->dop.error, op->dop.type, op->ukey);
>> +            }
>>              continue;
>>          }
>>
>> @@ -3027,6 +3033,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>>                      del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
>>                  } else if (!seq_mismatch) {
>>                      result = UKEY_KEEP;
>> +                } else if (!ukey->stats.used &&
> 
> Would it be possible for stats.used to be set but the dp flow to be
> deleted? For example, if a flow is offloaded to TC, but something
> external to OVS clears it?
> 
> 
> Thanks,
> Mike

Hi Mike, 

Yes it could be and in this situation the revalidator checking
the tc dump to check the rules will cause a readd of that rule.

Thanks,
Roi


> 
>> +                           udpif_flow_time_delta(udpif, ukey) * 1000 >
>> +                           ofproto_max_idle) {
>> +                    COVERAGE_INC(revalidate_missed_dp_flow_del);
>> +                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
>> +                                 "%p delta %llus", ukey,
>> +                                 udpif_flow_time_delta(udpif, ukey));
>> +                    result = UKEY_DELETE;
>> +                    del_reason = FDR_FLOW_STALE;
>>                  } else {
>>                      struct dpif_flow_stats stats;
>>                      COVERAGE_INC(revalidate_missed_dp_flow);
>> --
>> 2.21.0
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>
Eelco Chaudron May 30, 2024, 3:48 p.m. UTC | #3
On 23 May 2024, at 12:46, Roi Dayan via dev wrote:

> It is observed in some environments that there are much more ukeys than
> actual DP flows. For example:
>
> $ ovs-appctl upcall/show
> system@ovs-system:
> flows : (current 7) (avg 6) (max 117) (limit 2125)
> offloaded flows : 525
> dump duration : 1063ms
> ufid enabled : true
>
> 23: (keys 3612)
> 24: (keys 3625)
> 25: (keys 3485)
>
> The revalidator threads are busy revalidating the stale ukeys leading to
> high CPU and long dump duration.
>
> There are some possible situations that may result in stale ukeys that
> have no corresponding DP flows.
>
> In revalidator, push_dp_ops() doesn't check error if the op type is not
> DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
> case, where the old flow is deleted first and then the new one is
> created. If the creation fails, the ukey will be stale (no corresponding
> DP flow). This patch adds a warning in such case.

Not sure I understand, this behavior should be captured by the UKEY_INCONSISTENT state.

> Another possible scenario is in handle_upcalls() if a PUT operation did
> not succeed and op->error attribute was not set correctly it can lead to
> stale ukey in operational state.


Guess we need to figure out these cases, as these are the root cause of your problem.

>
> This patch adds checks in the sweep phase for such ukeys and move them
> to DELETE so that they can be cleared eventually.
>
> Co-authored-by: Han Zhou <hzhou@ovn.org>
> Signed-off-by: Han Zhou <hzhou@ovn.org>
> Signed-off-by: Roi Dayan <roid@nvidia.com>
> ---
>  ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
>
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index 83609ec62b63..e9520ebdf910 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>  COVERAGE_DEFINE(dumped_new_flow);
>  COVERAGE_DEFINE(handler_duplicate_upcall);
>  COVERAGE_DEFINE(revalidate_missed_dp_flow);
> +COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
>  COVERAGE_DEFINE(ukey_dp_change);
>  COVERAGE_DEFINE(ukey_invalid_stat_reset);
>  COVERAGE_DEFINE(ukey_replace_contention);
> @@ -278,6 +279,7 @@ enum flow_del_reason {
>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
> +    FDR_FLOW_STALE,         /* Flow stale detected. */

Please also update the associated script, see above comment.

>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>      FDR_NO_OFPROTO,         /* Bridge not found. */
>      FDR_PURGE,              /* User requested flow deletion. */
> @@ -2557,6 +2559,10 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)


The (op->dop.error) condition below will never be reached, as it’s explicitly checked above. So the error message will never happen. The condition you explain is already covered by UKEY_INCONSISTENT. Not sure if we need a log message for this either at warn level, maybe debug, especially as you say it can happen often.

>          if (op->dop.type != DPIF_OP_FLOW_DEL) {
>              /* Only deleted flows need their stats pushed. */
> +            if (op->dop.error) {
> +                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
> +                             "%p", op->dop.error, op->dop.type, op->ukey);
> +            }
>              continue;
>          }
>
> @@ -3027,6 +3033,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>                      del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
>                  } else if (!seq_mismatch) {
>                      result = UKEY_KEEP;
> +                } else if (!ukey->stats.used &&
> +                           udpif_flow_time_delta(udpif, ukey) * 1000 >
> +                           ofproto_max_idle) {

In theory, this is a missed dp flow, i.e. the same as the else {} case below.
Maybe we should change revalidate_ukey() to have a test for this?


Maybe something like this (not tested):

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index e9520ebdf..c34978fc9 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -279,7 +279,7 @@ enum flow_del_reason {
     FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
     FDR_FLOW_IDLE,          /* Flow idle timeout. */
     FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
-    FDR_FLOW_STALE,         /* Flow stale detected. */
+    FDR_FLOW_NO_STATS_IDLE, /* Flow idled out without receiving statistics. */
     FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
     FDR_NO_OFPROTO,         /* Bridge not found. */
     FDR_PURGE,              /* User requested flow deletion. */
@@ -2452,7 +2452,14 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
         log_unexpected_stats_jump(ukey, stats);
     }

-    if (need_revalidate) {
+    if (!ukey->stats.used
+        && ukey->created < udpif->dpif->current_ms - ofproto_max_idle) {
+        /* If the flow has a 'used' value of 0, meaning no stats were received
+         * for this flow, and the configured idle time has elapsed, it might
+         * indicates a stale flow (i.e., a flow without an installed datapath
+         * rule). In this case, it is safe to mark this ukey for deletion. */
+        *del_reason = FDR_FLOW_NO_STATS_IDLE;
+    } else if (need_revalidate) {
         if (should_revalidate(udpif, ukey, push.n_packets)) {
             if (!ukey->xcache) {
                 ukey->xcache = xlate_cache_new();


Others, thoughts?

> +                    COVERAGE_INC(revalidate_missed_dp_flow_del);
> +                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
> +                                 "%p delta %llus", ukey,
> +                                 udpif_flow_time_delta(udpif, ukey));
> +                    result = UKEY_DELETE;
> +                    del_reason = FDR_FLOW_STALE;
>                  } else {
>                      struct dpif_flow_stats stats;
>                      COVERAGE_INC(revalidate_missed_dp_flow);
> -- 
> 2.21.0
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Roi Dayan June 3, 2024, 7:18 a.m. UTC | #4
On 30/05/2024 18:48, Eelco Chaudron wrote:
> 
> 
> On 23 May 2024, at 12:46, Roi Dayan via dev wrote:
> 
>> It is observed in some environments that there are much more ukeys than
>> actual DP flows. For example:
>>
>> $ ovs-appctl upcall/show
>> system@ovs-system:
>> flows : (current 7) (avg 6) (max 117) (limit 2125)
>> offloaded flows : 525
>> dump duration : 1063ms
>> ufid enabled : true
>>
>> 23: (keys 3612)
>> 24: (keys 3625)
>> 25: (keys 3485)
>>
>> The revalidator threads are busy revalidating the stale ukeys leading to
>> high CPU and long dump duration.
>>
>> There are some possible situations that may result in stale ukeys that
>> have no corresponding DP flows.
>>
>> In revalidator, push_dp_ops() doesn't check error if the op type is not
>> DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
>> case, where the old flow is deleted first and then the new one is
>> created. If the creation fails, the ukey will be stale (no corresponding
>> DP flow). This patch adds a warning in such case.
> 
> Not sure I understand, this behavior should be captured by the UKEY_INCONSISTENT state.

Hi Eelco,

thanks for reviewing.
We started the patch on older branch as we didn't rebase for some time
and got a little later on sending it.
I see the addition now for UKEY_INCOSISTENT in the following patch:

cf11766cbcf1 ofproto-dpif-upcall: Fix push_dp_ops to handle all errors.

Looks like it handles the same situation we tried to handle in this patch.

> 
>> Another possible scenario is in handle_upcalls() if a PUT operation did
>> not succeed and op->error attribute was not set correctly it can lead to
>> stale ukey in operational state.
> 
> 
> Guess we need to figure out these cases, as these are the root cause of your problem.

right. maybe. but this could help keep the system alive/clean while logging the
bad situation instead of having increase in those stale/inconsistent ukeys.
I understand if it's not nice to handle it like this.

> 
>>
>> This patch adds checks in the sweep phase for such ukeys and move them
>> to DELETE so that they can be cleared eventually.
>>
>> Co-authored-by: Han Zhou <hzhou@ovn.org>
>> Signed-off-by: Han Zhou <hzhou@ovn.org>
>> Signed-off-by: Roi Dayan <roid@nvidia.com>
>> ---
>>  ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
>>  1 file changed, 15 insertions(+)
>>
>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>> index 83609ec62b63..e9520ebdf910 100644
>> --- a/ofproto/ofproto-dpif-upcall.c
>> +++ b/ofproto/ofproto-dpif-upcall.c
>> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>>  COVERAGE_DEFINE(dumped_new_flow);
>>  COVERAGE_DEFINE(handler_duplicate_upcall);
>>  COVERAGE_DEFINE(revalidate_missed_dp_flow);
>> +COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
>>  COVERAGE_DEFINE(ukey_dp_change);
>>  COVERAGE_DEFINE(ukey_invalid_stat_reset);
>>  COVERAGE_DEFINE(ukey_replace_contention);
>> @@ -278,6 +279,7 @@ enum flow_del_reason {
>>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
>> +    FDR_FLOW_STALE,         /* Flow stale detected. */
> 
> Please also update the associated script, see above comment.
> 
>>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>>      FDR_NO_OFPROTO,         /* Bridge not found. */
>>      FDR_PURGE,              /* User requested flow deletion. */
>> @@ -2557,6 +2559,10 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)
> 
> 
> The (op->dop.error) condition below will never be reached, as it’s explicitly checked above. So the error message will never happen. The condition you explain is already covered by UKEY_INCONSISTENT. Not sure if we need a log message for this either at warn level, maybe debug, especially as you say it can happen often.
> 

right. as replied above. I didn't notice this when I did the cherry-pick as git didn't
show me a conflict in this area at all.
This commit was done before the change so we did hit this spot. we got little off
checking the commit and sending it on time for review. sorry about that.
cf11766cbcf1 ofproto-dpif-upcall: Fix push_dp_ops to handle all errors.

I think we should redo our checks with the commit adding the UKEY_INCONSISTENT
state and see our result again.

thanks for the review.


>>          if (op->dop.type != DPIF_OP_FLOW_DEL) {
>>              /* Only deleted flows need their stats pushed. */
>> +            if (op->dop.error) {
>> +                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
>> +                             "%p", op->dop.error, op->dop.type, op->ukey);
>> +            }
>>              continue;
>>          }
>>
>> @@ -3027,6 +3033,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>>                      del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
>>                  } else if (!seq_mismatch) {
>>                      result = UKEY_KEEP;
>> +                } else if (!ukey->stats.used &&
>> +                           udpif_flow_time_delta(udpif, ukey) * 1000 >
>> +                           ofproto_max_idle) {
> 
> In theory, this is a missed dp flow, i.e. the same as the else {} case below.
> Maybe we should change revalidate_ukey() to have a test for this?
> 
> 
> Maybe something like this (not tested):
> 
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index e9520ebdf..c34978fc9 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -279,7 +279,7 @@ enum flow_del_reason {
>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
> -    FDR_FLOW_STALE,         /* Flow stale detected. */
> +    FDR_FLOW_NO_STATS_IDLE, /* Flow idled out without receiving statistics. */
>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>      FDR_NO_OFPROTO,         /* Bridge not found. */
>      FDR_PURGE,              /* User requested flow deletion. */
> @@ -2452,7 +2452,14 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
>          log_unexpected_stats_jump(ukey, stats);
>      }
> 
> -    if (need_revalidate) {
> +    if (!ukey->stats.used
> +        && ukey->created < udpif->dpif->current_ms - ofproto_max_idle) {
> +        /* If the flow has a 'used' value of 0, meaning no stats were received
> +         * for this flow, and the configured idle time has elapsed, it might
> +         * indicates a stale flow (i.e., a flow without an installed datapath
> +         * rule). In this case, it is safe to mark this ukey for deletion. */
> +        *del_reason = FDR_FLOW_NO_STATS_IDLE;
> +    } else if (need_revalidate) {
>          if (should_revalidate(udpif, ukey, push.n_packets)) {
>              if (!ukey->xcache) {
>                  ukey->xcache = xlate_cache_new();
> 
> 
> Others, thoughts?
> 
>> +                    COVERAGE_INC(revalidate_missed_dp_flow_del);
>> +                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
>> +                                 "%p delta %llus", ukey,
>> +                                 udpif_flow_time_delta(udpif, ukey));
>> +                    result = UKEY_DELETE;
>> +                    del_reason = FDR_FLOW_STALE;
>>                  } else {
>>                      struct dpif_flow_stats stats;
>>                      COVERAGE_INC(revalidate_missed_dp_flow);
>> -- 
>> 2.21.0
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Roi Dayan June 3, 2024, 8:07 a.m. UTC | #5
On 03/06/2024 10:18, Roi Dayan wrote:
> 
> 
> On 30/05/2024 18:48, Eelco Chaudron wrote:
>>
>>
>> On 23 May 2024, at 12:46, Roi Dayan via dev wrote:
>>
>>> It is observed in some environments that there are much more ukeys than
>>> actual DP flows. For example:
>>>
>>> $ ovs-appctl upcall/show
>>> system@ovs-system:
>>> flows : (current 7) (avg 6) (max 117) (limit 2125)
>>> offloaded flows : 525
>>> dump duration : 1063ms
>>> ufid enabled : true
>>>
>>> 23: (keys 3612)
>>> 24: (keys 3625)
>>> 25: (keys 3485)
>>>
>>> The revalidator threads are busy revalidating the stale ukeys leading to
>>> high CPU and long dump duration.
>>>
>>> There are some possible situations that may result in stale ukeys that
>>> have no corresponding DP flows.
>>>
>>> In revalidator, push_dp_ops() doesn't check error if the op type is not
>>> DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
>>> case, where the old flow is deleted first and then the new one is
>>> created. If the creation fails, the ukey will be stale (no corresponding
>>> DP flow). This patch adds a warning in such case.
>>
>> Not sure I understand, this behavior should be captured by the UKEY_INCONSISTENT state.
> 
> Hi Eelco,
> 
> thanks for reviewing.
> We started the patch on older branch as we didn't rebase for some time
> and got a little later on sending it.
> I see the addition now for UKEY_INCOSISTENT in the following patch:
> 
> cf11766cbcf1 ofproto-dpif-upcall: Fix push_dp_ops to handle all errors.
> 
> Looks like it handles the same situation we tried to handle in this patch.
> 
>>
>>> Another possible scenario is in handle_upcalls() if a PUT operation did
>>> not succeed and op->error attribute was not set correctly it can lead to
>>> stale ukey in operational state.
>>
>>
>> Guess we need to figure out these cases, as these are the root cause of your problem.
> 
> right. maybe. but this could help keep the system alive/clean while logging the
> bad situation instead of having increase in those stale/inconsistent ukeys.
> I understand if it's not nice to handle it like this.
> 

Hi Eelco,

I remember now one of the reproduction scenarios we did is do some traffic
to get rules added using TC and then delete those from tc command line
and it got to stale ukeys.
The revalidator dump thread not seeing the rules so not updating anything
and sweep over the ukeys skipping them as well.
This is why we checked against the timing stats of the ukey.

I remember I tested on the upstream code and not our development branch
and it reproduces. I didn't notice if the commit adding UKEY_INCONSISTENT
existed but it handles errors from adding flows so I dont think it matters.

I'll try to check and verify again but I think it's still here.
So while cases getting dop.error handled with UKEY_INCONSISTENT,
this case I think is not.

Thanks,
Roi

>>
>>>
>>> This patch adds checks in the sweep phase for such ukeys and move them
>>> to DELETE so that they can be cleared eventually.
>>>
>>> Co-authored-by: Han Zhou <hzhou@ovn.org>
>>> Signed-off-by: Han Zhou <hzhou@ovn.org>
>>> Signed-off-by: Roi Dayan <roid@nvidia.com>
>>> ---
>>>  ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
>>>  1 file changed, 15 insertions(+)
>>>
>>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>>> index 83609ec62b63..e9520ebdf910 100644
>>> --- a/ofproto/ofproto-dpif-upcall.c
>>> +++ b/ofproto/ofproto-dpif-upcall.c
>>> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>>>  COVERAGE_DEFINE(dumped_new_flow);
>>>  COVERAGE_DEFINE(handler_duplicate_upcall);
>>>  COVERAGE_DEFINE(revalidate_missed_dp_flow);
>>> +COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
>>>  COVERAGE_DEFINE(ukey_dp_change);
>>>  COVERAGE_DEFINE(ukey_invalid_stat_reset);
>>>  COVERAGE_DEFINE(ukey_replace_contention);
>>> @@ -278,6 +279,7 @@ enum flow_del_reason {
>>>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>>>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>>>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
>>> +    FDR_FLOW_STALE,         /* Flow stale detected. */
>>
>> Please also update the associated script, see above comment.
>>
>>>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>>>      FDR_NO_OFPROTO,         /* Bridge not found. */
>>>      FDR_PURGE,              /* User requested flow deletion. */
>>> @@ -2557,6 +2559,10 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)
>>
>>
>> The (op->dop.error) condition below will never be reached, as it’s explicitly checked above. So the error message will never happen. The condition you explain is already covered by UKEY_INCONSISTENT. Not sure if we need a log message for this either at warn level, maybe debug, especially as you say it can happen often.
>>
> 
> right. as replied above. I didn't notice this when I did the cherry-pick as git didn't
> show me a conflict in this area at all.
> This commit was done before the change so we did hit this spot. we got little off
> checking the commit and sending it on time for review. sorry about that.
> cf11766cbcf1 ofproto-dpif-upcall: Fix push_dp_ops to handle all errors.
> 
> I think we should redo our checks with the commit adding the UKEY_INCONSISTENT
> state and see our result again.
> 
> thanks for the review.
> 
> 
>>>          if (op->dop.type != DPIF_OP_FLOW_DEL) {
>>>              /* Only deleted flows need their stats pushed. */
>>> +            if (op->dop.error) {
>>> +                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
>>> +                             "%p", op->dop.error, op->dop.type, op->ukey);
>>> +            }
>>>              continue;
>>>          }
>>>
>>> @@ -3027,6 +3033,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>>>                      del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
>>>                  } else if (!seq_mismatch) {
>>>                      result = UKEY_KEEP;
>>> +                } else if (!ukey->stats.used &&
>>> +                           udpif_flow_time_delta(udpif, ukey) * 1000 >
>>> +                           ofproto_max_idle) {
>>
>> In theory, this is a missed dp flow, i.e. the same as the else {} case below.
>> Maybe we should change revalidate_ukey() to have a test for this?
>>
>>
>> Maybe something like this (not tested):
>>
>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>> index e9520ebdf..c34978fc9 100644
>> --- a/ofproto/ofproto-dpif-upcall.c
>> +++ b/ofproto/ofproto-dpif-upcall.c
>> @@ -279,7 +279,7 @@ enum flow_del_reason {
>>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
>> -    FDR_FLOW_STALE,         /* Flow stale detected. */
>> +    FDR_FLOW_NO_STATS_IDLE, /* Flow idled out without receiving statistics. */
>>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>>      FDR_NO_OFPROTO,         /* Bridge not found. */
>>      FDR_PURGE,              /* User requested flow deletion. */
>> @@ -2452,7 +2452,14 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
>>          log_unexpected_stats_jump(ukey, stats);
>>      }
>>
>> -    if (need_revalidate) {
>> +    if (!ukey->stats.used
>> +        && ukey->created < udpif->dpif->current_ms - ofproto_max_idle) {
>> +        /* If the flow has a 'used' value of 0, meaning no stats were received
>> +         * for this flow, and the configured idle time has elapsed, it might
>> +         * indicates a stale flow (i.e., a flow without an installed datapath
>> +         * rule). In this case, it is safe to mark this ukey for deletion. */
>> +        *del_reason = FDR_FLOW_NO_STATS_IDLE;
>> +    } else if (need_revalidate) {
>>          if (should_revalidate(udpif, ukey, push.n_packets)) {
>>              if (!ukey->xcache) {
>>                  ukey->xcache = xlate_cache_new();
>>
>>
>> Others, thoughts?
>>
>>> +                    COVERAGE_INC(revalidate_missed_dp_flow_del);
>>> +                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
>>> +                                 "%p delta %llus", ukey,
>>> +                                 udpif_flow_time_delta(udpif, ukey));
>>> +                    result = UKEY_DELETE;
>>> +                    del_reason = FDR_FLOW_STALE;
>>>                  } else {
>>>                      struct dpif_flow_stats stats;
>>>                      COVERAGE_INC(revalidate_missed_dp_flow);
>>> -- 
>>> 2.21.0
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>
Eelco Chaudron June 3, 2024, 1:29 p.m. UTC | #6
On 3 Jun 2024, at 10:07, Roi Dayan wrote:

> On 03/06/2024 10:18, Roi Dayan wrote:
>>
>>
>> On 30/05/2024 18:48, Eelco Chaudron wrote:
>>>
>>>
>>> On 23 May 2024, at 12:46, Roi Dayan via dev wrote:
>>>
>>>> It is observed in some environments that there are much more ukeys than
>>>> actual DP flows. For example:
>>>>
>>>> $ ovs-appctl upcall/show
>>>> system@ovs-system:
>>>> flows : (current 7) (avg 6) (max 117) (limit 2125)
>>>> offloaded flows : 525
>>>> dump duration : 1063ms
>>>> ufid enabled : true
>>>>
>>>> 23: (keys 3612)
>>>> 24: (keys 3625)
>>>> 25: (keys 3485)
>>>>
>>>> The revalidator threads are busy revalidating the stale ukeys leading to
>>>> high CPU and long dump duration.
>>>>
>>>> There are some possible situations that may result in stale ukeys that
>>>> have no corresponding DP flows.
>>>>
>>>> In revalidator, push_dp_ops() doesn't check error if the op type is not
>>>> DEL. It is possible that a PUT(MODIFY) fails, especially for tc offload
>>>> case, where the old flow is deleted first and then the new one is
>>>> created. If the creation fails, the ukey will be stale (no corresponding
>>>> DP flow). This patch adds a warning in such case.
>>>
>>> Not sure I understand, this behavior should be captured by the UKEY_INCONSISTENT state.
>>
>> Hi Eelco,
>>
>> thanks for reviewing.
>> We started the patch on older branch as we didn't rebase for some time
>> and got a little later on sending it.
>> I see the addition now for UKEY_INCOSISTENT in the following patch:
>>
>> cf11766cbcf1 ofproto-dpif-upcall: Fix push_dp_ops to handle all errors.
>>
>> Looks like it handles the same situation we tried to handle in this patch.
>>
>>>
>>>> Another possible scenario is in handle_upcalls() if a PUT operation did
>>>> not succeed and op->error attribute was not set correctly it can lead to
>>>> stale ukey in operational state.
>>>
>>>
>>> Guess we need to figure out these cases, as these are the root cause of your problem.
>>
>> right. maybe. but this could help keep the system alive/clean while logging the
>> bad situation instead of having increase in those stale/inconsistent ukeys.
>> I understand if it's not nice to handle it like this.
>>
>
> Hi Eelco,
>
> I remember now one of the reproduction scenarios we did is do some traffic
> to get rules added using TC and then delete those from tc command line
> and it got to stale ukeys.
> The revalidator dump thread not seeing the rules so not updating anything
> and sweep over the ukeys skipping them as well.
> This is why we checked against the timing stats of the ukey.
>
> I remember I tested on the upstream code and not our development branch
> and it reproduces. I didn't notice if the commit adding UKEY_INCONSISTENT
> existed but it handles errors from adding flows so I dont think it matters.
>
> I'll try to check and verify again but I think it's still here.
> So while cases getting dop.error handled with UKEY_INCONSISTENT,
> this case I think is not.

I think you are right this case is not covered by the UKEY_INCONSISTENT test below. See my suggestion on fixing this in revalidate_ukey().

Maybe you could also add a test case for this in the offload test suite.

//Eelco

>>>> This patch adds checks in the sweep phase for such ukeys and move them
>>>> to DELETE so that they can be cleared eventually.
>>>>
>>>> Co-authored-by: Han Zhou <hzhou@ovn.org>
>>>> Signed-off-by: Han Zhou <hzhou@ovn.org>
>>>> Signed-off-by: Roi Dayan <roid@nvidia.com>
>>>> ---
>>>>  ofproto/ofproto-dpif-upcall.c | 15 +++++++++++++++
>>>>  1 file changed, 15 insertions(+)
>>>>
>>>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>>>> index 83609ec62b63..e9520ebdf910 100644
>>>> --- a/ofproto/ofproto-dpif-upcall.c
>>>> +++ b/ofproto/ofproto-dpif-upcall.c
>>>> @@ -57,6 +57,7 @@ COVERAGE_DEFINE(dumped_inconsistent_flow);
>>>>  COVERAGE_DEFINE(dumped_new_flow);
>>>>  COVERAGE_DEFINE(handler_duplicate_upcall);
>>>>  COVERAGE_DEFINE(revalidate_missed_dp_flow);
>>>> +COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
>>>>  COVERAGE_DEFINE(ukey_dp_change);
>>>>  COVERAGE_DEFINE(ukey_invalid_stat_reset);
>>>>  COVERAGE_DEFINE(ukey_replace_contention);
>>>> @@ -278,6 +279,7 @@ enum flow_del_reason {
>>>>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>>>>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>>>>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
>>>> +    FDR_FLOW_STALE,         /* Flow stale detected. */
>>>
>>> Please also update the associated script, see above comment.
>>>
>>>>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>>>>      FDR_NO_OFPROTO,         /* Bridge not found. */
>>>>      FDR_PURGE,              /* User requested flow deletion. */
>>>> @@ -2557,6 +2559,10 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)
>>>
>>>
>>> The (op->dop.error) condition below will never be reached, as it’s explicitly checked above. So the error message will never happen. The condition you explain is already covered by UKEY_INCONSISTENT. Not sure if we need a log message for this either at warn level, maybe debug, especially as you say it can happen often.
>>>
>>
>> right. as replied above. I didn't notice this when I did the cherry-pick as git didn't
>> show me a conflict in this area at all.
>> This commit was done before the change so we did hit this spot. we got little off
>> checking the commit and sending it on time for review. sorry about that.
>> cf11766cbcf1 ofproto-dpif-upcall: Fix push_dp_ops to handle all errors.
>>
>> I think we should redo our checks with the commit adding the UKEY_INCONSISTENT
>> state and see our result again.
>>
>> thanks for the review.
>>
>>
>>>>          if (op->dop.type != DPIF_OP_FLOW_DEL) {
>>>>              /* Only deleted flows need their stats pushed. */
>>>> +            if (op->dop.error) {
>>>> +                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
>>>> +                             "%p", op->dop.error, op->dop.type, op->ukey);
>>>> +            }
>>>>              continue;
>>>>          }
>>>>
>>>> @@ -3027,6 +3033,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge)
>>>>                      del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
>>>>                  } else if (!seq_mismatch) {
>>>>                      result = UKEY_KEEP;
>>>> +                } else if (!ukey->stats.used &&
>>>> +                           udpif_flow_time_delta(udpif, ukey) * 1000 >
>>>> +                           ofproto_max_idle) {
>>>
>>> In theory, this is a missed dp flow, i.e. the same as the else {} case below.
>>> Maybe we should change revalidate_ukey() to have a test for this?
>>>
>>>
>>> Maybe something like this (not tested):
>>>
>>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>>> index e9520ebdf..c34978fc9 100644
>>> --- a/ofproto/ofproto-dpif-upcall.c
>>> +++ b/ofproto/ofproto-dpif-upcall.c
>>> @@ -279,7 +279,7 @@ enum flow_del_reason {
>>>      FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
>>>      FDR_FLOW_IDLE,          /* Flow idle timeout. */
>>>      FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
>>> -    FDR_FLOW_STALE,         /* Flow stale detected. */
>>> +    FDR_FLOW_NO_STATS_IDLE, /* Flow idled out without receiving statistics. */
>>>      FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
>>>      FDR_NO_OFPROTO,         /* Bridge not found. */
>>>      FDR_PURGE,              /* User requested flow deletion. */
>>> @@ -2452,7 +2452,14 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
>>>          log_unexpected_stats_jump(ukey, stats);
>>>      }
>>>
>>> -    if (need_revalidate) {
>>> +    if (!ukey->stats.used
>>> +        && ukey->created < udpif->dpif->current_ms - ofproto_max_idle) {
>>> +        /* If the flow has a 'used' value of 0, meaning no stats were received
>>> +         * for this flow, and the configured idle time has elapsed, it might
>>> +         * indicates a stale flow (i.e., a flow without an installed datapath
>>> +         * rule). In this case, it is safe to mark this ukey for deletion. */
>>> +        *del_reason = FDR_FLOW_NO_STATS_IDLE;
>>> +    } else if (need_revalidate) {
>>>          if (should_revalidate(udpif, ukey, push.n_packets)) {
>>>              if (!ukey->xcache) {
>>>                  ukey->xcache = xlate_cache_new();
>>>
>>>
>>> Others, thoughts?
>>>
>>>> +                    COVERAGE_INC(revalidate_missed_dp_flow_del);
>>>> +                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
>>>> +                                 "%p delta %llus", ukey,
>>>> +                                 udpif_flow_time_delta(udpif, ukey));
>>>> +                    result = UKEY_DELETE;
>>>> +                    del_reason = FDR_FLOW_STALE;
>>>>                  } else {
>>>>                      struct dpif_flow_stats stats;
>>>>                      COVERAGE_INC(revalidate_missed_dp_flow);
>>>> -- 
>>>> 2.21.0
>>>>
>>>> _______________________________________________
>>>> dev mailing list
>>>> dev@openvswitch.org
>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>>
diff mbox series

Patch

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index 83609ec62b63..e9520ebdf910 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -57,6 +57,7 @@  COVERAGE_DEFINE(dumped_inconsistent_flow);
 COVERAGE_DEFINE(dumped_new_flow);
 COVERAGE_DEFINE(handler_duplicate_upcall);
 COVERAGE_DEFINE(revalidate_missed_dp_flow);
+COVERAGE_DEFINE(revalidate_missed_dp_flow_del);
 COVERAGE_DEFINE(ukey_dp_change);
 COVERAGE_DEFINE(ukey_invalid_stat_reset);
 COVERAGE_DEFINE(ukey_replace_contention);
@@ -278,6 +279,7 @@  enum flow_del_reason {
     FDR_BAD_ODP_FIT,        /* Bad ODP flow fit. */
     FDR_FLOW_IDLE,          /* Flow idle timeout. */
     FDR_FLOW_LIMIT,         /* Kill all flows condition reached. */
+    FDR_FLOW_STALE,         /* Flow stale detected. */
     FDR_FLOW_WILDCARDED,    /* Flow needs a narrower wildcard mask. */
     FDR_NO_OFPROTO,         /* Bridge not found. */
     FDR_PURGE,              /* User requested flow deletion. */
@@ -2557,6 +2559,10 @@  push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops)
 
         if (op->dop.type != DPIF_OP_FLOW_DEL) {
             /* Only deleted flows need their stats pushed. */
+            if (op->dop.error) {
+                VLOG_WARN_RL(&rl, "push_dp_ops: error %d in op type %d, ukey "
+                             "%p", op->dop.error, op->dop.type, op->ukey);
+            }
             continue;
         }
 
@@ -3027,6 +3033,15 @@  revalidator_sweep__(struct revalidator *revalidator, bool purge)
                     del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL;
                 } else if (!seq_mismatch) {
                     result = UKEY_KEEP;
+                } else if (!ukey->stats.used &&
+                           udpif_flow_time_delta(udpif, ukey) * 1000 >
+                           ofproto_max_idle) {
+                    COVERAGE_INC(revalidate_missed_dp_flow_del);
+                    VLOG_WARN_RL(&rl, "revalidator_sweep__: Remove stale ukey "
+                                 "%p delta %llus", ukey,
+                                 udpif_flow_time_delta(udpif, ukey));
+                    result = UKEY_DELETE;
+                    del_reason = FDR_FLOW_STALE;
                 } else {
                     struct dpif_flow_stats stats;
                     COVERAGE_INC(revalidate_missed_dp_flow);