[ovs-dev,PATCHv3,2/2] xlate: add backward resubmit trace log.

Message ID 1519864348-57607-2-git-send-email-u9012063@gmail.com
State New
Headers show
Series
  • [ovs-dev,PATCHv3,1/2] xlate: auto ofproto trace when recursion too deep
Related show

Commit Message

William Tu March 1, 2018, 12:32 a.m.
When hitting max translation depth of 64, printing the full
ofproto trace is huge and consumes too much log.  The patch
adds a short trace log, which prints the top 3 table id and
its count of the 64 backward resubmits during the translation.

VMWare-BZ: #2054659
Signed-off-by: William Tu <u9012063@gmail.com>
---
 ofproto/ofproto-dpif-xlate.c | 45 ++++++++++++++++++++++++++++++++++++++++++--
 tests/ofproto-dpif.at        |  3 +++
 2 files changed, 46 insertions(+), 2 deletions(-)

Comments

Ben Pfaff March 6, 2018, 1:03 a.m. | #1
On Wed, Feb 28, 2018 at 04:32:28PM -0800, William Tu wrote:
> When hitting max translation depth of 64, printing the full
> ofproto trace is huge and consumes too much log.  The patch
> adds a short trace log, which prints the top 3 table id and
> its count of the 64 backward resubmits during the translation.
> 
> VMWare-BZ: #2054659
> Signed-off-by: William Tu <u9012063@gmail.com>

Thanks for working on this!  It will be useful from time to time when a
controller makes some kind of terrible mistake ;-)

I don't think that the counting logic was correct here.  It had tables
0, 1, and 2 hard-coded, but I have no reason to believe that backward
loops always occur through those tables.  I think that we need to keep
track of the tables and their frequencies independently.  Here is my
idea of how to do that.  What do you think?

--8<--------------------------cut here-------------------------->8--

From: William Tu <u9012063@gmail.com>
Date: Wed, 28 Feb 2018 16:32:28 -0800
Subject: [PATCH] xlate: add backward resubmit trace log.

When hitting max translation depth of 64, printing the full
ofproto trace is huge and consumes too much log.  The patch
adds a short trace log, which prints the top 3 table id and
its count of the 64 backward resubmits during the translation.

VMWare-BZ: #2054659
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
---
 ofproto/ofproto-dpif-xlate.c | 64 ++++++++++++++++++++++++++++++++++++++++++--
 tests/ofproto-dpif.at        |  5 ++++
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index cc450a896948..65ede267dd0e 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -237,6 +237,7 @@ struct xlate_ctx {
      */
     int depth;                  /* Current resubmit nesting depth. */
     int resubmits;              /* Total number of resubmits. */
+    uint8_t bw_resubmit[MAX_DEPTH]; /* Track backward resubmit table IDs. */
     bool in_group;              /* Currently translating ofgroup, if true. */
     bool in_action_set;         /* Currently translating action_set, if true. */
     bool in_packet_out;         /* Currently translating a packet_out msg, if
@@ -3912,8 +3913,11 @@ xlate_recursively(struct xlate_ctx *ctx, struct rule_dpif *rule,
         rule_dpif_credit_stats(rule, ctx->xin->resubmit_stats);
     }
 
-    ctx->resubmits++;
+    if (deepens) {
+        ctx->bw_resubmit[ctx->depth] = ctx->table_id;
+    }
 
+    ctx->resubmits++;
     ctx->depth += deepens;
     ctx->rule = rule;
     ctx->rule_cookie = rule->up.flow_cookie;
@@ -3925,11 +3929,67 @@ xlate_recursively(struct xlate_ctx *ctx, struct rule_dpif *rule,
     ctx->depth -= deepens;
 }
 
+struct table_count {
+    uint8_t table;
+    uint8_t count;
+};
+
+static int
+compare_table_count(const void *a_, const void *b_)
+{
+    const struct table_count *a = a_;
+    const struct table_count *b = b_;
+
+    if (a->count != b->count) {
+        /* Sort higher counts earlier. */
+        return a->count < b->count ? 1 : -1;
+    } else {
+        /* Break ties by sorting lower-numbered tables earlier. */
+        return a->table < b->table ? -1 : 1;
+    }
+}
+
+static struct ds
+format_bw_resubmit(uint8_t *bw_resubmit)
+{
+    struct ds output = DS_EMPTY_INITIALIZER;
+
+    struct table_count tc[256];
+    for (int i = 0; i < 256; i++) {
+        tc[i].table = i;
+        tc[i].count = 0;
+    }
+    for (int i = 0; i < MAX_DEPTH; i++) {
+        tc[bw_resubmit[i]].count++;
+    }
+    qsort(tc, 256, sizeof *tc, compare_table_count);
+
+    /* Report the largest. */
+    ds_put_format(&output, "translation looped backward %d times through "
+                  "table %d", tc[0].count, tc[0].table);
+    for (int i = 1; i <= 2; i++) {
+        if (!tc[i].count) {
+            break;
+        }
+        ds_put_format(&output, ", %d times through table %d",
+                      tc[i].count, tc[i].table);
+    }
+    if (tc[3].count) {
+        ds_put_format(&output, ", and additional times through other tables");
+    }
+    return output;
+}
+
 static bool
 xlate_resubmit_resource_check(struct xlate_ctx *ctx)
 {
     if (ctx->depth >= MAX_DEPTH) {
-        xlate_report_error(ctx, "over max translation depth %d", MAX_DEPTH);
+        struct ds output;
+
+        output = format_bw_resubmit(ctx->bw_resubmit);
+        xlate_report_error(ctx, "over max translation depth %d (%s)",
+                                MAX_DEPTH, ds_cstr(&output));
+        ds_destroy(&output);
         ctx->error = XLATE_RECURSION_TOO_DEEP;
     } else if (ctx->resubmits >= MAX_RESUBMITS) {
         xlate_report_error(ctx, "over %d resubmit actions", MAX_RESUBMITS);
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 48cbc672d204..d52cc0e09ccc 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -8494,6 +8494,11 @@ AT_CHECK([grep -c "^ *resubmit" ovs-vswitchd.log],
   [0], [66
 ])
 
+dnl Make sure the short backward table trace is present.
+dnl (It appears in the log twice, once in the trace and once outside.)
+AT_CHECK([grep -c "(translation looped backward 1 times through table 2, 1 times through table 3, 1 times through table 4, and additional times through other tables)" ovs-vswitchd.log], [0], [2
+])
+
 OVS_VSWITCHD_STOP(["/over max translation depth/d
 /ofproto_dpif_upcall/d"])
 AT_CLEANUP
Ben Pfaff March 6, 2018, 1:06 a.m. | #2
On Wed, Feb 28, 2018 at 04:32:28PM -0800, William Tu wrote:
> When hitting max translation depth of 64, printing the full
> ofproto trace is huge and consumes too much log.  The patch
> adds a short trace log, which prints the top 3 table id and
> its count of the 64 backward resubmits during the translation.
> 
> VMWare-BZ: #2054659
> Signed-off-by: William Tu <u9012063@gmail.com>

Thanks for the fix!  I applied this to master and branch-2.9.
William Tu March 6, 2018, 1:16 a.m. | #3
On Mon, Mar 5, 2018 at 5:06 PM, Ben Pfaff <blp@ovn.org> wrote:
> On Wed, Feb 28, 2018 at 04:32:28PM -0800, William Tu wrote:
>> When hitting max translation depth of 64, printing the full
>> ofproto trace is huge and consumes too much log.  The patch
>> adds a short trace log, which prints the top 3 table id and
>> its count of the 64 backward resubmits during the translation.
>>
>> VMWare-BZ: #2054659
>> Signed-off-by: William Tu <u9012063@gmail.com>
>
> Thanks for the fix!  I applied this to master and branch-2.9.

Thanks.
William

Patch

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index cc450a896948..f82fe6bf711f 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -237,6 +237,7 @@  struct xlate_ctx {
      */
     int depth;                  /* Current resubmit nesting depth. */
     int resubmits;              /* Total number of resubmits. */
+    uint8_t bw_resubmit[MAX_DEPTH]; /* Track backward resubmit table IDs. */
     bool in_group;              /* Currently translating ofgroup, if true. */
     bool in_action_set;         /* Currently translating action_set, if true. */
     bool in_packet_out;         /* Currently translating a packet_out msg, if
@@ -3912,8 +3913,11 @@  xlate_recursively(struct xlate_ctx *ctx, struct rule_dpif *rule,
         rule_dpif_credit_stats(rule, ctx->xin->resubmit_stats);
     }
 
-    ctx->resubmits++;
+    if (deepens) {
+        ctx->bw_resubmit[ctx->depth] = ctx->table_id;
+    }
 
+    ctx->resubmits++;
     ctx->depth += deepens;
     ctx->rule = rule;
     ctx->rule_cookie = rule->up.flow_cookie;
@@ -3925,11 +3929,48 @@  xlate_recursively(struct xlate_ctx *ctx, struct rule_dpif *rule,
     ctx->depth -= deepens;
 }
 
+static int
+compare_uint8(const void *a_, const void *b_)
+{
+    const uint8_t *a = a_;
+    const uint8_t *b = b_;
+    return *a > *b ? -1 : *a < *b;
+}
+
+static struct ds
+format_bw_resubmit(uint8_t *bw_resubmit)
+{
+    int i;
+    uint8_t oftable[256];
+    struct ds output;
+
+    ds_init(&output);
+    memset(oftable, 0x0, sizeof oftable);
+
+    for (i = 0; i < MAX_DEPTH; i++) {
+        uint8_t tab_id = bw_resubmit[i];
+        oftable[tab_id]++;
+    }
+    qsort(oftable, 256, sizeof(uint8_t), compare_uint8);
+
+    /* find the largest */
+    ds_put_format(&output, "(%d,%d),", 0, oftable[0]);
+    ds_put_format(&output, "(%d,%d),", 1, oftable[1]);
+    ds_put_format(&output, "(%d,%d)", 2, oftable[2]);
+    return output;
+}
+
 static bool
 xlate_resubmit_resource_check(struct xlate_ctx *ctx)
 {
     if (ctx->depth >= MAX_DEPTH) {
-        xlate_report_error(ctx, "over max translation depth %d", MAX_DEPTH);
+        struct ds output;
+
+        output = format_bw_resubmit(ctx->bw_resubmit);
+        xlate_report_error(ctx, "over max translation depth %d, "
+                                "top 3 (table_id,count): [%s]",
+                                MAX_DEPTH, ds_cstr(&output));
+        ds_destroy(&output);
         ctx->error = XLATE_RECURSION_TOO_DEEP;
     } else if (ctx->resubmits >= MAX_RESUBMITS) {
         xlate_report_error(ctx, "over %d resubmit actions", MAX_RESUBMITS);
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index fd78c5d9b593..3776eca75e99 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -8494,6 +8494,9 @@  AT_CHECK([grep -c "^ *resubmit" ovs-vswitchd.log],
   [0], [66
 ])
 
+dnl make sure the short backward table trace is present
+AT_CHECK([egrep "\(table_id,count\): \[[\(0,1\),\(1,1\),\(2,1\)\]]" ovs-vswitchd.log], [0], [stdout])
+
 OVS_VSWITCHD_STOP(["/over max translation depth/d
 /ofproto_dpif_upcall/d"])
 AT_CLEANUP