Message ID | 1523888788-20288-3-git-send-email-jan.scheurich@ericsson.com |
---|---|
State | Changes Requested |
Headers | show |
Series | Use improved dp_hash select group by default | expand |
Hi, Jan: I think the following code should also be modified + for (int hash = 0; hash < n_hash; hash++) { + double max_val = 0.0; + struct webster *winner; + for (i = 0; i < n_buckets; i++) { + if (webster[i].value > max_val) { =======================> if bucket->weight=0, and there is only one bucket with weight equal to 0, then winner will be null + max_val = webster[i].value; + winner = &webster[i]; + } + } Test like this command: ovs-ofctl add-group br-int -O openflow15 "group_id=2,type=select,selection_method=dp_hash,bucket=bucket_id=1,weight=0,actions=output:10" vswitchd crashed after command put. At 2018-04-16 22:26:27, "Jan Scheurich" <jan.scheurich@ericsson.com> wrote: >The current implementation of the "dp_hash" selection method suffers >from two deficiences: 1. The hash mask and hence the number of dp_hash >values is just large enough to cover the number of group buckets, but >does not consider the case that buckets have different weights. 2. The >xlate-time selection of best bucket from the masked dp_hash value often >results in bucket load distributions that are quite different from the >bucket weights because the number of available masked dp_hash values >is too small (2-6 bits compared to 32 bits of a full hash in the default >hash selection method). > >This commit provides a more accurate implementation of the dp_hash >select group by applying the well known Webster method for distributing >a small number of "seats" fairly over the weighted "parties" >(see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). >The dp_hash mask is autmatically chosen large enough to provide good >enough accuracy even with widely differing weights. > >This distribution happens at group modification time and the resulting >table is stored with the group-dpif struct. At xlation time, we use the >masked dp_hash values as index to look up the assigned bucket. > >If the bucket should not be live, we do a circular search over the >mapping table until we find the first live bucket. As the buckets in >the table are by construction in pseudo-random order with a frequency >according to their weight, this method maintains correct distribution >even if one or more buckets are non-live. > >Xlation is further simplified by storing some derived select group state >at group construction in struct group-dpif in a form better suited for >xlation purposes. > >Adapted the unit test case for dp_hash select group accordingly. > >Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> >Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> >Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> >--- > include/openvswitch/ofp-group.h | 1 + > ofproto/ofproto-dpif-xlate.c | 74 +++++++++++++------- > ofproto/ofproto-dpif.c | 146 ++++++++++++++++++++++++++++++++++++++++ > ofproto/ofproto-dpif.h | 13 ++++ > tests/ofproto-dpif.at | 18 +++-- > 5 files changed, 221 insertions(+), 31 deletions(-) > >diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h >index 8d893a5..af4033d 100644 >--- a/include/openvswitch/ofp-group.h >+++ b/include/openvswitch/ofp-group.h >@@ -47,6 +47,7 @@ struct bucket_counter { > /* Bucket for use in groups. */ > struct ofputil_bucket { > struct ovs_list list_node; >+ uint16_t aux; /* Padding. Also used for temporary data. */ > uint16_t weight; /* Relative weight, for "select" groups. */ > ofp_port_t watch_port; /* Port whose state affects whether this bucket > * is live. Only required for fast failover >diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c >index c8baba1..df245c5 100644 >--- a/ofproto/ofproto-dpif-xlate.c >+++ b/ofproto/ofproto-dpif-xlate.c >@@ -4235,35 +4235,55 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group, > } > } > >+static struct ofputil_bucket * >+group_dp_hash_best_bucket(struct xlate_ctx *ctx, >+ const struct group_dpif *group, >+ uint32_t dp_hash) >+{ >+ struct ofputil_bucket *bucket, *best_bucket = NULL; >+ uint32_t n_hash = group->hash_mask + 1; >+ >+ uint32_t hash = dp_hash &= group->hash_mask; >+ ctx->wc->masks.dp_hash |= group->hash_mask; >+ >+ /* Starting from the original masked dp_hash value iterate over the >+ * hash mapping table to find the first live bucket. As the buckets >+ * are quasi-randomly spread over the hash values, this maintains >+ * a distribution according to bucket weights even when some buckets >+ * are non-live. */ >+ for (int i = 0; i < n_hash; i++) { >+ bucket = group->hash_map[(hash + i) % n_hash]; >+ if (bucket_is_alive(ctx, bucket, 0)) { >+ best_bucket = bucket; >+ break; >+ } >+ } >+ >+ return best_bucket; >+} >+ > static void > xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group, > bool is_last_action) > { >- struct ofputil_bucket *bucket; >- > /* dp_hash value 0 is special since it means that the dp_hash has not been > * computed, as all computed dp_hash values are non-zero. Therefore > * compare to zero can be used to decide if the dp_hash value is valid > * without masking the dp_hash field. */ > if (!ctx->xin->flow.dp_hash) { >- uint64_t param = group->up.props.selection_method_param; >- >- ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param); >+ ctx_trigger_recirculate_with_hash(ctx, group->hash_alg, >+ group->hash_basis); >+ if (ctx->xin->xcache) { >+ ofproto_group_unref(&group->up); >+ } > } else { >- uint32_t n_buckets = group->up.n_buckets; >- if (n_buckets) { >- /* Minimal mask to cover the number of buckets. */ >- uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1; >- /* Multiplier chosen to make the trivial 1 bit case to >- * actually distribute amongst two equal weight buckets. */ >- uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask); >- >- ctx->wc->masks.dp_hash |= mask; >- bucket = group_best_live_bucket(ctx, group, basis); >- if (bucket) { >- xlate_group_bucket(ctx, bucket, is_last_action); >- xlate_group_stats(ctx, group, bucket); >- } >+ struct ofputil_bucket *bucket = >+ group_dp_hash_best_bucket(ctx, group, ctx->xin->flow.dp_hash); >+ if (bucket) { >+ xlate_group_bucket(ctx, bucket, is_last_action); >+ xlate_group_stats(ctx, group, bucket); >+ } else if (ctx->xin->xcache) { >+ ofproto_group_unref(&group->up); > } > } > } >@@ -4272,8 +4292,6 @@ static void > xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group, > bool is_last_action) > { >- const char *selection_method = group->up.props.selection_method; >- > /* Select groups may access flow keys beyond L2 in order to > * select a bucket. Recirculate as appropriate to make this possible. > */ >@@ -4281,15 +4299,19 @@ xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group, > ctx_trigger_freeze(ctx); > } > >- if (selection_method[0] == '\0') { >+ switch (group->selection_method) { >+ case SEL_METHOD_DEFAULT: > xlate_default_select_group(ctx, group, is_last_action); >- } else if (!strcasecmp("hash", selection_method)) { >+ break; >+ case SEL_METHOD_HASH: > xlate_hash_fields_select_group(ctx, group, is_last_action); >- } else if (!strcasecmp("dp_hash", selection_method)) { >+ break; >+ case SEL_METHOD_DP_HASH: > xlate_dp_hash_select_group(ctx, group, is_last_action); >- } else { >- /* Parsing of groups should ensure this never happens */ >+ break; >+ default: > OVS_NOT_REACHED(); >+ break; > } > } > >diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c >index 1ed82d0..e1a5c09 100644 >--- a/ofproto/ofproto-dpif.c >+++ b/ofproto/ofproto-dpif.c >@@ -32,6 +32,7 @@ > #include "lacp.h" > #include "learn.h" > #include "mac-learning.h" >+#include "math.h" > #include "mcast-snooping.h" > #include "multipath.h" > #include "netdev-vport.h" >@@ -4717,6 +4718,143 @@ group_dpif_credit_stats(struct group_dpif *group, > ovs_mutex_unlock(&group->stats_mutex); > } > >+/* Calculate the dp_hash mask needed to provide the least weighted bucket >+ * with at least one hash value and construct a mapping table from masked >+ * dp_hash value to group bucket using the Webster method. >+ * If the caller specifies a non-zero max_hash value, abort and return false >+ * if more hash values would be required. The absolute maximum number of >+ * hash values supported is 256. */ >+ >+#define MAX_SELECT_GROUP_HASH_VALUES 256 >+ >+static bool >+group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash) >+{ >+ struct ofputil_bucket *bucket; >+ uint32_t n_buckets = group->up.n_buckets; >+ double total_weight = 0.0; >+ uint16_t min_weight = UINT16_MAX; >+ uint32_t n_hash; >+ struct webster { >+ struct ofputil_bucket *bucket; >+ uint32_t divisor; >+ double value; >+ } webster[group->up.n_buckets]; >+ >+ if (n_buckets == 0) { >+ VLOG_DBG(" Don't apply dp_hash method without buckets"); >+ return false; >+ } >+ >+ int i = 0; >+ LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { >+ bucket->aux = 0; >+ if (bucket->weight > 0 && bucket->weight < min_weight) { >+ min_weight = bucket->weight; >+ } >+ total_weight += bucket->weight; >+ webster[i].bucket = bucket; >+ webster[i].divisor = 1; >+ webster[i].value = bucket->weight; >+ i++; >+ } >+ >+ VLOG_DBG(" Minimum weight: %d, total weight: %.0f", >+ min_weight, total_weight); >+ >+ uint32_t min_slots = ceil(total_weight / min_weight); >+ n_hash = MAX(16, 1L << log_2_ceil(min_slots)); >+ >+ if (n_hash > MAX_SELECT_GROUP_HASH_VALUES || >+ (max_hash != 0 && n_hash > max_hash)) { >+ VLOG_DBG(" Too many hash values required: %d", n_hash); >+ return false; >+ } >+ >+ VLOG_DBG(" Using %d hash values:", n_hash); >+ group->hash_mask = n_hash - 1; >+ if (group->hash_map) { >+ free(group->hash_map); >+ } >+ group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *)); >+ >+ /* Use Webster method to distribute hash values over buckets. */ >+ for (int hash = 0; hash < n_hash; hash++) { >+ double max_val = 0.0; >+ struct webster *winner; >+ for (i = 0; i < n_buckets; i++) { >+ if (webster[i].value > max_val) { >+ max_val = webster[i].value; >+ winner = &webster[i]; >+ } >+ } >+#pragma GCC diagnostic push >+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" >+ /* winner is a reference to a webster[] element initialized above. */ >+ winner->divisor += 2; >+ winner->value = (double) winner->bucket->weight / winner->divisor; >+ group->hash_map[hash] = winner->bucket; >+ winner->bucket->aux++; >+#pragma GCC diagnostic pop >+ } >+ >+ LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { >+ double target = (n_hash * bucket->weight) / total_weight; >+ VLOG_DBG(" Bucket %d: weight=%d, target=%.2f hits=%d", >+ bucket->bucket_id, bucket->weight, >+ target, bucket->aux); >+ } >+ >+ return true; >+} >+ >+static void >+group_set_selection_method(struct group_dpif *group) >+{ >+ struct ofputil_group_props *props = &group->up.props; >+ char *selection_method = props->selection_method; >+ >+ if (selection_method[0] == '\0') { >+ VLOG_INFO("No selection method specified."); >+ group->selection_method = SEL_METHOD_DEFAULT; >+ >+ } else if (!strcmp(selection_method, "dp_hash")) { >+ VLOG_INFO("Selection method specified: dp_hash."); >+ /* Try to use dp_hash if possible at all. */ >+ if (group_setup_dp_hash_table(group, 0)) { >+ group->selection_method = SEL_METHOD_DP_HASH; >+ group->hash_alg = props->selection_method_param >> 32; >+ if (group->hash_alg >= __OVS_HASH_MAX) { >+ VLOG_INFO(" Invalid dp_hash algorithm %d. " >+ "Defaulting to OVS_HASH_ALG_L4", group->hash_alg); >+ group->hash_alg = OVS_HASH_ALG_L4; >+ } >+ group->hash_basis = (uint32_t) props->selection_method_param; >+ } else { >+ /* Fall back to original default hashing in slow path. */ >+ VLOG_INFO(" Falling back to default hash method."); >+ group->selection_method = SEL_METHOD_DEFAULT; >+ } >+ } else if (!strcmp(selection_method, "hash")) { >+ VLOG_INFO("Selection method specified: hash."); >+ if (props->fields.values_size > 0) { >+ /* Controller has specified hash fields. */ >+ struct ds s = DS_EMPTY_INITIALIZER; >+ oxm_format_field_array(&s, &props->fields); >+ VLOG_INFO(" Hash fields: %s", ds_cstr(&s)); >+ ds_destroy(&s); >+ group->selection_method = SEL_METHOD_HASH; >+ } else { >+ /* No hash fields. Fall back to original default hashing. */ >+ VLOG_INFO(" No hash fields. Falling back to default hash method."); >+ group->selection_method = SEL_METHOD_DEFAULT; >+ } >+ } else { >+ /* Parsing of groups should ensure this never happens */ >+ OVS_NOT_REACHED(); >+ } >+} >+ > static enum ofperr > group_construct(struct ofgroup *group_) > { >@@ -4725,6 +4863,10 @@ group_construct(struct ofgroup *group_) > ovs_mutex_init_adaptive(&group->stats_mutex); > ovs_mutex_lock(&group->stats_mutex); > group_construct_stats(group); >+ group->hash_map = NULL; >+ if (group->up.type == OFPGT11_SELECT) { >+ group_set_selection_method(group); >+ } > ovs_mutex_unlock(&group->stats_mutex); > return 0; > } >@@ -4734,6 +4876,10 @@ group_destruct(struct ofgroup *group_) > { > struct group_dpif *group = group_dpif_cast(group_); > ovs_mutex_destroy(&group->stats_mutex); >+ if (group->hash_map) { >+ free(group->hash_map); >+ group->hash_map = NULL; >+ } > } > > static enum ofperr >diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h >index 47bf7f9..880dd3d 100644 >--- a/ofproto/ofproto-dpif.h >+++ b/ofproto/ofproto-dpif.h >@@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule) > > /* Groups. */ > >+enum group_selection_method { >+ SEL_METHOD_DEFAULT, >+ SEL_METHOD_DP_HASH, >+ SEL_METHOD_HASH, >+}; >+ > struct group_dpif { > struct ofgroup up; > >@@ -129,6 +135,12 @@ struct group_dpif { > struct ovs_mutex stats_mutex; > uint64_t packet_count OVS_GUARDED; /* Number of packets received. */ > uint64_t byte_count OVS_GUARDED; /* Number of bytes received. */ >+ >+ enum group_selection_method selection_method; >+ enum ovs_hash_alg hash_alg; /* dp_hash algorithm to be applied. */ >+ uint32_t hash_basis; /* Basis for dp_hash. */ >+ uint32_t hash_mask; /* Used to mask dp_hash (2^N - 1).*/ >+ struct ofputil_bucket **hash_map; /* Map hash values to buckets. */ > }; > > void group_dpif_credit_stats(struct group_dpif *, >@@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *, > struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, > uint32_t group_id, ovs_version_t version, > bool take_ref); >+ > > /* Backers. > * >diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at >index 60f28e2..6c20208 100644 >--- a/tests/ofproto-dpif.at >+++ b/tests/ofproto-dpif.at >@@ -500,11 +500,19 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do > AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) > done > >-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl >+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl > flow-dump from non-dpdk interfaces: > recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1) >-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 > ]) > > AT_CHECK([ovs-appctl revalidator/purge], [0]) >@@ -516,10 +524,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do > AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) > done > >-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl >+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl > flow-dump from non-dpdk interfaces: > recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2) >-recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X >+recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X > ]) > > OVS_VSWITCHD_STOP >-- >1.9.1
Hi Ychen,
Thank you for finding yet another corner case. I will fix it in the next version with the following incremental:
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 8f71083..674b3b5 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -4762,6 +4762,11 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
VLOG_DBG(" Minimum weight: %d, total weight: %.0f",
min_weight, total_weight);
+ if (total_weight == 0) {
+ VLOG_DBG(" Total weight is zero. No active buckets.");
+ return false;
+ }
+
uint32_t min_slots = ceil(total_weight / min_weight);
n_hash = MAX(16, 1L << log_2_ceil(min_slots));
I would like to mention your contribution with Tested-By: tag in the commit messages. Would that be ok? What is your real name I should put?
BR, Jan
From: ychen [mailto:ychen103103@163.com]
Sent: Tuesday, 17 April, 2018 13:22
To: Jan Scheurich <jan.scheurich@ericsson.com>
Cc: dev@openvswitch.org; Nitin Katiyar <nitin.katiyar@ericsson.com>; blp@ovn.org
Subject: Re:[PATCH v2 2/3] ofproto-dpif: Improve dp_hash selection method for select groups
Hi, Jan:
I think the following code should also be modified
+ for (int hash = 0; hash < n_hash; hash++) {
+ double max_val = 0.0;
+ struct webster *winner;
+ for (i = 0; i < n_buckets; i++) {
+ if (webster[i].value > max_val) { =======================> if bucket->weight=0, and there is only one bucket with weight equal to 0, then winner will be null
+ max_val = webster[i].value;
+ winner = &webster[i];
+ }
+ }
Test like this command:
ovs-ofctl add-group br-int -O openflow15 "group_id=2,type=select,selection_method=dp_hash,bucket=bucket_id=1,weight=0,actions=output:10"
vswitchd crashed after command put.
On Mon, Apr 16, 2018 at 04:26:27PM +0200, Jan Scheurich wrote: > The current implementation of the "dp_hash" selection method suffers > from two deficiences: 1. The hash mask and hence the number of dp_hash > values is just large enough to cover the number of group buckets, but > does not consider the case that buckets have different weights. 2. The > xlate-time selection of best bucket from the masked dp_hash value often > results in bucket load distributions that are quite different from the > bucket weights because the number of available masked dp_hash values > is too small (2-6 bits compared to 32 bits of a full hash in the default > hash selection method). Clang gives me the following errors: ../ofproto/ofproto-dpif.c:4792:32: error: unknown warning group '-Wmaybe-uninitialized', ignored [-Werror,-Wunknown-pragmas] ../ofproto/ofproto-dpif.c:4814:33: error: initializing 'struct ofputil_group_props *' with an expression of type 'const struct ofputil_group_props *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] "sparse" gives me the following errors: ../ofproto/ofproto-dpif.c:4742:24: error: Variable length array is used. ../ofproto/ofproto-dpif.c:4814:42: error: incorrect type in initializer (different modifiers) ../ofproto/ofproto-dpif.c:4814:42: expected struct ofputil_group_props *props ../ofproto/ofproto-dpif.c:4814:42: got struct ofputil_group_props const *<noident> Thanks, Ben.
On Tue, Apr 17, 2018 at 01:03:17PM -0700, Ben Pfaff wrote: > On Mon, Apr 16, 2018 at 04:26:27PM +0200, Jan Scheurich wrote: > > The current implementation of the "dp_hash" selection method suffers > > from two deficiences: 1. The hash mask and hence the number of dp_hash > > values is just large enough to cover the number of group buckets, but > > does not consider the case that buckets have different weights. 2. The > > xlate-time selection of best bucket from the masked dp_hash value often > > results in bucket load distributions that are quite different from the > > bucket weights because the number of available masked dp_hash values > > is too small (2-6 bits compared to 32 bits of a full hash in the default > > hash selection method). > > Clang gives me the following errors: > > ../ofproto/ofproto-dpif.c:4792:32: error: unknown warning group '-Wmaybe-uninitialized', ignored [-Werror,-Wunknown-pragmas] > ../ofproto/ofproto-dpif.c:4814:33: error: initializing 'struct ofputil_group_props *' with an expression of type 'const struct ofputil_group_props *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] How about this approach, which should cleanly eliminate the warning? diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index e1a5c097f3aa..362339a4abb4 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -4780,22 +4780,17 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash) /* Use Webster method to distribute hash values over buckets. */ for (int hash = 0; hash < n_hash; hash++) { - double max_val = 0.0; - struct webster *winner; - for (i = 0; i < n_buckets; i++) { - if (webster[i].value > max_val) { - max_val = webster[i].value; + struct webster *winner = &webster[0]; + for (i = 1; i < n_buckets; i++) { + if (webster[i].value > winner->value) { winner = &webster[i]; } } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" /* winner is a reference to a webster[] element initialized above. */ winner->divisor += 2; winner->value = (double) winner->bucket->weight / winner->divisor; group->hash_map[hash] = winner->bucket; winner->bucket->aux++; -#pragma GCC diagnostic pop } LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
> How about this approach, which should cleanly eliminate the warning? > > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c > index e1a5c097f3aa..362339a4abb4 100644 > --- a/ofproto/ofproto-dpif.c > +++ b/ofproto/ofproto-dpif.c > @@ -4780,22 +4780,17 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash) > > /* Use Webster method to distribute hash values over buckets. */ > for (int hash = 0; hash < n_hash; hash++) { > - double max_val = 0.0; > - struct webster *winner; > - for (i = 0; i < n_buckets; i++) { > - if (webster[i].value > max_val) { > - max_val = webster[i].value; > + struct webster *winner = &webster[0]; > + for (i = 1; i < n_buckets; i++) { > + if (webster[i].value > winner->value) { > winner = &webster[i]; > } > } > -#pragma GCC diagnostic push > -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" > /* winner is a reference to a webster[] element initialized above. */ > winner->divisor += 2; > winner->value = (double) winner->bucket->weight / winner->divisor; > group->hash_map[hash] = winner->bucket; > winner->bucket->aux++; > -#pragma GCC diagnostic pop > } Thank you, Ben, for your thorough checks. Yes, your approach is better and compiles w/o warnings. Regards, Jan
diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h index 8d893a5..af4033d 100644 --- a/include/openvswitch/ofp-group.h +++ b/include/openvswitch/ofp-group.h @@ -47,6 +47,7 @@ struct bucket_counter { /* Bucket for use in groups. */ struct ofputil_bucket { struct ovs_list list_node; + uint16_t aux; /* Padding. Also used for temporary data. */ uint16_t weight; /* Relative weight, for "select" groups. */ ofp_port_t watch_port; /* Port whose state affects whether this bucket * is live. Only required for fast failover diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index c8baba1..df245c5 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -4235,35 +4235,55 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group, } } +static struct ofputil_bucket * +group_dp_hash_best_bucket(struct xlate_ctx *ctx, + const struct group_dpif *group, + uint32_t dp_hash) +{ + struct ofputil_bucket *bucket, *best_bucket = NULL; + uint32_t n_hash = group->hash_mask + 1; + + uint32_t hash = dp_hash &= group->hash_mask; + ctx->wc->masks.dp_hash |= group->hash_mask; + + /* Starting from the original masked dp_hash value iterate over the + * hash mapping table to find the first live bucket. As the buckets + * are quasi-randomly spread over the hash values, this maintains + * a distribution according to bucket weights even when some buckets + * are non-live. */ + for (int i = 0; i < n_hash; i++) { + bucket = group->hash_map[(hash + i) % n_hash]; + if (bucket_is_alive(ctx, bucket, 0)) { + best_bucket = bucket; + break; + } + } + + return best_bucket; +} + static void xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group, bool is_last_action) { - struct ofputil_bucket *bucket; - /* dp_hash value 0 is special since it means that the dp_hash has not been * computed, as all computed dp_hash values are non-zero. Therefore * compare to zero can be used to decide if the dp_hash value is valid * without masking the dp_hash field. */ if (!ctx->xin->flow.dp_hash) { - uint64_t param = group->up.props.selection_method_param; - - ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param); + ctx_trigger_recirculate_with_hash(ctx, group->hash_alg, + group->hash_basis); + if (ctx->xin->xcache) { + ofproto_group_unref(&group->up); + } } else { - uint32_t n_buckets = group->up.n_buckets; - if (n_buckets) { - /* Minimal mask to cover the number of buckets. */ - uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1; - /* Multiplier chosen to make the trivial 1 bit case to - * actually distribute amongst two equal weight buckets. */ - uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask); - - ctx->wc->masks.dp_hash |= mask; - bucket = group_best_live_bucket(ctx, group, basis); - if (bucket) { - xlate_group_bucket(ctx, bucket, is_last_action); - xlate_group_stats(ctx, group, bucket); - } + struct ofputil_bucket *bucket = + group_dp_hash_best_bucket(ctx, group, ctx->xin->flow.dp_hash); + if (bucket) { + xlate_group_bucket(ctx, bucket, is_last_action); + xlate_group_stats(ctx, group, bucket); + } else if (ctx->xin->xcache) { + ofproto_group_unref(&group->up); } } } @@ -4272,8 +4292,6 @@ static void xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group, bool is_last_action) { - const char *selection_method = group->up.props.selection_method; - /* Select groups may access flow keys beyond L2 in order to * select a bucket. Recirculate as appropriate to make this possible. */ @@ -4281,15 +4299,19 @@ xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group, ctx_trigger_freeze(ctx); } - if (selection_method[0] == '\0') { + switch (group->selection_method) { + case SEL_METHOD_DEFAULT: xlate_default_select_group(ctx, group, is_last_action); - } else if (!strcasecmp("hash", selection_method)) { + break; + case SEL_METHOD_HASH: xlate_hash_fields_select_group(ctx, group, is_last_action); - } else if (!strcasecmp("dp_hash", selection_method)) { + break; + case SEL_METHOD_DP_HASH: xlate_dp_hash_select_group(ctx, group, is_last_action); - } else { - /* Parsing of groups should ensure this never happens */ + break; + default: OVS_NOT_REACHED(); + break; } } diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 1ed82d0..e1a5c09 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -32,6 +32,7 @@ #include "lacp.h" #include "learn.h" #include "mac-learning.h" +#include "math.h" #include "mcast-snooping.h" #include "multipath.h" #include "netdev-vport.h" @@ -4717,6 +4718,143 @@ group_dpif_credit_stats(struct group_dpif *group, ovs_mutex_unlock(&group->stats_mutex); } +/* Calculate the dp_hash mask needed to provide the least weighted bucket + * with at least one hash value and construct a mapping table from masked + * dp_hash value to group bucket using the Webster method. + * If the caller specifies a non-zero max_hash value, abort and return false + * if more hash values would be required. The absolute maximum number of + * hash values supported is 256. */ + +#define MAX_SELECT_GROUP_HASH_VALUES 256 + +static bool +group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash) +{ + struct ofputil_bucket *bucket; + uint32_t n_buckets = group->up.n_buckets; + double total_weight = 0.0; + uint16_t min_weight = UINT16_MAX; + uint32_t n_hash; + struct webster { + struct ofputil_bucket *bucket; + uint32_t divisor; + double value; + } webster[group->up.n_buckets]; + + if (n_buckets == 0) { + VLOG_DBG(" Don't apply dp_hash method without buckets"); + return false; + } + + int i = 0; + LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { + bucket->aux = 0; + if (bucket->weight > 0 && bucket->weight < min_weight) { + min_weight = bucket->weight; + } + total_weight += bucket->weight; + webster[i].bucket = bucket; + webster[i].divisor = 1; + webster[i].value = bucket->weight; + i++; + } + + VLOG_DBG(" Minimum weight: %d, total weight: %.0f", + min_weight, total_weight); + + uint32_t min_slots = ceil(total_weight / min_weight); + n_hash = MAX(16, 1L << log_2_ceil(min_slots)); + + if (n_hash > MAX_SELECT_GROUP_HASH_VALUES || + (max_hash != 0 && n_hash > max_hash)) { + VLOG_DBG(" Too many hash values required: %d", n_hash); + return false; + } + + VLOG_DBG(" Using %d hash values:", n_hash); + group->hash_mask = n_hash - 1; + if (group->hash_map) { + free(group->hash_map); + } + group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *)); + + /* Use Webster method to distribute hash values over buckets. */ + for (int hash = 0; hash < n_hash; hash++) { + double max_val = 0.0; + struct webster *winner; + for (i = 0; i < n_buckets; i++) { + if (webster[i].value > max_val) { + max_val = webster[i].value; + winner = &webster[i]; + } + } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + /* winner is a reference to a webster[] element initialized above. */ + winner->divisor += 2; + winner->value = (double) winner->bucket->weight / winner->divisor; + group->hash_map[hash] = winner->bucket; + winner->bucket->aux++; +#pragma GCC diagnostic pop + } + + LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { + double target = (n_hash * bucket->weight) / total_weight; + VLOG_DBG(" Bucket %d: weight=%d, target=%.2f hits=%d", + bucket->bucket_id, bucket->weight, + target, bucket->aux); + } + + return true; +} + +static void +group_set_selection_method(struct group_dpif *group) +{ + struct ofputil_group_props *props = &group->up.props; + char *selection_method = props->selection_method; + + if (selection_method[0] == '\0') { + VLOG_INFO("No selection method specified."); + group->selection_method = SEL_METHOD_DEFAULT; + + } else if (!strcmp(selection_method, "dp_hash")) { + VLOG_INFO("Selection method specified: dp_hash."); + /* Try to use dp_hash if possible at all. */ + if (group_setup_dp_hash_table(group, 0)) { + group->selection_method = SEL_METHOD_DP_HASH; + group->hash_alg = props->selection_method_param >> 32; + if (group->hash_alg >= __OVS_HASH_MAX) { + VLOG_INFO(" Invalid dp_hash algorithm %d. " + "Defaulting to OVS_HASH_ALG_L4", group->hash_alg); + group->hash_alg = OVS_HASH_ALG_L4; + } + group->hash_basis = (uint32_t) props->selection_method_param; + } else { + /* Fall back to original default hashing in slow path. */ + VLOG_INFO(" Falling back to default hash method."); + group->selection_method = SEL_METHOD_DEFAULT; + } + } else if (!strcmp(selection_method, "hash")) { + VLOG_INFO("Selection method specified: hash."); + if (props->fields.values_size > 0) { + /* Controller has specified hash fields. */ + struct ds s = DS_EMPTY_INITIALIZER; + oxm_format_field_array(&s, &props->fields); + VLOG_INFO(" Hash fields: %s", ds_cstr(&s)); + ds_destroy(&s); + group->selection_method = SEL_METHOD_HASH; + } else { + /* No hash fields. Fall back to original default hashing. */ + VLOG_INFO(" No hash fields. Falling back to default hash method."); + group->selection_method = SEL_METHOD_DEFAULT; + } + } else { + /* Parsing of groups should ensure this never happens */ + OVS_NOT_REACHED(); + } +} + static enum ofperr group_construct(struct ofgroup *group_) { @@ -4725,6 +4863,10 @@ group_construct(struct ofgroup *group_) ovs_mutex_init_adaptive(&group->stats_mutex); ovs_mutex_lock(&group->stats_mutex); group_construct_stats(group); + group->hash_map = NULL; + if (group->up.type == OFPGT11_SELECT) { + group_set_selection_method(group); + } ovs_mutex_unlock(&group->stats_mutex); return 0; } @@ -4734,6 +4876,10 @@ group_destruct(struct ofgroup *group_) { struct group_dpif *group = group_dpif_cast(group_); ovs_mutex_destroy(&group->stats_mutex); + if (group->hash_map) { + free(group->hash_map); + group->hash_map = NULL; + } } static enum ofperr diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index 47bf7f9..880dd3d 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule) /* Groups. */ +enum group_selection_method { + SEL_METHOD_DEFAULT, + SEL_METHOD_DP_HASH, + SEL_METHOD_HASH, +}; + struct group_dpif { struct ofgroup up; @@ -129,6 +135,12 @@ struct group_dpif { struct ovs_mutex stats_mutex; uint64_t packet_count OVS_GUARDED; /* Number of packets received. */ uint64_t byte_count OVS_GUARDED; /* Number of bytes received. */ + + enum group_selection_method selection_method; + enum ovs_hash_alg hash_alg; /* dp_hash algorithm to be applied. */ + uint32_t hash_basis; /* Basis for dp_hash. */ + uint32_t hash_mask; /* Used to mask dp_hash (2^N - 1).*/ + struct ofputil_bucket **hash_map; /* Map hash values to buckets. */ }; void group_dpif_credit_stats(struct group_dpif *, @@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *, struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, uint32_t group_id, ovs_version_t version, bool take_ref); + /* Backers. * diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 60f28e2..6c20208 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -500,11 +500,19 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) done -AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl flow-dump from non-dpdk interfaces: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1) -recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 -recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11 ]) AT_CHECK([ovs-appctl revalidator/purge], [0]) @@ -516,10 +524,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) done -AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl flow-dump from non-dpdk interfaces: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2) -recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X +recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X ]) OVS_VSWITCHD_STOP