Message ID | 20230713143810.347542-1-dceara@redhat.com |
---|---|
State | Accepted |
Headers | show |
Series | [ovs-dev] ovn-controller: Detect and use L4_SYM dp-hash if available. | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | success | github build: passed |
ovsrobot/github-robot-_ovn-kubernetes | success | github build: passed |
On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote: > Regular dp-hash is not a canonical L4 hash (at least with the netlink > datapath). If the datapath supports l4 symmetrical dp-hash use that one > instead. > > Reported-at: https://github.com/ovn-org/ovn/issues/112 > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 > Signed-off-by: Dumitru Ceara <dceara@redhat.com> > Hi Dumitru, > --- > include/ovn/features.h | 2 ++ > lib/actions.c | 6 ++++++ > lib/features.c | 49 +++++++++++++++++++++++++++++++++--------- > 3 files changed, 47 insertions(+), 10 deletions(-) > > diff --git a/include/ovn/features.h b/include/ovn/features.h > index de8f1f5489..3bf536127f 100644 > --- a/include/ovn/features.h > +++ b/include/ovn/features.h > @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { > OVS_CT_ZERO_SNAT_SUPPORT_BIT, > OVS_DP_METER_SUPPORT_BIT, > OVS_CT_TUPLE_FLUSH_BIT, > + OVS_DP_HASH_L4_SYM_BIT, > }; > > enum ovs_feature_value { > OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), > OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), > OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), > + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), > }; > > void ovs_feature_support_destroy(void); > diff --git a/lib/actions.c b/lib/actions.c > index 037172e606..9d52cb75a8 100644 > --- a/lib/actions.c > +++ b/lib/actions.c > @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select *select, > struct ds ds = DS_EMPTY_INITIALIZER; > ds_put_format(&ds, "type=select,selection_method=dp_hash"); > > + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { > + /* Select dp-hash l4_symmetric by setting the upper 32bits of > + * selection_method_param to 1: */ > This comment is a bit unfortunate, because it reads like you want to set all bits of the upper half to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to value 1 (1 << 32)." during merge, wdyt? > + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); > + } > + > struct mf_subfield sf = expr_resolve_field(&select->res_field); > > for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) { > diff --git a/lib/features.c b/lib/features.c > index 97c9c86f00..d24e8f6c5c 100644 > --- a/lib/features.c > +++ b/lib/features.c > @@ -21,6 +21,7 @@ > #include "lib/dirs.h" > #include "socket-util.h" > #include "lib/vswitch-idl.h" > +#include "odp-netlink.h" > #include "openvswitch/vlog.h" > #include "openvswitch/ofpbuf.h" > #include "openvswitch/rconn.h" > @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); > > #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 > > +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the > + * type of capability is supported or not. */ > +typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities, > + const char *cap_name); > + > struct ovs_feature { > enum ovs_feature_value value; > const char *name; > + ovs_feature_parse_func *parse; > }; > > +static bool > +bool_parser(const struct smap *ovs_capabilities, const char *cap_name) > +{ > + return smap_get_bool(ovs_capabilities, cap_name, false); > +} > + > +static bool > +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, > + const char *cap_name OVS_UNUSED) > +{ > + int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0); > + > + return max_hash_alg == OVS_HASH_ALG_SYM_L4; > +} > + > static struct ovs_feature all_ovs_features[] = { > { > .value = OVS_CT_ZERO_SNAT_SUPPORT, > - .name = "ct_zero_snat" > + .name = "ct_zero_snat", > + .parse = bool_parser, > }, > { > .value = OVS_CT_TUPLE_FLUSH_SUPPORT, > - .name = "ct_flush" > - } > + .name = "ct_flush", > + .parse = bool_parser, > + }, > + { > + .value = OVS_DP_HASH_L4_SYM_SUPPORT, > + .name = "dp_hash_l4_sym_support", > + .parse = dp_hash_l4_sym_support_parser, > + }, > }; > > /* A bitmap of OVS features that have been detected as 'supported'. */ > @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature) > case OVS_CT_ZERO_SNAT_SUPPORT: > case OVS_DP_METER_SUPPORT: > case OVS_CT_TUPLE_FLUSH_SUPPORT: > + case OVS_DP_HASH_L4_SYM_SUPPORT: > return true; > default: > return false; > @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap > *ovs_capabilities, > } > > for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { > - enum ovs_feature_value value = all_ovs_features[i].value; > - const char *name = all_ovs_features[i].name; > - bool old_state = supported_ovs_features & value; > - bool new_state = smap_get_bool(ovs_capabilities, name, false); > + struct ovs_feature *feature = &all_ovs_features[i]; > + bool old_state = supported_ovs_features & feature->value; > + bool new_state = feature->parse(ovs_capabilities, feature->name); > if (new_state != old_state) { > updated = true; > if (new_state) { > - supported_ovs_features |= value; > + supported_ovs_features |= feature->value; > } else { > - supported_ovs_features &= ~value; > + supported_ovs_features &= ~feature->value; > } > - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, > + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name, > new_state ? "supported" : "not supported"); > } > } > -- > 2.31.1 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > > Other than that it looks good. Acked-by: Ales Musil <amusil@redhat.com>
On 7/14/23 08:41, Ales Musil wrote: > On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote: > >> Regular dp-hash is not a canonical L4 hash (at least with the netlink >> datapath). If the datapath supports l4 symmetrical dp-hash use that one >> instead. >> >> Reported-at: https://github.com/ovn-org/ovn/issues/112 >> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 >> Signed-off-by: Dumitru Ceara <dceara@redhat.com> >> > > Hi Dumitru, > > >> --- >> include/ovn/features.h | 2 ++ >> lib/actions.c | 6 ++++++ >> lib/features.c | 49 +++++++++++++++++++++++++++++++++--------- >> 3 files changed, 47 insertions(+), 10 deletions(-) >> >> diff --git a/include/ovn/features.h b/include/ovn/features.h >> index de8f1f5489..3bf536127f 100644 >> --- a/include/ovn/features.h >> +++ b/include/ovn/features.h >> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { >> OVS_CT_ZERO_SNAT_SUPPORT_BIT, >> OVS_DP_METER_SUPPORT_BIT, >> OVS_CT_TUPLE_FLUSH_BIT, >> + OVS_DP_HASH_L4_SYM_BIT, >> }; >> >> enum ovs_feature_value { >> OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), >> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), >> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), >> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), >> }; >> >> void ovs_feature_support_destroy(void); >> diff --git a/lib/actions.c b/lib/actions.c >> index 037172e606..9d52cb75a8 100644 >> --- a/lib/actions.c >> +++ b/lib/actions.c >> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select *select, >> struct ds ds = DS_EMPTY_INITIALIZER; >> ds_put_format(&ds, "type=select,selection_method=dp_hash"); >> >> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { >> + /* Select dp-hash l4_symmetric by setting the upper 32bits of >> + * selection_method_param to 1: */ >> > > This comment is a bit unfortunate, because it reads like you want to set > all bits of the upper half > to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to value > 1 (1 << 32)." during merge, wdyt? > > Makes sense, thanks for the suggestion! I changed it accordingly. >> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); >> + } >> + >> struct mf_subfield sf = expr_resolve_field(&select->res_field); >> >> for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) { >> diff --git a/lib/features.c b/lib/features.c >> index 97c9c86f00..d24e8f6c5c 100644 >> --- a/lib/features.c >> +++ b/lib/features.c >> @@ -21,6 +21,7 @@ >> #include "lib/dirs.h" >> #include "socket-util.h" >> #include "lib/vswitch-idl.h" >> +#include "odp-netlink.h" >> #include "openvswitch/vlog.h" >> #include "openvswitch/ofpbuf.h" >> #include "openvswitch/rconn.h" >> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); >> >> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 >> >> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the >> + * type of capability is supported or not. */ >> +typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities, >> + const char *cap_name); >> + >> struct ovs_feature { >> enum ovs_feature_value value; >> const char *name; >> + ovs_feature_parse_func *parse; >> }; >> >> +static bool >> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name) >> +{ >> + return smap_get_bool(ovs_capabilities, cap_name, false); >> +} >> + >> +static bool >> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, >> + const char *cap_name OVS_UNUSED) >> +{ >> + int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0); >> + >> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; >> +} >> + >> static struct ovs_feature all_ovs_features[] = { >> { >> .value = OVS_CT_ZERO_SNAT_SUPPORT, >> - .name = "ct_zero_snat" >> + .name = "ct_zero_snat", >> + .parse = bool_parser, >> }, >> { >> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, >> - .name = "ct_flush" >> - } >> + .name = "ct_flush", >> + .parse = bool_parser, >> + }, >> + { >> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, >> + .name = "dp_hash_l4_sym_support", >> + .parse = dp_hash_l4_sym_support_parser, >> + }, >> }; >> >> /* A bitmap of OVS features that have been detected as 'supported'. */ >> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature) >> case OVS_CT_ZERO_SNAT_SUPPORT: >> case OVS_DP_METER_SUPPORT: >> case OVS_CT_TUPLE_FLUSH_SUPPORT: >> + case OVS_DP_HASH_L4_SYM_SUPPORT: >> return true; >> default: >> return false; >> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap >> *ovs_capabilities, >> } >> >> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { >> - enum ovs_feature_value value = all_ovs_features[i].value; >> - const char *name = all_ovs_features[i].name; >> - bool old_state = supported_ovs_features & value; >> - bool new_state = smap_get_bool(ovs_capabilities, name, false); >> + struct ovs_feature *feature = &all_ovs_features[i]; >> + bool old_state = supported_ovs_features & feature->value; >> + bool new_state = feature->parse(ovs_capabilities, feature->name); >> if (new_state != old_state) { >> updated = true; >> if (new_state) { >> - supported_ovs_features |= value; >> + supported_ovs_features |= feature->value; >> } else { >> - supported_ovs_features &= ~value; >> + supported_ovs_features &= ~feature->value; >> } >> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, >> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name, >> new_state ? "supported" : "not supported"); >> } >> } >> -- >> 2.31.1 >> >> _______________________________________________ >> dev mailing list >> dev@openvswitch.org >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> >> > Other than that it looks good. > > Acked-by: Ales Musil <amusil@redhat.com> > Thanks, I applied this to main. I was wondering if we should backport this though. It's not exactly a bug fix but it does avoid using a broken datapath feature (when possible) so it avoids a broken behavior. The changes are quite contained. CC-ing some of the maintainers explicitly to get some more eyes on this. Regards, Dumitru
On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote: > > On 7/14/23 08:41, Ales Musil wrote: > > On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote: > > > >> Regular dp-hash is not a canonical L4 hash (at least with the netlink > >> datapath). If the datapath supports l4 symmetrical dp-hash use that one > >> instead. > >> > >> Reported-at: https://github.com/ovn-org/ovn/issues/112 > >> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 > >> Signed-off-by: Dumitru Ceara <dceara@redhat.com> > >> > > > > Hi Dumitru, > > > > > >> --- > >> include/ovn/features.h | 2 ++ > >> lib/actions.c | 6 ++++++ > >> lib/features.c | 49 +++++++++++++++++++++++++++++++++--------- > >> 3 files changed, 47 insertions(+), 10 deletions(-) > >> > >> diff --git a/include/ovn/features.h b/include/ovn/features.h > >> index de8f1f5489..3bf536127f 100644 > >> --- a/include/ovn/features.h > >> +++ b/include/ovn/features.h > >> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { > >> OVS_CT_ZERO_SNAT_SUPPORT_BIT, > >> OVS_DP_METER_SUPPORT_BIT, > >> OVS_CT_TUPLE_FLUSH_BIT, > >> + OVS_DP_HASH_L4_SYM_BIT, > >> }; > >> > >> enum ovs_feature_value { > >> OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), > >> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), > >> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), > >> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), > >> }; > >> > >> void ovs_feature_support_destroy(void); > >> diff --git a/lib/actions.c b/lib/actions.c > >> index 037172e606..9d52cb75a8 100644 > >> --- a/lib/actions.c > >> +++ b/lib/actions.c > >> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select *select, > >> struct ds ds = DS_EMPTY_INITIALIZER; > >> ds_put_format(&ds, "type=select,selection_method=dp_hash"); > >> > >> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { > >> + /* Select dp-hash l4_symmetric by setting the upper 32bits of > >> + * selection_method_param to 1: */ > >> > > > > This comment is a bit unfortunate, because it reads like you want to set > > all bits of the upper half > > to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to value > > 1 (1 << 32)." during merge, wdyt? > > > > > > Makes sense, thanks for the suggestion! I changed it accordingly. > > >> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); > >> + } > >> + > >> struct mf_subfield sf = expr_resolve_field(&select->res_field); > >> > >> for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) { > >> diff --git a/lib/features.c b/lib/features.c > >> index 97c9c86f00..d24e8f6c5c 100644 > >> --- a/lib/features.c > >> +++ b/lib/features.c > >> @@ -21,6 +21,7 @@ > >> #include "lib/dirs.h" > >> #include "socket-util.h" > >> #include "lib/vswitch-idl.h" > >> +#include "odp-netlink.h" > >> #include "openvswitch/vlog.h" > >> #include "openvswitch/ofpbuf.h" > >> #include "openvswitch/rconn.h" > >> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); > >> > >> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 > >> > >> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the > >> + * type of capability is supported or not. */ > >> +typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities, > >> + const char *cap_name); > >> + > >> struct ovs_feature { > >> enum ovs_feature_value value; > >> const char *name; > >> + ovs_feature_parse_func *parse; > >> }; > >> > >> +static bool > >> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name) > >> +{ > >> + return smap_get_bool(ovs_capabilities, cap_name, false); > >> +} > >> + > >> +static bool > >> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, > >> + const char *cap_name OVS_UNUSED) > >> +{ > >> + int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0); > >> + > >> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; > >> +} > >> + > >> static struct ovs_feature all_ovs_features[] = { > >> { > >> .value = OVS_CT_ZERO_SNAT_SUPPORT, > >> - .name = "ct_zero_snat" > >> + .name = "ct_zero_snat", > >> + .parse = bool_parser, > >> }, > >> { > >> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, > >> - .name = "ct_flush" > >> - } > >> + .name = "ct_flush", > >> + .parse = bool_parser, > >> + }, > >> + { > >> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, > >> + .name = "dp_hash_l4_sym_support", > >> + .parse = dp_hash_l4_sym_support_parser, > >> + }, > >> }; > >> > >> /* A bitmap of OVS features that have been detected as 'supported'. */ > >> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature) > >> case OVS_CT_ZERO_SNAT_SUPPORT: > >> case OVS_DP_METER_SUPPORT: > >> case OVS_CT_TUPLE_FLUSH_SUPPORT: > >> + case OVS_DP_HASH_L4_SYM_SUPPORT: > >> return true; > >> default: > >> return false; > >> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap > >> *ovs_capabilities, > >> } > >> > >> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { > >> - enum ovs_feature_value value = all_ovs_features[i].value; > >> - const char *name = all_ovs_features[i].name; > >> - bool old_state = supported_ovs_features & value; > >> - bool new_state = smap_get_bool(ovs_capabilities, name, false); > >> + struct ovs_feature *feature = &all_ovs_features[i]; > >> + bool old_state = supported_ovs_features & feature->value; > >> + bool new_state = feature->parse(ovs_capabilities, feature->name); > >> if (new_state != old_state) { > >> updated = true; > >> if (new_state) { > >> - supported_ovs_features |= value; > >> + supported_ovs_features |= feature->value; > >> } else { > >> - supported_ovs_features &= ~value; > >> + supported_ovs_features &= ~feature->value; > >> } > >> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, > >> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name, > >> new_state ? "supported" : "not supported"); > >> } > >> } > >> -- > >> 2.31.1 > >> > >> _______________________________________________ > >> dev mailing list > >> dev@openvswitch.org > >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev > >> > >> > > Other than that it looks good. > > > > Acked-by: Ales Musil <amusil@redhat.com> > > > > Thanks, I applied this to main. > > I was wondering if we should backport this though. It's not exactly a > bug fix but it does avoid using a broken datapath feature (when > possible) so it avoids a broken behavior. > > The changes are quite contained. > > CC-ing some of the maintainers explicitly to get some more eyes on this. > Thanks Dumitru. Sorry that I was following the reported issue. I tried to look for some more details, but I am not authorized to access bug 2175716. It is not quite clear to me what the impact of this problem is. The OVN issue (112) seems to be a very common configuration of OVN ECMP and I assume the problem is not easily reproduced because I have been testing/deploying the feature and it worked as expected. Do you have more details about the trigger? Are there any related fixes in OVS and kernel? Regards, Han > Regards, > Dumitru > > > >
On 7/18/23 12:14, Han Zhou wrote: > On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote: >> >> On 7/14/23 08:41, Ales Musil wrote: >>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote: >>> >>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink >>>> datapath). If the datapath supports l4 symmetrical dp-hash use that > one >>>> instead. >>>> >>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 >>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 >>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> >>>> >>> >>> Hi Dumitru, >>> >>> >>>> --- >>>> include/ovn/features.h | 2 ++ >>>> lib/actions.c | 6 ++++++ >>>> lib/features.c | 49 +++++++++++++++++++++++++++++++++--------- >>>> 3 files changed, 47 insertions(+), 10 deletions(-) >>>> >>>> diff --git a/include/ovn/features.h b/include/ovn/features.h >>>> index de8f1f5489..3bf536127f 100644 >>>> --- a/include/ovn/features.h >>>> +++ b/include/ovn/features.h >>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { >>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, >>>> OVS_DP_METER_SUPPORT_BIT, >>>> OVS_CT_TUPLE_FLUSH_BIT, >>>> + OVS_DP_HASH_L4_SYM_BIT, >>>> }; >>>> >>>> enum ovs_feature_value { >>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), >>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), >>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), >>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), >>>> }; >>>> >>>> void ovs_feature_support_destroy(void); >>>> diff --git a/lib/actions.c b/lib/actions.c >>>> index 037172e606..9d52cb75a8 100644 >>>> --- a/lib/actions.c >>>> +++ b/lib/actions.c >>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select > *select, >>>> struct ds ds = DS_EMPTY_INITIALIZER; >>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); >>>> >>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { >>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits of >>>> + * selection_method_param to 1: */ >>>> >>> >>> This comment is a bit unfortunate, because it reads like you want to set >>> all bits of the upper half >>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to > value >>> 1 (1 << 32)." during merge, wdyt? >>> >>> >> >> Makes sense, thanks for the suggestion! I changed it accordingly. >> >>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); >>>> + } >>>> + >>>> struct mf_subfield sf = expr_resolve_field(&select->res_field); >>>> >>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; > bucket_id++) { >>>> diff --git a/lib/features.c b/lib/features.c >>>> index 97c9c86f00..d24e8f6c5c 100644 >>>> --- a/lib/features.c >>>> +++ b/lib/features.c >>>> @@ -21,6 +21,7 @@ >>>> #include "lib/dirs.h" >>>> #include "socket-util.h" >>>> #include "lib/vswitch-idl.h" >>>> +#include "odp-netlink.h" >>>> #include "openvswitch/vlog.h" >>>> #include "openvswitch/ofpbuf.h" >>>> #include "openvswitch/rconn.h" >>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); >>>> >>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 >>>> >>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the >>>> + * type of capability is supported or not. */ >>>> +typedef bool ovs_feature_parse_func(const struct smap > *ovs_capabilities, >>>> + const char *cap_name); >>>> + >>>> struct ovs_feature { >>>> enum ovs_feature_value value; >>>> const char *name; >>>> + ovs_feature_parse_func *parse; >>>> }; >>>> >>>> +static bool >>>> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name) >>>> +{ >>>> + return smap_get_bool(ovs_capabilities, cap_name, false); >>>> +} >>>> + >>>> +static bool >>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, >>>> + const char *cap_name OVS_UNUSED) >>>> +{ >>>> + int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", > 0); >>>> + >>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; >>>> +} >>>> + >>>> static struct ovs_feature all_ovs_features[] = { >>>> { >>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, >>>> - .name = "ct_zero_snat" >>>> + .name = "ct_zero_snat", >>>> + .parse = bool_parser, >>>> }, >>>> { >>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, >>>> - .name = "ct_flush" >>>> - } >>>> + .name = "ct_flush", >>>> + .parse = bool_parser, >>>> + }, >>>> + { >>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, >>>> + .name = "dp_hash_l4_sym_support", >>>> + .parse = dp_hash_l4_sym_support_parser, >>>> + }, >>>> }; >>>> >>>> /* A bitmap of OVS features that have been detected as 'supported'. */ >>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature) >>>> case OVS_CT_ZERO_SNAT_SUPPORT: >>>> case OVS_DP_METER_SUPPORT: >>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: >>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: >>>> return true; >>>> default: >>>> return false; >>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap >>>> *ovs_capabilities, >>>> } >>>> >>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { >>>> - enum ovs_feature_value value = all_ovs_features[i].value; >>>> - const char *name = all_ovs_features[i].name; >>>> - bool old_state = supported_ovs_features & value; >>>> - bool new_state = smap_get_bool(ovs_capabilities, name, false); >>>> + struct ovs_feature *feature = &all_ovs_features[i]; >>>> + bool old_state = supported_ovs_features & feature->value; >>>> + bool new_state = feature->parse(ovs_capabilities, > feature->name); >>>> if (new_state != old_state) { >>>> updated = true; >>>> if (new_state) { >>>> - supported_ovs_features |= value; >>>> + supported_ovs_features |= feature->value; >>>> } else { >>>> - supported_ovs_features &= ~value; >>>> + supported_ovs_features &= ~feature->value; >>>> } >>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, >>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", > feature->name, >>>> new_state ? "supported" : "not supported"); >>>> } >>>> } >>>> -- >>>> 2.31.1 >>>> >>>> _______________________________________________ >>>> dev mailing list >>>> dev@openvswitch.org >>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >>>> >>>> >>> Other than that it looks good. >>> >>> Acked-by: Ales Musil <amusil@redhat.com> >>> >> >> Thanks, I applied this to main. >> >> I was wondering if we should backport this though. It's not exactly a >> bug fix but it does avoid using a broken datapath feature (when >> possible) so it avoids a broken behavior. >> >> The changes are quite contained. >> >> CC-ing some of the maintainers explicitly to get some more eyes on this. >> > Thanks Dumitru. Sorry that I was following the reported issue. I tried to > look for some more details, but I am not authorized to access bug 2175716. Sorry about that, that bug used to be public but some internal policies made it private. > It is not quite clear to me what the impact of this problem is. The OVN > issue (112) seems to be a very common configuration of OVN ECMP and I > assume the problem is not easily reproduced because I have been > testing/deploying the feature and it worked as expected. Do you have more > details about the trigger? Are there any related fixes in OVS and kernel? > The original bug report came from OpenShift: https://issues.redhat.com/browse/OCPBUGS-7406 CC-ing Patryk as he root caused the original issue. The simplified topology is: +--- GW1 ---+ client ---- cluster-router ---+ +--- (out of OVN) --- server +--- GW2 ---+ On cluster-router there's an ECMP route to reach 'server' via: - GW1 - GW2 On GW1 traffic going out of OVN is SNATed to IP1. On GW2 traffic going out of OVN is SNATed to IP2. The client initiates a TCP session towards the server; a path is chosen based on dp-hash, e.g. GW2. From this point on, during normal operation, traffic on this session flows fine via GW2. If the client closes the connection, then for the closing connection there's an exception when computing the dp-hash if the socket is in state TIME_WAIT, substate TCP_FIN_WAIT_2: https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 In this last case we call: tcp_v4_timewait_ack(sk, skb) | tcp_v4_send_ack(sk, skb, ...) The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet: https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 ctl_sk = this_cpu_read(ipv4_tcp_sk); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, transmit_time); As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is computed from the packet contents: https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash; } [...] void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; u32 hash; __flow_hash_secret_init(); hash = ___skb_get_hash(skb, &keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } But the original socket's txhash was actually a random hash: https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } This means that the last ack might enter the ovs datapath with a different dp_hash than all other packets sent on that session in that direction. It's possible that this last ack gets hashed to a different path of the ECMP route. This specific issue was fixed in the kernel through: https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 However, while debugging we realized the problem is actually visible with _any_ retransmit. That's because a retransmit will trigger a rehash: https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) tcp_rsk(req)->txhash = net_tx_rndhash(); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); [...] So suddenly all subsequent packets (skbs) will have a different 'hash' value and might get hashed to a different path. This last part was not fixed. After a discussion on the netdev mailing list the conclusion was to not change the hash behavior for locally terminated TCP sessions: https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd Eventually the documentation change patch was dropped and a v2 of the other fixes was accepted: https://www.spinics.net/lists/netdev/msg906252.html The rehash on retransmit problem is still present though making the kernel's dp-hash implementation unusable. During some internal discussions Aaron (in CC) pointed out that OVS supports L4 symmetric hashing and that should be a stable L4 hash. OVS already reports the maximum supported hashing algorithm through the Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was that the kernel datapath didn't support L4 symmetric hashing. But Aaron posted a patch to implement that and the patch got accepted: https://www.spinics.net/lists/netdev/msg911045.html Then we moved OVN to use L4 symmetric hashing if available. That's to ensure correctness of the packet processing. We did run performance tests and the hit due to the extra hashing was acceptable, max 3% throughput degradation in OVN scenarios that simulate common traffic patterns (like ovn-kubernetes ones) - in most cases the performance degradation was around 2%. Sorry for the long story but I hope it provides all the required context. Thinking more about it I should've probably added some of this to the original commit log, I apologize. Regards, Dumitru
On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote: > > On 7/18/23 12:14, Han Zhou wrote: > > On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote: > >> > >> On 7/14/23 08:41, Ales Musil wrote: > >>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote: > >>> > >>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink > >>>> datapath). If the datapath supports l4 symmetrical dp-hash use that > > one > >>>> instead. > >>>> > >>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 > >>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 > >>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> > >>>> > >>> > >>> Hi Dumitru, > >>> > >>> > >>>> --- > >>>> include/ovn/features.h | 2 ++ > >>>> lib/actions.c | 6 ++++++ > >>>> lib/features.c | 49 +++++++++++++++++++++++++++++++++--------- > >>>> 3 files changed, 47 insertions(+), 10 deletions(-) > >>>> > >>>> diff --git a/include/ovn/features.h b/include/ovn/features.h > >>>> index de8f1f5489..3bf536127f 100644 > >>>> --- a/include/ovn/features.h > >>>> +++ b/include/ovn/features.h > >>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { > >>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, > >>>> OVS_DP_METER_SUPPORT_BIT, > >>>> OVS_CT_TUPLE_FLUSH_BIT, > >>>> + OVS_DP_HASH_L4_SYM_BIT, > >>>> }; > >>>> > >>>> enum ovs_feature_value { > >>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), > >>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), > >>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), > >>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), > >>>> }; > >>>> > >>>> void ovs_feature_support_destroy(void); > >>>> diff --git a/lib/actions.c b/lib/actions.c > >>>> index 037172e606..9d52cb75a8 100644 > >>>> --- a/lib/actions.c > >>>> +++ b/lib/actions.c > >>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select > > *select, > >>>> struct ds ds = DS_EMPTY_INITIALIZER; > >>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); > >>>> > >>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { > >>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits of > >>>> + * selection_method_param to 1: */ > >>>> > >>> > >>> This comment is a bit unfortunate, because it reads like you want to set > >>> all bits of the upper half > >>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to > > value > >>> 1 (1 << 32)." during merge, wdyt? > >>> > >>> > >> > >> Makes sense, thanks for the suggestion! I changed it accordingly. > >> > >>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); > >>>> + } > >>>> + > >>>> struct mf_subfield sf = expr_resolve_field(&select->res_field); > >>>> > >>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; > > bucket_id++) { > >>>> diff --git a/lib/features.c b/lib/features.c > >>>> index 97c9c86f00..d24e8f6c5c 100644 > >>>> --- a/lib/features.c > >>>> +++ b/lib/features.c > >>>> @@ -21,6 +21,7 @@ > >>>> #include "lib/dirs.h" > >>>> #include "socket-util.h" > >>>> #include "lib/vswitch-idl.h" > >>>> +#include "odp-netlink.h" > >>>> #include "openvswitch/vlog.h" > >>>> #include "openvswitch/ofpbuf.h" > >>>> #include "openvswitch/rconn.h" > >>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); > >>>> > >>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 > >>>> > >>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the > >>>> + * type of capability is supported or not. */ > >>>> +typedef bool ovs_feature_parse_func(const struct smap > > *ovs_capabilities, > >>>> + const char *cap_name); > >>>> + > >>>> struct ovs_feature { > >>>> enum ovs_feature_value value; > >>>> const char *name; > >>>> + ovs_feature_parse_func *parse; > >>>> }; > >>>> > >>>> +static bool > >>>> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name) > >>>> +{ > >>>> + return smap_get_bool(ovs_capabilities, cap_name, false); > >>>> +} > >>>> + > >>>> +static bool > >>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, > >>>> + const char *cap_name OVS_UNUSED) > >>>> +{ > >>>> + int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", > > 0); > >>>> + > >>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; > >>>> +} > >>>> + > >>>> static struct ovs_feature all_ovs_features[] = { > >>>> { > >>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, > >>>> - .name = "ct_zero_snat" > >>>> + .name = "ct_zero_snat", > >>>> + .parse = bool_parser, > >>>> }, > >>>> { > >>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, > >>>> - .name = "ct_flush" > >>>> - } > >>>> + .name = "ct_flush", > >>>> + .parse = bool_parser, > >>>> + }, > >>>> + { > >>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, > >>>> + .name = "dp_hash_l4_sym_support", > >>>> + .parse = dp_hash_l4_sym_support_parser, > >>>> + }, > >>>> }; > >>>> > >>>> /* A bitmap of OVS features that have been detected as 'supported'. */ > >>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature) > >>>> case OVS_CT_ZERO_SNAT_SUPPORT: > >>>> case OVS_DP_METER_SUPPORT: > >>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: > >>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: > >>>> return true; > >>>> default: > >>>> return false; > >>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap > >>>> *ovs_capabilities, > >>>> } > >>>> > >>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { > >>>> - enum ovs_feature_value value = all_ovs_features[i].value; > >>>> - const char *name = all_ovs_features[i].name; > >>>> - bool old_state = supported_ovs_features & value; > >>>> - bool new_state = smap_get_bool(ovs_capabilities, name, false); > >>>> + struct ovs_feature *feature = &all_ovs_features[i]; > >>>> + bool old_state = supported_ovs_features & feature->value; > >>>> + bool new_state = feature->parse(ovs_capabilities, > > feature->name); > >>>> if (new_state != old_state) { > >>>> updated = true; > >>>> if (new_state) { > >>>> - supported_ovs_features |= value; > >>>> + supported_ovs_features |= feature->value; > >>>> } else { > >>>> - supported_ovs_features &= ~value; > >>>> + supported_ovs_features &= ~feature->value; > >>>> } > >>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, > >>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", > > feature->name, > >>>> new_state ? "supported" : "not supported"); > >>>> } > >>>> } > >>>> -- > >>>> 2.31.1 > >>>> > >>>> _______________________________________________ > >>>> dev mailing list > >>>> dev@openvswitch.org > >>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev > >>>> > >>>> > >>> Other than that it looks good. > >>> > >>> Acked-by: Ales Musil <amusil@redhat.com> > >>> > >> > >> Thanks, I applied this to main. > >> > >> I was wondering if we should backport this though. It's not exactly a > >> bug fix but it does avoid using a broken datapath feature (when > >> possible) so it avoids a broken behavior. > >> > >> The changes are quite contained. > >> > >> CC-ing some of the maintainers explicitly to get some more eyes on this. > >> > > Thanks Dumitru. Sorry that I was following the reported issue. I tried to > > look for some more details, but I am not authorized to access bug 2175716. > > Sorry about that, that bug used to be public but some internal policies > made it private. > > > It is not quite clear to me what the impact of this problem is. The OVN > > issue (112) seems to be a very common configuration of OVN ECMP and I > > assume the problem is not easily reproduced because I have been > > testing/deploying the feature and it worked as expected. Do you have more > > details about the trigger? Are there any related fixes in OVS and kernel? > > > > The original bug report came from OpenShift: > https://issues.redhat.com/browse/OCPBUGS-7406 > > CC-ing Patryk as he root caused the original issue. > > The simplified topology is: > +--- GW1 ---+ > client ---- cluster-router ---+ +--- (out of OVN) --- server > +--- GW2 ---+ > > On cluster-router there's an ECMP route to reach 'server' via: > - GW1 > - GW2 > > On GW1 traffic going out of OVN is SNATed to IP1. > On GW2 traffic going out of OVN is SNATed to IP2. > > The client initiates a TCP session towards the server; a path is chosen > based on dp-hash, e.g. GW2. > > From this point on, during normal operation, traffic on this session > flows fine via GW2. > > If the client closes the connection, then for the closing connection > there's an exception when computing the dp-hash if the socket is in > state TIME_WAIT, substate TCP_FIN_WAIT_2: > https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 > In this last case we call: > > tcp_v4_timewait_ack(sk, skb) > | > tcp_v4_send_ack(sk, skb, ...) > > The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet: > https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 > > ctl_sk = this_cpu_read(ipv4_tcp_sk); > sock_net_set(ctl_sk, net); > ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? > inet_twsk(sk)->tw_mark : sk->sk_mark; > ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? > inet_twsk(sk)->tw_priority : sk->sk_priority; > transmit_time = tcp_transmit_time(sk); > ip_send_unicast_reply(ctl_sk, > skb, &TCP_SKB_CB(skb)->header.h4.opt, > ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, > &arg, arg.iov[0].iov_len, > transmit_time); > > As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is > computed from the packet contents: > https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 > > static inline __u32 skb_get_hash(struct sk_buff *skb) > { > if (!skb->l4_hash && !skb->sw_hash) > __skb_get_hash(skb); > > return skb->hash; > } > [...] > void __skb_get_hash(struct sk_buff *skb) > { > struct flow_keys keys; > u32 hash; > > __flow_hash_secret_init(); > > hash = ___skb_get_hash(skb, &keys, &hashrnd); > > __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); > } > > But the original socket's txhash was actually a random hash: > https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 > > static inline void sk_set_txhash(struct sock *sk) > { > /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ > WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); > } > > This means that the last ack might enter the ovs datapath with a > different dp_hash than all other packets sent on that session in that > direction. It's possible that this last ack gets hashed to a different > path of the ECMP route. > > This specific issue was fixed in the kernel through: > > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 > > However, while debugging we realized the problem is actually visible > with _any_ retransmit. That's because a retransmit will trigger a rehash: > > https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 > > int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) > { > const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; > struct flowi fl; > int res; > > /* Paired with WRITE_ONCE() in sock_setsockopt() */ > if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) > tcp_rsk(req)->txhash = net_tx_rndhash(); > res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, > NULL); > [...] > > So suddenly all subsequent packets (skbs) will have a different 'hash' > value and might get hashed to a different path. > > This last part was not fixed. After a discussion on the netdev mailing > list the conclusion was to not change the hash behavior for locally > terminated TCP sessions: > > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd > > Eventually the documentation change patch was dropped and a v2 of the > other fixes was accepted: > https://www.spinics.net/lists/netdev/msg906252.html > > The rehash on retransmit problem is still present though making the > kernel's dp-hash implementation unusable. > > During some internal discussions Aaron (in CC) pointed out that OVS > supports L4 symmetric hashing and that should be a stable L4 hash. OVS > already reports the maximum supported hashing algorithm through the > Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was > that the kernel datapath didn't support L4 symmetric hashing. But Aaron > posted a patch to implement that and the patch got accepted: > > https://www.spinics.net/lists/netdev/msg911045.html > > Then we moved OVN to use L4 symmetric hashing if available. That's to > ensure correctness of the packet processing. We did run performance > tests and the hit due to the extra hashing was acceptable, max 3% > throughput degradation in OVN scenarios that simulate common traffic > patterns (like ovn-kubernetes ones) - in most cases the performance > degradation was around 2%. > > Sorry for the long story but I hope it provides all the required > context. Thinking more about it I should've probably added some of this > to the original commit log, I apologize. > > Regards, > Dumitru > Thanks Dumitru for the details! It is now quite clear and I am not against backporting it. But I have a question regarding upgrading. For the existing traffic, does this mean the packets may choose a different path after OVN upgrading? Should this be called out? Regards, Han
On 8/1/23 12:51, Han Zhou wrote: > On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote: >> >> On 7/18/23 12:14, Han Zhou wrote: >>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote: >>>> >>>> On 7/14/23 08:41, Ales Musil wrote: >>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> > wrote: >>>>> >>>>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink >>>>>> datapath). If the datapath supports l4 symmetrical dp-hash use that >>> one >>>>>> instead. >>>>>> >>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 >>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 >>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> >>>>>> >>>>> >>>>> Hi Dumitru, >>>>> >>>>> >>>>>> --- >>>>>> include/ovn/features.h | 2 ++ >>>>>> lib/actions.c | 6 ++++++ >>>>>> lib/features.c | 49 > +++++++++++++++++++++++++++++++++--------- >>>>>> 3 files changed, 47 insertions(+), 10 deletions(-) >>>>>> >>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h >>>>>> index de8f1f5489..3bf536127f 100644 >>>>>> --- a/include/ovn/features.h >>>>>> +++ b/include/ovn/features.h >>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { >>>>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, >>>>>> OVS_DP_METER_SUPPORT_BIT, >>>>>> OVS_CT_TUPLE_FLUSH_BIT, >>>>>> + OVS_DP_HASH_L4_SYM_BIT, >>>>>> }; >>>>>> >>>>>> enum ovs_feature_value { >>>>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), >>>>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), >>>>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), >>>>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), >>>>>> }; >>>>>> >>>>>> void ovs_feature_support_destroy(void); >>>>>> diff --git a/lib/actions.c b/lib/actions.c >>>>>> index 037172e606..9d52cb75a8 100644 >>>>>> --- a/lib/actions.c >>>>>> +++ b/lib/actions.c >>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select >>> *select, >>>>>> struct ds ds = DS_EMPTY_INITIALIZER; >>>>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); >>>>>> >>>>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { >>>>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits > of >>>>>> + * selection_method_param to 1: */ >>>>>> >>>>> >>>>> This comment is a bit unfortunate, because it reads like you want to > set >>>>> all bits of the upper half >>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to >>> value >>>>> 1 (1 << 32)." during merge, wdyt? >>>>> >>>>> >>>> >>>> Makes sense, thanks for the suggestion! I changed it accordingly. >>>> >>>>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); >>>>>> + } >>>>>> + >>>>>> struct mf_subfield sf = expr_resolve_field(&select->res_field); >>>>>> >>>>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; >>> bucket_id++) { >>>>>> diff --git a/lib/features.c b/lib/features.c >>>>>> index 97c9c86f00..d24e8f6c5c 100644 >>>>>> --- a/lib/features.c >>>>>> +++ b/lib/features.c >>>>>> @@ -21,6 +21,7 @@ >>>>>> #include "lib/dirs.h" >>>>>> #include "socket-util.h" >>>>>> #include "lib/vswitch-idl.h" >>>>>> +#include "odp-netlink.h" >>>>>> #include "openvswitch/vlog.h" >>>>>> #include "openvswitch/ofpbuf.h" >>>>>> #include "openvswitch/rconn.h" >>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); >>>>>> >>>>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 >>>>>> >>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the >>>>>> + * type of capability is supported or not. */ >>>>>> +typedef bool ovs_feature_parse_func(const struct smap >>> *ovs_capabilities, >>>>>> + const char *cap_name); >>>>>> + >>>>>> struct ovs_feature { >>>>>> enum ovs_feature_value value; >>>>>> const char *name; >>>>>> + ovs_feature_parse_func *parse; >>>>>> }; >>>>>> >>>>>> +static bool >>>>>> +bool_parser(const struct smap *ovs_capabilities, const char > *cap_name) >>>>>> +{ >>>>>> + return smap_get_bool(ovs_capabilities, cap_name, false); >>>>>> +} >>>>>> + >>>>>> +static bool >>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, >>>>>> + const char *cap_name OVS_UNUSED) >>>>>> +{ >>>>>> + int max_hash_alg = smap_get_int(ovs_capabilities, > "max_hash_alg", >>> 0); >>>>>> + >>>>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; >>>>>> +} >>>>>> + >>>>>> static struct ovs_feature all_ovs_features[] = { >>>>>> { >>>>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, >>>>>> - .name = "ct_zero_snat" >>>>>> + .name = "ct_zero_snat", >>>>>> + .parse = bool_parser, >>>>>> }, >>>>>> { >>>>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, >>>>>> - .name = "ct_flush" >>>>>> - } >>>>>> + .name = "ct_flush", >>>>>> + .parse = bool_parser, >>>>>> + }, >>>>>> + { >>>>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, >>>>>> + .name = "dp_hash_l4_sym_support", >>>>>> + .parse = dp_hash_l4_sym_support_parser, >>>>>> + }, >>>>>> }; >>>>>> >>>>>> /* A bitmap of OVS features that have been detected as 'supported'. > */ >>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value > feature) >>>>>> case OVS_CT_ZERO_SNAT_SUPPORT: >>>>>> case OVS_DP_METER_SUPPORT: >>>>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: >>>>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: >>>>>> return true; >>>>>> default: >>>>>> return false; >>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap >>>>>> *ovs_capabilities, >>>>>> } >>>>>> >>>>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { >>>>>> - enum ovs_feature_value value = all_ovs_features[i].value; >>>>>> - const char *name = all_ovs_features[i].name; >>>>>> - bool old_state = supported_ovs_features & value; >>>>>> - bool new_state = smap_get_bool(ovs_capabilities, name, > false); >>>>>> + struct ovs_feature *feature = &all_ovs_features[i]; >>>>>> + bool old_state = supported_ovs_features & feature->value; >>>>>> + bool new_state = feature->parse(ovs_capabilities, >>> feature->name); >>>>>> if (new_state != old_state) { >>>>>> updated = true; >>>>>> if (new_state) { >>>>>> - supported_ovs_features |= value; >>>>>> + supported_ovs_features |= feature->value; >>>>>> } else { >>>>>> - supported_ovs_features &= ~value; >>>>>> + supported_ovs_features &= ~feature->value; >>>>>> } >>>>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, >>>>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", >>> feature->name, >>>>>> new_state ? "supported" : "not supported"); >>>>>> } >>>>>> } >>>>>> -- >>>>>> 2.31.1 >>>>>> >>>>>> _______________________________________________ >>>>>> dev mailing list >>>>>> dev@openvswitch.org >>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >>>>>> >>>>>> >>>>> Other than that it looks good. >>>>> >>>>> Acked-by: Ales Musil <amusil@redhat.com> >>>>> >>>> >>>> Thanks, I applied this to main. >>>> >>>> I was wondering if we should backport this though. It's not exactly a >>>> bug fix but it does avoid using a broken datapath feature (when >>>> possible) so it avoids a broken behavior. >>>> >>>> The changes are quite contained. >>>> >>>> CC-ing some of the maintainers explicitly to get some more eyes on > this. >>>> >>> Thanks Dumitru. Sorry that I was following the reported issue. I tried > to >>> look for some more details, but I am not authorized to access bug > 2175716. >> >> Sorry about that, that bug used to be public but some internal policies >> made it private. >> >>> It is not quite clear to me what the impact of this problem is. The OVN >>> issue (112) seems to be a very common configuration of OVN ECMP and I >>> assume the problem is not easily reproduced because I have been >>> testing/deploying the feature and it worked as expected. Do you have > more >>> details about the trigger? Are there any related fixes in OVS and > kernel? >>> >> >> The original bug report came from OpenShift: >> https://issues.redhat.com/browse/OCPBUGS-7406 >> >> CC-ing Patryk as he root caused the original issue. >> >> The simplified topology is: >> +--- GW1 ---+ >> client ---- cluster-router ---+ +--- (out of OVN) --- server >> +--- GW2 ---+ >> >> On cluster-router there's an ECMP route to reach 'server' via: >> - GW1 >> - GW2 >> >> On GW1 traffic going out of OVN is SNATed to IP1. >> On GW2 traffic going out of OVN is SNATed to IP2. >> >> The client initiates a TCP session towards the server; a path is chosen >> based on dp-hash, e.g. GW2. >> >> From this point on, during normal operation, traffic on this session >> flows fine via GW2. >> >> If the client closes the connection, then for the closing connection >> there's an exception when computing the dp-hash if the socket is in >> state TIME_WAIT, substate TCP_FIN_WAIT_2: >> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 >> In this last case we call: >> >> tcp_v4_timewait_ack(sk, skb) >> | >> tcp_v4_send_ack(sk, skb, ...) >> >> The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet: >> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 >> >> ctl_sk = this_cpu_read(ipv4_tcp_sk); >> sock_net_set(ctl_sk, net); >> ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? >> inet_twsk(sk)->tw_mark : sk->sk_mark; >> ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? >> inet_twsk(sk)->tw_priority : sk->sk_priority; >> transmit_time = tcp_transmit_time(sk); >> ip_send_unicast_reply(ctl_sk, >> skb, &TCP_SKB_CB(skb)->header.h4.opt, >> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, >> &arg, arg.iov[0].iov_len, >> transmit_time); >> >> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is >> computed from the packet contents: >> > https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 >> >> static inline __u32 skb_get_hash(struct sk_buff *skb) >> { >> if (!skb->l4_hash && !skb->sw_hash) >> __skb_get_hash(skb); >> >> return skb->hash; >> } >> [...] >> void __skb_get_hash(struct sk_buff *skb) >> { >> struct flow_keys keys; >> u32 hash; >> >> __flow_hash_secret_init(); >> >> hash = ___skb_get_hash(skb, &keys, &hashrnd); >> >> __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); >> } >> >> But the original socket's txhash was actually a random hash: >> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 >> >> static inline void sk_set_txhash(struct sock *sk) >> { >> /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ >> WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); >> } >> >> This means that the last ack might enter the ovs datapath with a >> different dp_hash than all other packets sent on that session in that >> direction. It's possible that this last ack gets hashed to a different >> path of the ECMP route. >> >> This specific issue was fixed in the kernel through: >> >> > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 >> >> However, while debugging we realized the problem is actually visible >> with _any_ retransmit. That's because a retransmit will trigger a rehash: >> >> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 >> >> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) >> { >> const struct tcp_request_sock_ops *af_ops = > tcp_rsk(req)->af_specific; >> struct flowi fl; >> int res; >> >> /* Paired with WRITE_ONCE() in sock_setsockopt() */ >> if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) >> tcp_rsk(req)->txhash = net_tx_rndhash(); >> res = af_ops->send_synack(sk, NULL, &fl, req, NULL, > TCP_SYNACK_NORMAL, >> NULL); >> [...] >> >> So suddenly all subsequent packets (skbs) will have a different 'hash' >> value and might get hashed to a different path. >> >> This last part was not fixed. After a discussion on the netdev mailing >> list the conclusion was to not change the hash behavior for locally >> terminated TCP sessions: >> >> > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd >> >> Eventually the documentation change patch was dropped and a v2 of the >> other fixes was accepted: >> https://www.spinics.net/lists/netdev/msg906252.html >> >> The rehash on retransmit problem is still present though making the >> kernel's dp-hash implementation unusable. >> >> During some internal discussions Aaron (in CC) pointed out that OVS >> supports L4 symmetric hashing and that should be a stable L4 hash. OVS >> already reports the maximum supported hashing algorithm through the >> Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was >> that the kernel datapath didn't support L4 symmetric hashing. But Aaron >> posted a patch to implement that and the patch got accepted: >> >> https://www.spinics.net/lists/netdev/msg911045.html >> >> Then we moved OVN to use L4 symmetric hashing if available. That's to >> ensure correctness of the packet processing. We did run performance >> tests and the hit due to the extra hashing was acceptable, max 3% >> throughput degradation in OVN scenarios that simulate common traffic >> patterns (like ovn-kubernetes ones) - in most cases the performance >> degradation was around 2%. >> >> Sorry for the long story but I hope it provides all the required >> context. Thinking more about it I should've probably added some of this >> to the original commit log, I apologize. >> >> Regards, >> Dumitru >> > > Thanks Dumitru for the details! It is now quite clear and I am not against > backporting it. > But I have a question regarding upgrading. For the existing traffic, does > this mean the packets may choose a different path after OVN upgrading? > Should this be called out? > Hi Han, That's a good question. They might and this affects all traffic that's not locally originated for the kernel datapath and (potentially) all traffic for the userspace datapath. I can post a follow up patch to update the NEWS file. Did you have anything else in mind? Regards, Dumitru
On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote: > > On 8/1/23 12:51, Han Zhou wrote: > > On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote: > >> > >> On 7/18/23 12:14, Han Zhou wrote: > >>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote: > >>>> > >>>> On 7/14/23 08:41, Ales Musil wrote: > >>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> > > wrote: > >>>>> > >>>>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink > >>>>>> datapath). If the datapath supports l4 symmetrical dp-hash use that > >>> one > >>>>>> instead. > >>>>>> > >>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 > >>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 > >>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> > >>>>>> > >>>>> > >>>>> Hi Dumitru, > >>>>> > >>>>> > >>>>>> --- > >>>>>> include/ovn/features.h | 2 ++ > >>>>>> lib/actions.c | 6 ++++++ > >>>>>> lib/features.c | 49 > > +++++++++++++++++++++++++++++++++--------- > >>>>>> 3 files changed, 47 insertions(+), 10 deletions(-) > >>>>>> > >>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h > >>>>>> index de8f1f5489..3bf536127f 100644 > >>>>>> --- a/include/ovn/features.h > >>>>>> +++ b/include/ovn/features.h > >>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { > >>>>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, > >>>>>> OVS_DP_METER_SUPPORT_BIT, > >>>>>> OVS_CT_TUPLE_FLUSH_BIT, > >>>>>> + OVS_DP_HASH_L4_SYM_BIT, > >>>>>> }; > >>>>>> > >>>>>> enum ovs_feature_value { > >>>>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), > >>>>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), > >>>>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), > >>>>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), > >>>>>> }; > >>>>>> > >>>>>> void ovs_feature_support_destroy(void); > >>>>>> diff --git a/lib/actions.c b/lib/actions.c > >>>>>> index 037172e606..9d52cb75a8 100644 > >>>>>> --- a/lib/actions.c > >>>>>> +++ b/lib/actions.c > >>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select > >>> *select, > >>>>>> struct ds ds = DS_EMPTY_INITIALIZER; > >>>>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); > >>>>>> > >>>>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { > >>>>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits > > of > >>>>>> + * selection_method_param to 1: */ > >>>>>> > >>>>> > >>>>> This comment is a bit unfortunate, because it reads like you want to > > set > >>>>> all bits of the upper half > >>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to > >>> value > >>>>> 1 (1 << 32)." during merge, wdyt? > >>>>> > >>>>> > >>>> > >>>> Makes sense, thanks for the suggestion! I changed it accordingly. > >>>> > >>>>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); > >>>>>> + } > >>>>>> + > >>>>>> struct mf_subfield sf = expr_resolve_field(&select->res_field); > >>>>>> > >>>>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; > >>> bucket_id++) { > >>>>>> diff --git a/lib/features.c b/lib/features.c > >>>>>> index 97c9c86f00..d24e8f6c5c 100644 > >>>>>> --- a/lib/features.c > >>>>>> +++ b/lib/features.c > >>>>>> @@ -21,6 +21,7 @@ > >>>>>> #include "lib/dirs.h" > >>>>>> #include "socket-util.h" > >>>>>> #include "lib/vswitch-idl.h" > >>>>>> +#include "odp-netlink.h" > >>>>>> #include "openvswitch/vlog.h" > >>>>>> #include "openvswitch/ofpbuf.h" > >>>>>> #include "openvswitch/rconn.h" > >>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); > >>>>>> > >>>>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 > >>>>>> > >>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the > >>>>>> + * type of capability is supported or not. */ > >>>>>> +typedef bool ovs_feature_parse_func(const struct smap > >>> *ovs_capabilities, > >>>>>> + const char *cap_name); > >>>>>> + > >>>>>> struct ovs_feature { > >>>>>> enum ovs_feature_value value; > >>>>>> const char *name; > >>>>>> + ovs_feature_parse_func *parse; > >>>>>> }; > >>>>>> > >>>>>> +static bool > >>>>>> +bool_parser(const struct smap *ovs_capabilities, const char > > *cap_name) > >>>>>> +{ > >>>>>> + return smap_get_bool(ovs_capabilities, cap_name, false); > >>>>>> +} > >>>>>> + > >>>>>> +static bool > >>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, > >>>>>> + const char *cap_name OVS_UNUSED) > >>>>>> +{ > >>>>>> + int max_hash_alg = smap_get_int(ovs_capabilities, > > "max_hash_alg", > >>> 0); > >>>>>> + > >>>>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; > >>>>>> +} > >>>>>> + > >>>>>> static struct ovs_feature all_ovs_features[] = { > >>>>>> { > >>>>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, > >>>>>> - .name = "ct_zero_snat" > >>>>>> + .name = "ct_zero_snat", > >>>>>> + .parse = bool_parser, > >>>>>> }, > >>>>>> { > >>>>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, > >>>>>> - .name = "ct_flush" > >>>>>> - } > >>>>>> + .name = "ct_flush", > >>>>>> + .parse = bool_parser, > >>>>>> + }, > >>>>>> + { > >>>>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, > >>>>>> + .name = "dp_hash_l4_sym_support", > >>>>>> + .parse = dp_hash_l4_sym_support_parser, > >>>>>> + }, > >>>>>> }; > >>>>>> > >>>>>> /* A bitmap of OVS features that have been detected as 'supported'. > > */ > >>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value > > feature) > >>>>>> case OVS_CT_ZERO_SNAT_SUPPORT: > >>>>>> case OVS_DP_METER_SUPPORT: > >>>>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: > >>>>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: > >>>>>> return true; > >>>>>> default: > >>>>>> return false; > >>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap > >>>>>> *ovs_capabilities, > >>>>>> } > >>>>>> > >>>>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { > >>>>>> - enum ovs_feature_value value = all_ovs_features[i].value; > >>>>>> - const char *name = all_ovs_features[i].name; > >>>>>> - bool old_state = supported_ovs_features & value; > >>>>>> - bool new_state = smap_get_bool(ovs_capabilities, name, > > false); > >>>>>> + struct ovs_feature *feature = &all_ovs_features[i]; > >>>>>> + bool old_state = supported_ovs_features & feature->value; > >>>>>> + bool new_state = feature->parse(ovs_capabilities, > >>> feature->name); > >>>>>> if (new_state != old_state) { > >>>>>> updated = true; > >>>>>> if (new_state) { > >>>>>> - supported_ovs_features |= value; > >>>>>> + supported_ovs_features |= feature->value; > >>>>>> } else { > >>>>>> - supported_ovs_features &= ~value; > >>>>>> + supported_ovs_features &= ~feature->value; > >>>>>> } > >>>>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, > >>>>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", > >>> feature->name, > >>>>>> new_state ? "supported" : "not supported"); > >>>>>> } > >>>>>> } > >>>>>> -- > >>>>>> 2.31.1 > >>>>>> > >>>>>> _______________________________________________ > >>>>>> dev mailing list > >>>>>> dev@openvswitch.org > >>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev > >>>>>> > >>>>>> > >>>>> Other than that it looks good. > >>>>> > >>>>> Acked-by: Ales Musil <amusil@redhat.com> > >>>>> > >>>> > >>>> Thanks, I applied this to main. > >>>> > >>>> I was wondering if we should backport this though. It's not exactly a > >>>> bug fix but it does avoid using a broken datapath feature (when > >>>> possible) so it avoids a broken behavior. > >>>> > >>>> The changes are quite contained. > >>>> > >>>> CC-ing some of the maintainers explicitly to get some more eyes on > > this. > >>>> > >>> Thanks Dumitru. Sorry that I was following the reported issue. I tried > > to > >>> look for some more details, but I am not authorized to access bug > > 2175716. > >> > >> Sorry about that, that bug used to be public but some internal policies > >> made it private. > >> > >>> It is not quite clear to me what the impact of this problem is. The OVN > >>> issue (112) seems to be a very common configuration of OVN ECMP and I > >>> assume the problem is not easily reproduced because I have been > >>> testing/deploying the feature and it worked as expected. Do you have > > more > >>> details about the trigger? Are there any related fixes in OVS and > > kernel? > >>> > >> > >> The original bug report came from OpenShift: > >> https://issues.redhat.com/browse/OCPBUGS-7406 > >> > >> CC-ing Patryk as he root caused the original issue. > >> > >> The simplified topology is: > >> +--- GW1 ---+ > >> client ---- cluster-router ---+ +--- (out of OVN) --- server > >> +--- GW2 ---+ > >> > >> On cluster-router there's an ECMP route to reach 'server' via: > >> - GW1 > >> - GW2 > >> > >> On GW1 traffic going out of OVN is SNATed to IP1. > >> On GW2 traffic going out of OVN is SNATed to IP2. > >> > >> The client initiates a TCP session towards the server; a path is chosen > >> based on dp-hash, e.g. GW2. > >> > >> From this point on, during normal operation, traffic on this session > >> flows fine via GW2. > >> > >> If the client closes the connection, then for the closing connection > >> there's an exception when computing the dp-hash if the socket is in > >> state TIME_WAIT, substate TCP_FIN_WAIT_2: > >> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 > >> In this last case we call: > >> > >> tcp_v4_timewait_ack(sk, skb) > >> | > >> tcp_v4_send_ack(sk, skb, ...) > >> > >> The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet: > >> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 > >> > >> ctl_sk = this_cpu_read(ipv4_tcp_sk); > >> sock_net_set(ctl_sk, net); > >> ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? > >> inet_twsk(sk)->tw_mark : sk->sk_mark; > >> ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? > >> inet_twsk(sk)->tw_priority : sk->sk_priority; > >> transmit_time = tcp_transmit_time(sk); > >> ip_send_unicast_reply(ctl_sk, > >> skb, &TCP_SKB_CB(skb)->header.h4.opt, > >> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, > >> &arg, arg.iov[0].iov_len, > >> transmit_time); > >> > >> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is > >> computed from the packet contents: > >> > > https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 > >> > >> static inline __u32 skb_get_hash(struct sk_buff *skb) > >> { > >> if (!skb->l4_hash && !skb->sw_hash) > >> __skb_get_hash(skb); > >> > >> return skb->hash; > >> } > >> [...] > >> void __skb_get_hash(struct sk_buff *skb) > >> { > >> struct flow_keys keys; > >> u32 hash; > >> > >> __flow_hash_secret_init(); > >> > >> hash = ___skb_get_hash(skb, &keys, &hashrnd); > >> > >> __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); > >> } > >> > >> But the original socket's txhash was actually a random hash: > >> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 > >> > >> static inline void sk_set_txhash(struct sock *sk) > >> { > >> /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ > >> WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); > >> } > >> > >> This means that the last ack might enter the ovs datapath with a > >> different dp_hash than all other packets sent on that session in that > >> direction. It's possible that this last ack gets hashed to a different > >> path of the ECMP route. > >> > >> This specific issue was fixed in the kernel through: > >> > >> > > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 > >> > >> However, while debugging we realized the problem is actually visible > >> with _any_ retransmit. That's because a retransmit will trigger a rehash: > >> > >> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 > >> > >> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) > >> { > >> const struct tcp_request_sock_ops *af_ops = > > tcp_rsk(req)->af_specific; > >> struct flowi fl; > >> int res; > >> > >> /* Paired with WRITE_ONCE() in sock_setsockopt() */ > >> if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) > >> tcp_rsk(req)->txhash = net_tx_rndhash(); > >> res = af_ops->send_synack(sk, NULL, &fl, req, NULL, > > TCP_SYNACK_NORMAL, > >> NULL); > >> [...] > >> > >> So suddenly all subsequent packets (skbs) will have a different 'hash' > >> value and might get hashed to a different path. > >> > >> This last part was not fixed. After a discussion on the netdev mailing > >> list the conclusion was to not change the hash behavior for locally > >> terminated TCP sessions: > >> > >> > > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd > >> > >> Eventually the documentation change patch was dropped and a v2 of the > >> other fixes was accepted: > >> https://www.spinics.net/lists/netdev/msg906252.html > >> > >> The rehash on retransmit problem is still present though making the > >> kernel's dp-hash implementation unusable. > >> > >> During some internal discussions Aaron (in CC) pointed out that OVS > >> supports L4 symmetric hashing and that should be a stable L4 hash. OVS > >> already reports the maximum supported hashing algorithm through the > >> Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was > >> that the kernel datapath didn't support L4 symmetric hashing. But Aaron > >> posted a patch to implement that and the patch got accepted: > >> > >> https://www.spinics.net/lists/netdev/msg911045.html > >> > >> Then we moved OVN to use L4 symmetric hashing if available. That's to > >> ensure correctness of the packet processing. We did run performance > >> tests and the hit due to the extra hashing was acceptable, max 3% > >> throughput degradation in OVN scenarios that simulate common traffic > >> patterns (like ovn-kubernetes ones) - in most cases the performance > >> degradation was around 2%. > >> > >> Sorry for the long story but I hope it provides all the required > >> context. Thinking more about it I should've probably added some of this > >> to the original commit log, I apologize. > >> > >> Regards, > >> Dumitru > >> > > > > Thanks Dumitru for the details! It is now quite clear and I am not against > > backporting it. > > But I have a question regarding upgrading. For the existing traffic, does > > this mean the packets may choose a different path after OVN upgrading? > > Should this be called out? > > > > Hi Han, > > That's a good question. They might and this affects all traffic that's > not locally originated for the kernel datapath and (potentially) all > traffic for the userspace datapath. I can post a follow up patch to > update the NEWS file. Did you have anything else in mind? > Sounds good. I don't have anything else. Thanks, Han > Regards, > Dumitru >
On 8/12/23 07:19, Han Zhou wrote: > On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote: >> >> On 8/1/23 12:51, Han Zhou wrote: >>> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote: >>>> >>>> On 7/18/23 12:14, Han Zhou wrote: >>>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> > wrote: >>>>>> >>>>>> On 7/14/23 08:41, Ales Musil wrote: >>>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> >>> wrote: >>>>>>> >>>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the > netlink >>>>>>>> datapath). If the datapath supports l4 symmetrical dp-hash use > that >>>>> one >>>>>>>> instead. >>>>>>>> >>>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 >>>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 >>>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> >>>>>>>> >>>>>>> >>>>>>> Hi Dumitru, >>>>>>> >>>>>>> >>>>>>>> --- >>>>>>>> include/ovn/features.h | 2 ++ >>>>>>>> lib/actions.c | 6 ++++++ >>>>>>>> lib/features.c | 49 >>> +++++++++++++++++++++++++++++++++--------- >>>>>>>> 3 files changed, 47 insertions(+), 10 deletions(-) >>>>>>>> >>>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h >>>>>>>> index de8f1f5489..3bf536127f 100644 >>>>>>>> --- a/include/ovn/features.h >>>>>>>> +++ b/include/ovn/features.h >>>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { >>>>>>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, >>>>>>>> OVS_DP_METER_SUPPORT_BIT, >>>>>>>> OVS_CT_TUPLE_FLUSH_BIT, >>>>>>>> + OVS_DP_HASH_L4_SYM_BIT, >>>>>>>> }; >>>>>>>> >>>>>>>> enum ovs_feature_value { >>>>>>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << > OVS_CT_ZERO_SNAT_SUPPORT_BIT), >>>>>>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), >>>>>>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), >>>>>>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), >>>>>>>> }; >>>>>>>> >>>>>>>> void ovs_feature_support_destroy(void); >>>>>>>> diff --git a/lib/actions.c b/lib/actions.c >>>>>>>> index 037172e606..9d52cb75a8 100644 >>>>>>>> --- a/lib/actions.c >>>>>>>> +++ b/lib/actions.c >>>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select >>>>> *select, >>>>>>>> struct ds ds = DS_EMPTY_INITIALIZER; >>>>>>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); >>>>>>>> >>>>>>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { >>>>>>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits >>> of >>>>>>>> + * selection_method_param to 1: */ >>>>>>>> >>>>>>> >>>>>>> This comment is a bit unfortunate, because it reads like you want to >>> set >>>>>>> all bits of the upper half >>>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to >>>>> value >>>>>>> 1 (1 << 32)." during merge, wdyt? >>>>>>> >>>>>>> >>>>>> >>>>>> Makes sense, thanks for the suggestion! I changed it accordingly. >>>>>> >>>>>>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); >>>>>>>> + } >>>>>>>> + >>>>>>>> struct mf_subfield sf = > expr_resolve_field(&select->res_field); >>>>>>>> >>>>>>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; >>>>> bucket_id++) { >>>>>>>> diff --git a/lib/features.c b/lib/features.c >>>>>>>> index 97c9c86f00..d24e8f6c5c 100644 >>>>>>>> --- a/lib/features.c >>>>>>>> +++ b/lib/features.c >>>>>>>> @@ -21,6 +21,7 @@ >>>>>>>> #include "lib/dirs.h" >>>>>>>> #include "socket-util.h" >>>>>>>> #include "lib/vswitch-idl.h" >>>>>>>> +#include "odp-netlink.h" >>>>>>>> #include "openvswitch/vlog.h" >>>>>>>> #include "openvswitch/ofpbuf.h" >>>>>>>> #include "openvswitch/rconn.h" >>>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); >>>>>>>> >>>>>>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 >>>>>>>> >>>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether > the >>>>>>>> + * type of capability is supported or not. */ >>>>>>>> +typedef bool ovs_feature_parse_func(const struct smap >>>>> *ovs_capabilities, >>>>>>>> + const char *cap_name); >>>>>>>> + >>>>>>>> struct ovs_feature { >>>>>>>> enum ovs_feature_value value; >>>>>>>> const char *name; >>>>>>>> + ovs_feature_parse_func *parse; >>>>>>>> }; >>>>>>>> >>>>>>>> +static bool >>>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char >>> *cap_name) >>>>>>>> +{ >>>>>>>> + return smap_get_bool(ovs_capabilities, cap_name, false); >>>>>>>> +} >>>>>>>> + >>>>>>>> +static bool >>>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, >>>>>>>> + const char *cap_name OVS_UNUSED) >>>>>>>> +{ >>>>>>>> + int max_hash_alg = smap_get_int(ovs_capabilities, >>> "max_hash_alg", >>>>> 0); >>>>>>>> + >>>>>>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; >>>>>>>> +} >>>>>>>> + >>>>>>>> static struct ovs_feature all_ovs_features[] = { >>>>>>>> { >>>>>>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, >>>>>>>> - .name = "ct_zero_snat" >>>>>>>> + .name = "ct_zero_snat", >>>>>>>> + .parse = bool_parser, >>>>>>>> }, >>>>>>>> { >>>>>>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, >>>>>>>> - .name = "ct_flush" >>>>>>>> - } >>>>>>>> + .name = "ct_flush", >>>>>>>> + .parse = bool_parser, >>>>>>>> + }, >>>>>>>> + { >>>>>>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, >>>>>>>> + .name = "dp_hash_l4_sym_support", >>>>>>>> + .parse = dp_hash_l4_sym_support_parser, >>>>>>>> + }, >>>>>>>> }; >>>>>>>> >>>>>>>> /* A bitmap of OVS features that have been detected as > 'supported'. >>> */ >>>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value >>> feature) >>>>>>>> case OVS_CT_ZERO_SNAT_SUPPORT: >>>>>>>> case OVS_DP_METER_SUPPORT: >>>>>>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: >>>>>>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: >>>>>>>> return true; >>>>>>>> default: >>>>>>>> return false; >>>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap >>>>>>>> *ovs_capabilities, >>>>>>>> } >>>>>>>> >>>>>>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { >>>>>>>> - enum ovs_feature_value value = all_ovs_features[i].value; >>>>>>>> - const char *name = all_ovs_features[i].name; >>>>>>>> - bool old_state = supported_ovs_features & value; >>>>>>>> - bool new_state = smap_get_bool(ovs_capabilities, name, >>> false); >>>>>>>> + struct ovs_feature *feature = &all_ovs_features[i]; >>>>>>>> + bool old_state = supported_ovs_features & feature->value; >>>>>>>> + bool new_state = feature->parse(ovs_capabilities, >>>>> feature->name); >>>>>>>> if (new_state != old_state) { >>>>>>>> updated = true; >>>>>>>> if (new_state) { >>>>>>>> - supported_ovs_features |= value; >>>>>>>> + supported_ovs_features |= feature->value; >>>>>>>> } else { >>>>>>>> - supported_ovs_features &= ~value; >>>>>>>> + supported_ovs_features &= ~feature->value; >>>>>>>> } >>>>>>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, >>>>>>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", >>>>> feature->name, >>>>>>>> new_state ? "supported" : "not > supported"); >>>>>>>> } >>>>>>>> } >>>>>>>> -- >>>>>>>> 2.31.1 >>>>>>>> >>>>>>>> _______________________________________________ >>>>>>>> dev mailing list >>>>>>>> dev@openvswitch.org >>>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >>>>>>>> >>>>>>>> >>>>>>> Other than that it looks good. >>>>>>> >>>>>>> Acked-by: Ales Musil <amusil@redhat.com> >>>>>>> >>>>>> >>>>>> Thanks, I applied this to main. >>>>>> >>>>>> I was wondering if we should backport this though. It's not exactly > a >>>>>> bug fix but it does avoid using a broken datapath feature (when >>>>>> possible) so it avoids a broken behavior. >>>>>> >>>>>> The changes are quite contained. >>>>>> >>>>>> CC-ing some of the maintainers explicitly to get some more eyes on >>> this. >>>>>> >>>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried >>> to >>>>> look for some more details, but I am not authorized to access bug >>> 2175716. >>>> >>>> Sorry about that, that bug used to be public but some internal policies >>>> made it private. >>>> >>>>> It is not quite clear to me what the impact of this problem is. The > OVN >>>>> issue (112) seems to be a very common configuration of OVN ECMP and I >>>>> assume the problem is not easily reproduced because I have been >>>>> testing/deploying the feature and it worked as expected. Do you have >>> more >>>>> details about the trigger? Are there any related fixes in OVS and >>> kernel? >>>>> >>>> >>>> The original bug report came from OpenShift: >>>> https://issues.redhat.com/browse/OCPBUGS-7406 >>>> >>>> CC-ing Patryk as he root caused the original issue. >>>> >>>> The simplified topology is: >>>> +--- GW1 ---+ >>>> client ---- cluster-router ---+ +--- (out of OVN) --- server >>>> +--- GW2 ---+ >>>> >>>> On cluster-router there's an ECMP route to reach 'server' via: >>>> - GW1 >>>> - GW2 >>>> >>>> On GW1 traffic going out of OVN is SNATed to IP1. >>>> On GW2 traffic going out of OVN is SNATed to IP2. >>>> >>>> The client initiates a TCP session towards the server; a path is chosen >>>> based on dp-hash, e.g. GW2. >>>> >>>> From this point on, during normal operation, traffic on this session >>>> flows fine via GW2. >>>> >>>> If the client closes the connection, then for the closing connection >>>> there's an exception when computing the dp-hash if the socket is in >>>> state TIME_WAIT, substate TCP_FIN_WAIT_2: >>>> > https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 >>>> In this last case we call: >>>> >>>> tcp_v4_timewait_ack(sk, skb) >>>> | >>>> tcp_v4_send_ack(sk, skb, ...) >>>> >>>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK > packet: >>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 >>>> >>>> ctl_sk = this_cpu_read(ipv4_tcp_sk); >>>> sock_net_set(ctl_sk, net); >>>> ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? >>>> inet_twsk(sk)->tw_mark : sk->sk_mark; >>>> ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? >>>> inet_twsk(sk)->tw_priority : > sk->sk_priority; >>>> transmit_time = tcp_transmit_time(sk); >>>> ip_send_unicast_reply(ctl_sk, >>>> skb, &TCP_SKB_CB(skb)->header.h4.opt, >>>> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, >>>> &arg, arg.iov[0].iov_len, >>>> transmit_time); >>>> >>>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash > is >>>> computed from the packet contents: >>>> >>> > https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 >>>> >>>> static inline __u32 skb_get_hash(struct sk_buff *skb) >>>> { >>>> if (!skb->l4_hash && !skb->sw_hash) >>>> __skb_get_hash(skb); >>>> >>>> return skb->hash; >>>> } >>>> [...] >>>> void __skb_get_hash(struct sk_buff *skb) >>>> { >>>> struct flow_keys keys; >>>> u32 hash; >>>> >>>> __flow_hash_secret_init(); >>>> >>>> hash = ___skb_get_hash(skb, &keys, &hashrnd); >>>> >>>> __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); >>>> } >>>> >>>> But the original socket's txhash was actually a random hash: >>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 >>>> >>>> static inline void sk_set_txhash(struct sock *sk) >>>> { >>>> /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ >>>> WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); >>>> } >>>> >>>> This means that the last ack might enter the ovs datapath with a >>>> different dp_hash than all other packets sent on that session in that >>>> direction. It's possible that this last ack gets hashed to a different >>>> path of the ECMP route. >>>> >>>> This specific issue was fixed in the kernel through: >>>> >>>> >>> > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 >>>> >>>> However, while debugging we realized the problem is actually visible >>>> with _any_ retransmit. That's because a retransmit will trigger a > rehash: >>>> >>>> > https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 >>>> >>>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) >>>> { >>>> const struct tcp_request_sock_ops *af_ops = >>> tcp_rsk(req)->af_specific; >>>> struct flowi fl; >>>> int res; >>>> >>>> /* Paired with WRITE_ONCE() in sock_setsockopt() */ >>>> if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) >>>> tcp_rsk(req)->txhash = net_tx_rndhash(); >>>> res = af_ops->send_synack(sk, NULL, &fl, req, NULL, >>> TCP_SYNACK_NORMAL, >>>> NULL); >>>> [...] >>>> >>>> So suddenly all subsequent packets (skbs) will have a different 'hash' >>>> value and might get hashed to a different path. >>>> >>>> This last part was not fixed. After a discussion on the netdev mailing >>>> list the conclusion was to not change the hash behavior for locally >>>> terminated TCP sessions: >>>> >>>> >>> > https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd >>>> >>>> Eventually the documentation change patch was dropped and a v2 of the >>>> other fixes was accepted: >>>> https://www.spinics.net/lists/netdev/msg906252.html >>>> >>>> The rehash on retransmit problem is still present though making the >>>> kernel's dp-hash implementation unusable. >>>> >>>> During some internal discussions Aaron (in CC) pointed out that OVS >>>> supports L4 symmetric hashing and that should be a stable L4 hash. OVS >>>> already reports the maximum supported hashing algorithm through the >>>> Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was >>>> that the kernel datapath didn't support L4 symmetric hashing. But > Aaron >>>> posted a patch to implement that and the patch got accepted: >>>> >>>> https://www.spinics.net/lists/netdev/msg911045.html >>>> >>>> Then we moved OVN to use L4 symmetric hashing if available. That's to >>>> ensure correctness of the packet processing. We did run performance >>>> tests and the hit due to the extra hashing was acceptable, max 3% >>>> throughput degradation in OVN scenarios that simulate common traffic >>>> patterns (like ovn-kubernetes ones) - in most cases the performance >>>> degradation was around 2%. >>>> >>>> Sorry for the long story but I hope it provides all the required >>>> context. Thinking more about it I should've probably added some of > this >>>> to the original commit log, I apologize. >>>> >>>> Regards, >>>> Dumitru >>>> >>> >>> Thanks Dumitru for the details! It is now quite clear and I am not > against >>> backporting it. >>> But I have a question regarding upgrading. For the existing traffic, > does >>> this mean the packets may choose a different path after OVN upgrading? >>> Should this be called out? >>> >> >> Hi Han, >> >> That's a good question. They might and this affects all traffic that's >> not locally originated for the kernel datapath and (potentially) all >> traffic for the userspace datapath. I can post a follow up patch to >> update the NEWS file. Did you have anything else in mind? >> > Sounds good. I don't have anything else. > I posted https://patchwork.ozlabs.org/project/ovn/patch/20230814135401.2677054-1-dceara@redhat.com/ Thanks, Dumitru
On 8/14/23 15:57, Dumitru Ceara wrote: > On 8/12/23 07:19, Han Zhou wrote: >> On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote: >>> >>> On 8/1/23 12:51, Han Zhou wrote: >>>> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote: >>>>> >>>>> On 7/18/23 12:14, Han Zhou wrote: >>>>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> >> wrote: >>>>>>> >>>>>>> On 7/14/23 08:41, Ales Musil wrote: >>>>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> >>>> wrote: >>>>>>>> >>>>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the >> netlink >>>>>>>>> datapath). If the datapath supports l4 symmetrical dp-hash use >> that >>>>>> one >>>>>>>>> instead. >>>>>>>>> >>>>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 >>>>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 >>>>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> >>>>>>>>> >>>>>>>> >>>>>>>> Hi Dumitru, >>>>>>>> >>>>>>>> >>>>>>>>> --- >>>>>>>>> include/ovn/features.h | 2 ++ >>>>>>>>> lib/actions.c | 6 ++++++ >>>>>>>>> lib/features.c | 49 >>>> +++++++++++++++++++++++++++++++++--------- >>>>>>>>> 3 files changed, 47 insertions(+), 10 deletions(-) >>>>>>>>> >>>>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h >>>>>>>>> index de8f1f5489..3bf536127f 100644 >>>>>>>>> --- a/include/ovn/features.h >>>>>>>>> +++ b/include/ovn/features.h >>>>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { >>>>>>>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, >>>>>>>>> OVS_DP_METER_SUPPORT_BIT, >>>>>>>>> OVS_CT_TUPLE_FLUSH_BIT, >>>>>>>>> + OVS_DP_HASH_L4_SYM_BIT, >>>>>>>>> }; >>>>>>>>> >>>>>>>>> enum ovs_feature_value { >>>>>>>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << >> OVS_CT_ZERO_SNAT_SUPPORT_BIT), >>>>>>>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), >>>>>>>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), >>>>>>>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), >>>>>>>>> }; >>>>>>>>> >>>>>>>>> void ovs_feature_support_destroy(void); >>>>>>>>> diff --git a/lib/actions.c b/lib/actions.c >>>>>>>>> index 037172e606..9d52cb75a8 100644 >>>>>>>>> --- a/lib/actions.c >>>>>>>>> +++ b/lib/actions.c >>>>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select >>>>>> *select, >>>>>>>>> struct ds ds = DS_EMPTY_INITIALIZER; >>>>>>>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); >>>>>>>>> >>>>>>>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { >>>>>>>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits >>>> of >>>>>>>>> + * selection_method_param to 1: */ >>>>>>>>> >>>>>>>> >>>>>>>> This comment is a bit unfortunate, because it reads like you want to >>>> set >>>>>>>> all bits of the upper half >>>>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to >>>>>> value >>>>>>>> 1 (1 << 32)." during merge, wdyt? >>>>>>>> >>>>>>>> >>>>>>> >>>>>>> Makes sense, thanks for the suggestion! I changed it accordingly. >>>>>>> >>>>>>>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> struct mf_subfield sf = >> expr_resolve_field(&select->res_field); >>>>>>>>> >>>>>>>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; >>>>>> bucket_id++) { >>>>>>>>> diff --git a/lib/features.c b/lib/features.c >>>>>>>>> index 97c9c86f00..d24e8f6c5c 100644 >>>>>>>>> --- a/lib/features.c >>>>>>>>> +++ b/lib/features.c >>>>>>>>> @@ -21,6 +21,7 @@ >>>>>>>>> #include "lib/dirs.h" >>>>>>>>> #include "socket-util.h" >>>>>>>>> #include "lib/vswitch-idl.h" >>>>>>>>> +#include "odp-netlink.h" >>>>>>>>> #include "openvswitch/vlog.h" >>>>>>>>> #include "openvswitch/ofpbuf.h" >>>>>>>>> #include "openvswitch/rconn.h" >>>>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); >>>>>>>>> >>>>>>>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 >>>>>>>>> >>>>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether >> the >>>>>>>>> + * type of capability is supported or not. */ >>>>>>>>> +typedef bool ovs_feature_parse_func(const struct smap >>>>>> *ovs_capabilities, >>>>>>>>> + const char *cap_name); >>>>>>>>> + >>>>>>>>> struct ovs_feature { >>>>>>>>> enum ovs_feature_value value; >>>>>>>>> const char *name; >>>>>>>>> + ovs_feature_parse_func *parse; >>>>>>>>> }; >>>>>>>>> >>>>>>>>> +static bool >>>>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char >>>> *cap_name) >>>>>>>>> +{ >>>>>>>>> + return smap_get_bool(ovs_capabilities, cap_name, false); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static bool >>>>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, >>>>>>>>> + const char *cap_name OVS_UNUSED) >>>>>>>>> +{ >>>>>>>>> + int max_hash_alg = smap_get_int(ovs_capabilities, >>>> "max_hash_alg", >>>>>> 0); >>>>>>>>> + >>>>>>>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> static struct ovs_feature all_ovs_features[] = { >>>>>>>>> { >>>>>>>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, >>>>>>>>> - .name = "ct_zero_snat" >>>>>>>>> + .name = "ct_zero_snat", >>>>>>>>> + .parse = bool_parser, >>>>>>>>> }, >>>>>>>>> { >>>>>>>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, >>>>>>>>> - .name = "ct_flush" >>>>>>>>> - } >>>>>>>>> + .name = "ct_flush", >>>>>>>>> + .parse = bool_parser, >>>>>>>>> + }, >>>>>>>>> + { >>>>>>>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, >>>>>>>>> + .name = "dp_hash_l4_sym_support", >>>>>>>>> + .parse = dp_hash_l4_sym_support_parser, >>>>>>>>> + }, >>>>>>>>> }; >>>>>>>>> >>>>>>>>> /* A bitmap of OVS features that have been detected as >> 'supported'. >>>> */ >>>>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value >>>> feature) >>>>>>>>> case OVS_CT_ZERO_SNAT_SUPPORT: >>>>>>>>> case OVS_DP_METER_SUPPORT: >>>>>>>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: >>>>>>>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: >>>>>>>>> return true; >>>>>>>>> default: >>>>>>>>> return false; >>>>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap >>>>>>>>> *ovs_capabilities, >>>>>>>>> } >>>>>>>>> >>>>>>>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { >>>>>>>>> - enum ovs_feature_value value = all_ovs_features[i].value; >>>>>>>>> - const char *name = all_ovs_features[i].name; >>>>>>>>> - bool old_state = supported_ovs_features & value; >>>>>>>>> - bool new_state = smap_get_bool(ovs_capabilities, name, >>>> false); >>>>>>>>> + struct ovs_feature *feature = &all_ovs_features[i]; >>>>>>>>> + bool old_state = supported_ovs_features & feature->value; >>>>>>>>> + bool new_state = feature->parse(ovs_capabilities, >>>>>> feature->name); >>>>>>>>> if (new_state != old_state) { >>>>>>>>> updated = true; >>>>>>>>> if (new_state) { >>>>>>>>> - supported_ovs_features |= value; >>>>>>>>> + supported_ovs_features |= feature->value; >>>>>>>>> } else { >>>>>>>>> - supported_ovs_features &= ~value; >>>>>>>>> + supported_ovs_features &= ~feature->value; >>>>>>>>> } >>>>>>>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, >>>>>>>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", >>>>>> feature->name, >>>>>>>>> new_state ? "supported" : "not >> supported"); >>>>>>>>> } >>>>>>>>> } >>>>>>>>> -- >>>>>>>>> 2.31.1 >>>>>>>>> >>>>>>>>> _______________________________________________ >>>>>>>>> dev mailing list >>>>>>>>> dev@openvswitch.org >>>>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >>>>>>>>> >>>>>>>>> >>>>>>>> Other than that it looks good. >>>>>>>> >>>>>>>> Acked-by: Ales Musil <amusil@redhat.com> >>>>>>>> >>>>>>> >>>>>>> Thanks, I applied this to main. >>>>>>> >>>>>>> I was wondering if we should backport this though. It's not exactly >> a >>>>>>> bug fix but it does avoid using a broken datapath feature (when >>>>>>> possible) so it avoids a broken behavior. >>>>>>> >>>>>>> The changes are quite contained. >>>>>>> >>>>>>> CC-ing some of the maintainers explicitly to get some more eyes on >>>> this. >>>>>>> >>>>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried >>>> to >>>>>> look for some more details, but I am not authorized to access bug >>>> 2175716. >>>>> >>>>> Sorry about that, that bug used to be public but some internal policies >>>>> made it private. >>>>> >>>>>> It is not quite clear to me what the impact of this problem is. The >> OVN >>>>>> issue (112) seems to be a very common configuration of OVN ECMP and I >>>>>> assume the problem is not easily reproduced because I have been >>>>>> testing/deploying the feature and it worked as expected. Do you have >>>> more >>>>>> details about the trigger? Are there any related fixes in OVS and >>>> kernel? >>>>>> >>>>> >>>>> The original bug report came from OpenShift: >>>>> https://issues.redhat.com/browse/OCPBUGS-7406 >>>>> >>>>> CC-ing Patryk as he root caused the original issue. >>>>> >>>>> The simplified topology is: >>>>> +--- GW1 ---+ >>>>> client ---- cluster-router ---+ +--- (out of OVN) --- server >>>>> +--- GW2 ---+ >>>>> >>>>> On cluster-router there's an ECMP route to reach 'server' via: >>>>> - GW1 >>>>> - GW2 >>>>> >>>>> On GW1 traffic going out of OVN is SNATed to IP1. >>>>> On GW2 traffic going out of OVN is SNATed to IP2. >>>>> >>>>> The client initiates a TCP session towards the server; a path is chosen >>>>> based on dp-hash, e.g. GW2. >>>>> >>>>> From this point on, during normal operation, traffic on this session >>>>> flows fine via GW2. >>>>> >>>>> If the client closes the connection, then for the closing connection >>>>> there's an exception when computing the dp-hash if the socket is in >>>>> state TIME_WAIT, substate TCP_FIN_WAIT_2: >>>>> >> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 >>>>> In this last case we call: >>>>> >>>>> tcp_v4_timewait_ack(sk, skb) >>>>> | >>>>> tcp_v4_send_ack(sk, skb, ...) >>>>> >>>>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK >> packet: >>>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 >>>>> >>>>> ctl_sk = this_cpu_read(ipv4_tcp_sk); >>>>> sock_net_set(ctl_sk, net); >>>>> ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? >>>>> inet_twsk(sk)->tw_mark : sk->sk_mark; >>>>> ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? >>>>> inet_twsk(sk)->tw_priority : >> sk->sk_priority; >>>>> transmit_time = tcp_transmit_time(sk); >>>>> ip_send_unicast_reply(ctl_sk, >>>>> skb, &TCP_SKB_CB(skb)->header.h4.opt, >>>>> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, >>>>> &arg, arg.iov[0].iov_len, >>>>> transmit_time); >>>>> >>>>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash >> is >>>>> computed from the packet contents: >>>>> >>>> >> https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 >>>>> >>>>> static inline __u32 skb_get_hash(struct sk_buff *skb) >>>>> { >>>>> if (!skb->l4_hash && !skb->sw_hash) >>>>> __skb_get_hash(skb); >>>>> >>>>> return skb->hash; >>>>> } >>>>> [...] >>>>> void __skb_get_hash(struct sk_buff *skb) >>>>> { >>>>> struct flow_keys keys; >>>>> u32 hash; >>>>> >>>>> __flow_hash_secret_init(); >>>>> >>>>> hash = ___skb_get_hash(skb, &keys, &hashrnd); >>>>> >>>>> __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); >>>>> } >>>>> >>>>> But the original socket's txhash was actually a random hash: >>>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 >>>>> >>>>> static inline void sk_set_txhash(struct sock *sk) >>>>> { >>>>> /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ >>>>> WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); >>>>> } >>>>> >>>>> This means that the last ack might enter the ovs datapath with a >>>>> different dp_hash than all other packets sent on that session in that >>>>> direction. It's possible that this last ack gets hashed to a different >>>>> path of the ECMP route. >>>>> >>>>> This specific issue was fixed in the kernel through: >>>>> >>>>> >>>> >> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 >>>>> >>>>> However, while debugging we realized the problem is actually visible >>>>> with _any_ retransmit. That's because a retransmit will trigger a >> rehash: >>>>> >>>>> >> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 >>>>> >>>>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) >>>>> { >>>>> const struct tcp_request_sock_ops *af_ops = >>>> tcp_rsk(req)->af_specific; >>>>> struct flowi fl; >>>>> int res; >>>>> >>>>> /* Paired with WRITE_ONCE() in sock_setsockopt() */ >>>>> if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) >>>>> tcp_rsk(req)->txhash = net_tx_rndhash(); >>>>> res = af_ops->send_synack(sk, NULL, &fl, req, NULL, >>>> TCP_SYNACK_NORMAL, >>>>> NULL); >>>>> [...] >>>>> >>>>> So suddenly all subsequent packets (skbs) will have a different 'hash' >>>>> value and might get hashed to a different path. >>>>> >>>>> This last part was not fixed. After a discussion on the netdev mailing >>>>> list the conclusion was to not change the hash behavior for locally >>>>> terminated TCP sessions: >>>>> >>>>> >>>> >> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd >>>>> >>>>> Eventually the documentation change patch was dropped and a v2 of the >>>>> other fixes was accepted: >>>>> https://www.spinics.net/lists/netdev/msg906252.html >>>>> >>>>> The rehash on retransmit problem is still present though making the >>>>> kernel's dp-hash implementation unusable. >>>>> >>>>> During some internal discussions Aaron (in CC) pointed out that OVS >>>>> supports L4 symmetric hashing and that should be a stable L4 hash. OVS >>>>> already reports the maximum supported hashing algorithm through the >>>>> Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was >>>>> that the kernel datapath didn't support L4 symmetric hashing. But >> Aaron >>>>> posted a patch to implement that and the patch got accepted: >>>>> >>>>> https://www.spinics.net/lists/netdev/msg911045.html >>>>> >>>>> Then we moved OVN to use L4 symmetric hashing if available. That's to >>>>> ensure correctness of the packet processing. We did run performance >>>>> tests and the hit due to the extra hashing was acceptable, max 3% >>>>> throughput degradation in OVN scenarios that simulate common traffic >>>>> patterns (like ovn-kubernetes ones) - in most cases the performance >>>>> degradation was around 2%. >>>>> >>>>> Sorry for the long story but I hope it provides all the required >>>>> context. Thinking more about it I should've probably added some of >> this >>>>> to the original commit log, I apologize. >>>>> >>>>> Regards, >>>>> Dumitru >>>>> >>>> >>>> Thanks Dumitru for the details! It is now quite clear and I am not >> against >>>> backporting it. >>>> But I have a question regarding upgrading. For the existing traffic, >> does >>>> this mean the packets may choose a different path after OVN upgrading? >>>> Should this be called out? >>>> >>> >>> Hi Han, >>> >>> That's a good question. They might and this affects all traffic that's >>> not locally originated for the kernel datapath and (potentially) all >>> traffic for the userspace datapath. I can post a follow up patch to >>> update the NEWS file. Did you have anything else in mind? >>> >> Sounds good. I don't have anything else. >> > > I posted > https://patchwork.ozlabs.org/project/ovn/patch/20230814135401.2677054-1-dceara@redhat.com/ > I backported the commit and the NEWS update to branches 23.06 and 23.03. It didn't apply cleanly on older branches but I think that's fine for now. We can backport it further later if users request it. Regards, Dumitru
On 8/15/23 12:39, Dumitru Ceara wrote: > On 8/14/23 15:57, Dumitru Ceara wrote: >> On 8/12/23 07:19, Han Zhou wrote: >>> On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote: >>>> >>>> On 8/1/23 12:51, Han Zhou wrote: >>>>> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote: >>>>>> >>>>>> On 7/18/23 12:14, Han Zhou wrote: >>>>>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> >>> wrote: >>>>>>>> >>>>>>>> On 7/14/23 08:41, Ales Musil wrote: >>>>>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> >>>>> wrote: >>>>>>>>> >>>>>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the >>> netlink >>>>>>>>>> datapath). If the datapath supports l4 symmetrical dp-hash use >>> that >>>>>>> one >>>>>>>>>> instead. >>>>>>>>>> >>>>>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112 >>>>>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 >>>>>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com> >>>>>>>>>> >>>>>>>>> >>>>>>>>> Hi Dumitru, >>>>>>>>> >>>>>>>>> >>>>>>>>>> --- >>>>>>>>>> include/ovn/features.h | 2 ++ >>>>>>>>>> lib/actions.c | 6 ++++++ >>>>>>>>>> lib/features.c | 49 >>>>> +++++++++++++++++++++++++++++++++--------- >>>>>>>>>> 3 files changed, 47 insertions(+), 10 deletions(-) >>>>>>>>>> >>>>>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h >>>>>>>>>> index de8f1f5489..3bf536127f 100644 >>>>>>>>>> --- a/include/ovn/features.h >>>>>>>>>> +++ b/include/ovn/features.h >>>>>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { >>>>>>>>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT, >>>>>>>>>> OVS_DP_METER_SUPPORT_BIT, >>>>>>>>>> OVS_CT_TUPLE_FLUSH_BIT, >>>>>>>>>> + OVS_DP_HASH_L4_SYM_BIT, >>>>>>>>>> }; >>>>>>>>>> >>>>>>>>>> enum ovs_feature_value { >>>>>>>>>> OVS_CT_ZERO_SNAT_SUPPORT = (1 << >>> OVS_CT_ZERO_SNAT_SUPPORT_BIT), >>>>>>>>>> OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), >>>>>>>>>> OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), >>>>>>>>>> + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), >>>>>>>>>> }; >>>>>>>>>> >>>>>>>>>> void ovs_feature_support_destroy(void); >>>>>>>>>> diff --git a/lib/actions.c b/lib/actions.c >>>>>>>>>> index 037172e606..9d52cb75a8 100644 >>>>>>>>>> --- a/lib/actions.c >>>>>>>>>> +++ b/lib/actions.c >>>>>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select >>>>>>> *select, >>>>>>>>>> struct ds ds = DS_EMPTY_INITIALIZER; >>>>>>>>>> ds_put_format(&ds, "type=select,selection_method=dp_hash"); >>>>>>>>>> >>>>>>>>>> + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { >>>>>>>>>> + /* Select dp-hash l4_symmetric by setting the upper 32bits >>>>> of >>>>>>>>>> + * selection_method_param to 1: */ >>>>>>>>>> >>>>>>>>> >>>>>>>>> This comment is a bit unfortunate, because it reads like you want to >>>>> set >>>>>>>>> all bits of the upper half >>>>>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to >>>>>>> value >>>>>>>>> 1 (1 << 32)." during merge, wdyt? >>>>>>>>> >>>>>>>>> >>>>>>>> >>>>>>>> Makes sense, thanks for the suggestion! I changed it accordingly. >>>>>>>> >>>>>>>>>> + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); >>>>>>>>>> + } >>>>>>>>>> + >>>>>>>>>> struct mf_subfield sf = >>> expr_resolve_field(&select->res_field); >>>>>>>>>> >>>>>>>>>> for (size_t bucket_id = 0; bucket_id < select->n_dsts; >>>>>>> bucket_id++) { >>>>>>>>>> diff --git a/lib/features.c b/lib/features.c >>>>>>>>>> index 97c9c86f00..d24e8f6c5c 100644 >>>>>>>>>> --- a/lib/features.c >>>>>>>>>> +++ b/lib/features.c >>>>>>>>>> @@ -21,6 +21,7 @@ >>>>>>>>>> #include "lib/dirs.h" >>>>>>>>>> #include "socket-util.h" >>>>>>>>>> #include "lib/vswitch-idl.h" >>>>>>>>>> +#include "odp-netlink.h" >>>>>>>>>> #include "openvswitch/vlog.h" >>>>>>>>>> #include "openvswitch/ofpbuf.h" >>>>>>>>>> #include "openvswitch/rconn.h" >>>>>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); >>>>>>>>>> >>>>>>>>>> #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 >>>>>>>>>> >>>>>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether >>> the >>>>>>>>>> + * type of capability is supported or not. */ >>>>>>>>>> +typedef bool ovs_feature_parse_func(const struct smap >>>>>>> *ovs_capabilities, >>>>>>>>>> + const char *cap_name); >>>>>>>>>> + >>>>>>>>>> struct ovs_feature { >>>>>>>>>> enum ovs_feature_value value; >>>>>>>>>> const char *name; >>>>>>>>>> + ovs_feature_parse_func *parse; >>>>>>>>>> }; >>>>>>>>>> >>>>>>>>>> +static bool >>>>>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char >>>>> *cap_name) >>>>>>>>>> +{ >>>>>>>>>> + return smap_get_bool(ovs_capabilities, cap_name, false); >>>>>>>>>> +} >>>>>>>>>> + >>>>>>>>>> +static bool >>>>>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, >>>>>>>>>> + const char *cap_name OVS_UNUSED) >>>>>>>>>> +{ >>>>>>>>>> + int max_hash_alg = smap_get_int(ovs_capabilities, >>>>> "max_hash_alg", >>>>>>> 0); >>>>>>>>>> + >>>>>>>>>> + return max_hash_alg == OVS_HASH_ALG_SYM_L4; >>>>>>>>>> +} >>>>>>>>>> + >>>>>>>>>> static struct ovs_feature all_ovs_features[] = { >>>>>>>>>> { >>>>>>>>>> .value = OVS_CT_ZERO_SNAT_SUPPORT, >>>>>>>>>> - .name = "ct_zero_snat" >>>>>>>>>> + .name = "ct_zero_snat", >>>>>>>>>> + .parse = bool_parser, >>>>>>>>>> }, >>>>>>>>>> { >>>>>>>>>> .value = OVS_CT_TUPLE_FLUSH_SUPPORT, >>>>>>>>>> - .name = "ct_flush" >>>>>>>>>> - } >>>>>>>>>> + .name = "ct_flush", >>>>>>>>>> + .parse = bool_parser, >>>>>>>>>> + }, >>>>>>>>>> + { >>>>>>>>>> + .value = OVS_DP_HASH_L4_SYM_SUPPORT, >>>>>>>>>> + .name = "dp_hash_l4_sym_support", >>>>>>>>>> + .parse = dp_hash_l4_sym_support_parser, >>>>>>>>>> + }, >>>>>>>>>> }; >>>>>>>>>> >>>>>>>>>> /* A bitmap of OVS features that have been detected as >>> 'supported'. >>>>> */ >>>>>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value >>>>> feature) >>>>>>>>>> case OVS_CT_ZERO_SNAT_SUPPORT: >>>>>>>>>> case OVS_DP_METER_SUPPORT: >>>>>>>>>> case OVS_CT_TUPLE_FLUSH_SUPPORT: >>>>>>>>>> + case OVS_DP_HASH_L4_SYM_SUPPORT: >>>>>>>>>> return true; >>>>>>>>>> default: >>>>>>>>>> return false; >>>>>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap >>>>>>>>>> *ovs_capabilities, >>>>>>>>>> } >>>>>>>>>> >>>>>>>>>> for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { >>>>>>>>>> - enum ovs_feature_value value = all_ovs_features[i].value; >>>>>>>>>> - const char *name = all_ovs_features[i].name; >>>>>>>>>> - bool old_state = supported_ovs_features & value; >>>>>>>>>> - bool new_state = smap_get_bool(ovs_capabilities, name, >>>>> false); >>>>>>>>>> + struct ovs_feature *feature = &all_ovs_features[i]; >>>>>>>>>> + bool old_state = supported_ovs_features & feature->value; >>>>>>>>>> + bool new_state = feature->parse(ovs_capabilities, >>>>>>> feature->name); >>>>>>>>>> if (new_state != old_state) { >>>>>>>>>> updated = true; >>>>>>>>>> if (new_state) { >>>>>>>>>> - supported_ovs_features |= value; >>>>>>>>>> + supported_ovs_features |= feature->value; >>>>>>>>>> } else { >>>>>>>>>> - supported_ovs_features &= ~value; >>>>>>>>>> + supported_ovs_features &= ~feature->value; >>>>>>>>>> } >>>>>>>>>> - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, >>>>>>>>>> + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", >>>>>>> feature->name, >>>>>>>>>> new_state ? "supported" : "not >>> supported"); >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> -- >>>>>>>>>> 2.31.1 >>>>>>>>>> >>>>>>>>>> _______________________________________________ >>>>>>>>>> dev mailing list >>>>>>>>>> dev@openvswitch.org >>>>>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >>>>>>>>>> >>>>>>>>>> >>>>>>>>> Other than that it looks good. >>>>>>>>> >>>>>>>>> Acked-by: Ales Musil <amusil@redhat.com> >>>>>>>>> >>>>>>>> >>>>>>>> Thanks, I applied this to main. >>>>>>>> >>>>>>>> I was wondering if we should backport this though. It's not exactly >>> a >>>>>>>> bug fix but it does avoid using a broken datapath feature (when >>>>>>>> possible) so it avoids a broken behavior. >>>>>>>> >>>>>>>> The changes are quite contained. >>>>>>>> >>>>>>>> CC-ing some of the maintainers explicitly to get some more eyes on >>>>> this. >>>>>>>> >>>>>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried >>>>> to >>>>>>> look for some more details, but I am not authorized to access bug >>>>> 2175716. >>>>>> >>>>>> Sorry about that, that bug used to be public but some internal policies >>>>>> made it private. >>>>>> >>>>>>> It is not quite clear to me what the impact of this problem is. The >>> OVN >>>>>>> issue (112) seems to be a very common configuration of OVN ECMP and I >>>>>>> assume the problem is not easily reproduced because I have been >>>>>>> testing/deploying the feature and it worked as expected. Do you have >>>>> more >>>>>>> details about the trigger? Are there any related fixes in OVS and >>>>> kernel? >>>>>>> >>>>>> >>>>>> The original bug report came from OpenShift: >>>>>> https://issues.redhat.com/browse/OCPBUGS-7406 >>>>>> >>>>>> CC-ing Patryk as he root caused the original issue. >>>>>> >>>>>> The simplified topology is: >>>>>> +--- GW1 ---+ >>>>>> client ---- cluster-router ---+ +--- (out of OVN) --- server >>>>>> +--- GW2 ---+ >>>>>> >>>>>> On cluster-router there's an ECMP route to reach 'server' via: >>>>>> - GW1 >>>>>> - GW2 >>>>>> >>>>>> On GW1 traffic going out of OVN is SNATed to IP1. >>>>>> On GW2 traffic going out of OVN is SNATed to IP2. >>>>>> >>>>>> The client initiates a TCP session towards the server; a path is chosen >>>>>> based on dp-hash, e.g. GW2. >>>>>> >>>>>> From this point on, during normal operation, traffic on this session >>>>>> flows fine via GW2. >>>>>> >>>>>> If the client closes the connection, then for the closing connection >>>>>> there's an exception when computing the dp-hash if the socket is in >>>>>> state TIME_WAIT, substate TCP_FIN_WAIT_2: >>>>>> >>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218 >>>>>> In this last case we call: >>>>>> >>>>>> tcp_v4_timewait_ack(sk, skb) >>>>>> | >>>>>> tcp_v4_send_ack(sk, skb, ...) >>>>>> >>>>>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK >>> packet: >>>>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925 >>>>>> >>>>>> ctl_sk = this_cpu_read(ipv4_tcp_sk); >>>>>> sock_net_set(ctl_sk, net); >>>>>> ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? >>>>>> inet_twsk(sk)->tw_mark : sk->sk_mark; >>>>>> ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? >>>>>> inet_twsk(sk)->tw_priority : >>> sk->sk_priority; >>>>>> transmit_time = tcp_transmit_time(sk); >>>>>> ip_send_unicast_reply(ctl_sk, >>>>>> skb, &TCP_SKB_CB(skb)->header.h4.opt, >>>>>> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, >>>>>> &arg, arg.iov[0].iov_len, >>>>>> transmit_time); >>>>>> >>>>>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash >>> is >>>>>> computed from the packet contents: >>>>>> >>>>> >>> https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540 >>>>>> >>>>>> static inline __u32 skb_get_hash(struct sk_buff *skb) >>>>>> { >>>>>> if (!skb->l4_hash && !skb->sw_hash) >>>>>> __skb_get_hash(skb); >>>>>> >>>>>> return skb->hash; >>>>>> } >>>>>> [...] >>>>>> void __skb_get_hash(struct sk_buff *skb) >>>>>> { >>>>>> struct flow_keys keys; >>>>>> u32 hash; >>>>>> >>>>>> __flow_hash_secret_init(); >>>>>> >>>>>> hash = ___skb_get_hash(skb, &keys, &hashrnd); >>>>>> >>>>>> __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); >>>>>> } >>>>>> >>>>>> But the original socket's txhash was actually a random hash: >>>>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128 >>>>>> >>>>>> static inline void sk_set_txhash(struct sock *sk) >>>>>> { >>>>>> /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ >>>>>> WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); >>>>>> } >>>>>> >>>>>> This means that the last ack might enter the ovs datapath with a >>>>>> different dp_hash than all other packets sent on that session in that >>>>>> direction. It's possible that this last ack gets hashed to a different >>>>>> path of the ECMP route. >>>>>> >>>>>> This specific issue was fixed in the kernel through: >>>>>> >>>>>> >>>>> >>> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4 >>>>>> >>>>>> However, while debugging we realized the problem is actually visible >>>>>> with _any_ retransmit. That's because a retransmit will trigger a >>> rehash: >>>>>> >>>>>> >>> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124 >>>>>> >>>>>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) >>>>>> { >>>>>> const struct tcp_request_sock_ops *af_ops = >>>>> tcp_rsk(req)->af_specific; >>>>>> struct flowi fl; >>>>>> int res; >>>>>> >>>>>> /* Paired with WRITE_ONCE() in sock_setsockopt() */ >>>>>> if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) >>>>>> tcp_rsk(req)->txhash = net_tx_rndhash(); >>>>>> res = af_ops->send_synack(sk, NULL, &fl, req, NULL, >>>>> TCP_SYNACK_NORMAL, >>>>>> NULL); >>>>>> [...] >>>>>> >>>>>> So suddenly all subsequent packets (skbs) will have a different 'hash' >>>>>> value and might get hashed to a different path. >>>>>> >>>>>> This last part was not fixed. After a discussion on the netdev mailing >>>>>> list the conclusion was to not change the hash behavior for locally >>>>>> terminated TCP sessions: >>>>>> >>>>>> >>>>> >>> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd >>>>>> >>>>>> Eventually the documentation change patch was dropped and a v2 of the >>>>>> other fixes was accepted: >>>>>> https://www.spinics.net/lists/netdev/msg906252.html >>>>>> >>>>>> The rehash on retransmit problem is still present though making the >>>>>> kernel's dp-hash implementation unusable. >>>>>> >>>>>> During some internal discussions Aaron (in CC) pointed out that OVS >>>>>> supports L4 symmetric hashing and that should be a stable L4 hash. OVS >>>>>> already reports the maximum supported hashing algorithm through the >>>>>> Datapath.Capabilities.max_hash_alg ovsdb field. The only problem was >>>>>> that the kernel datapath didn't support L4 symmetric hashing. But >>> Aaron >>>>>> posted a patch to implement that and the patch got accepted: >>>>>> >>>>>> https://www.spinics.net/lists/netdev/msg911045.html >>>>>> >>>>>> Then we moved OVN to use L4 symmetric hashing if available. That's to >>>>>> ensure correctness of the packet processing. We did run performance >>>>>> tests and the hit due to the extra hashing was acceptable, max 3% >>>>>> throughput degradation in OVN scenarios that simulate common traffic >>>>>> patterns (like ovn-kubernetes ones) - in most cases the performance >>>>>> degradation was around 2%. >>>>>> >>>>>> Sorry for the long story but I hope it provides all the required >>>>>> context. Thinking more about it I should've probably added some of >>> this >>>>>> to the original commit log, I apologize. >>>>>> >>>>>> Regards, >>>>>> Dumitru >>>>>> >>>>> >>>>> Thanks Dumitru for the details! It is now quite clear and I am not >>> against >>>>> backporting it. >>>>> But I have a question regarding upgrading. For the existing traffic, >>> does >>>>> this mean the packets may choose a different path after OVN upgrading? >>>>> Should this be called out? >>>>> >>>> >>>> Hi Han, >>>> >>>> That's a good question. They might and this affects all traffic that's >>>> not locally originated for the kernel datapath and (potentially) all >>>> traffic for the userspace datapath. I can post a follow up patch to >>>> update the NEWS file. Did you have anything else in mind? >>>> >>> Sounds good. I don't have anything else. >>> >> >> I posted >> https://patchwork.ozlabs.org/project/ovn/patch/20230814135401.2677054-1-dceara@redhat.com/ >> > > I backported the commit and the NEWS update to branches 23.06 and 23.03. > It didn't apply cleanly on older branches but I think that's fine for > now. We can backport it further later if users request it. > We actually got an internal request to backport this to 22.12 so I resolved the conflicts (nothing major) and pushed these commits to branch-22.12 too. Regards, Dumitru
diff --git a/include/ovn/features.h b/include/ovn/features.h index de8f1f5489..3bf536127f 100644 --- a/include/ovn/features.h +++ b/include/ovn/features.h @@ -34,12 +34,14 @@ enum ovs_feature_support_bits { OVS_CT_ZERO_SNAT_SUPPORT_BIT, OVS_DP_METER_SUPPORT_BIT, OVS_CT_TUPLE_FLUSH_BIT, + OVS_DP_HASH_L4_SYM_BIT, }; enum ovs_feature_value { OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT), OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT), OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT), + OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT), }; void ovs_feature_support_destroy(void); diff --git a/lib/actions.c b/lib/actions.c index 037172e606..9d52cb75a8 100644 --- a/lib/actions.c +++ b/lib/actions.c @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select *select, struct ds ds = DS_EMPTY_INITIALIZER; ds_put_format(&ds, "type=select,selection_method=dp_hash"); + if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) { + /* Select dp-hash l4_symmetric by setting the upper 32bits of + * selection_method_param to 1: */ + ds_put_cstr(&ds, ",selection_method_param=0x100000000"); + } + struct mf_subfield sf = expr_resolve_field(&select->res_field); for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) { diff --git a/lib/features.c b/lib/features.c index 97c9c86f00..d24e8f6c5c 100644 --- a/lib/features.c +++ b/lib/features.c @@ -21,6 +21,7 @@ #include "lib/dirs.h" #include "socket-util.h" #include "lib/vswitch-idl.h" +#include "odp-netlink.h" #include "openvswitch/vlog.h" #include "openvswitch/ofpbuf.h" #include "openvswitch/rconn.h" @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features); #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5 +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the + * type of capability is supported or not. */ +typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities, + const char *cap_name); + struct ovs_feature { enum ovs_feature_value value; const char *name; + ovs_feature_parse_func *parse; }; +static bool +bool_parser(const struct smap *ovs_capabilities, const char *cap_name) +{ + return smap_get_bool(ovs_capabilities, cap_name, false); +} + +static bool +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities, + const char *cap_name OVS_UNUSED) +{ + int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0); + + return max_hash_alg == OVS_HASH_ALG_SYM_L4; +} + static struct ovs_feature all_ovs_features[] = { { .value = OVS_CT_ZERO_SNAT_SUPPORT, - .name = "ct_zero_snat" + .name = "ct_zero_snat", + .parse = bool_parser, }, { .value = OVS_CT_TUPLE_FLUSH_SUPPORT, - .name = "ct_flush" - } + .name = "ct_flush", + .parse = bool_parser, + }, + { + .value = OVS_DP_HASH_L4_SYM_SUPPORT, + .name = "dp_hash_l4_sym_support", + .parse = dp_hash_l4_sym_support_parser, + }, }; /* A bitmap of OVS features that have been detected as 'supported'. */ @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature) case OVS_CT_ZERO_SNAT_SUPPORT: case OVS_DP_METER_SUPPORT: case OVS_CT_TUPLE_FLUSH_SUPPORT: + case OVS_DP_HASH_L4_SYM_SUPPORT: return true; default: return false; @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap *ovs_capabilities, } for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) { - enum ovs_feature_value value = all_ovs_features[i].value; - const char *name = all_ovs_features[i].name; - bool old_state = supported_ovs_features & value; - bool new_state = smap_get_bool(ovs_capabilities, name, false); + struct ovs_feature *feature = &all_ovs_features[i]; + bool old_state = supported_ovs_features & feature->value; + bool new_state = feature->parse(ovs_capabilities, feature->name); if (new_state != old_state) { updated = true; if (new_state) { - supported_ovs_features |= value; + supported_ovs_features |= feature->value; } else { - supported_ovs_features &= ~value; + supported_ovs_features &= ~feature->value; } - VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name, + VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name, new_state ? "supported" : "not supported"); } }
Regular dp-hash is not a canonical L4 hash (at least with the netlink datapath). If the datapath supports l4 symmetrical dp-hash use that one instead. Reported-at: https://github.com/ovn-org/ovn/issues/112 Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679 Signed-off-by: Dumitru Ceara <dceara@redhat.com> --- include/ovn/features.h | 2 ++ lib/actions.c | 6 ++++++ lib/features.c | 49 +++++++++++++++++++++++++++++++++--------- 3 files changed, 47 insertions(+), 10 deletions(-)