diff mbox series

[ovs-dev] ovn-controller: Detect and use L4_SYM dp-hash if available.

Message ID 20230713143810.347542-1-dceara@redhat.com
State Accepted
Headers show
Series [ovs-dev] ovn-controller: Detect and use L4_SYM dp-hash if available. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Dumitru Ceara July 13, 2023, 2:38 p.m. UTC
Regular dp-hash is not a canonical L4 hash (at least with the netlink
datapath).  If the datapath supports l4 symmetrical dp-hash use that one
instead.

Reported-at: https://github.com/ovn-org/ovn/issues/112
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 include/ovn/features.h |  2 ++
 lib/actions.c          |  6 ++++++
 lib/features.c         | 49 +++++++++++++++++++++++++++++++++---------
 3 files changed, 47 insertions(+), 10 deletions(-)

Comments

Ales Musil July 14, 2023, 6:41 a.m. UTC | #1
On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote:

> Regular dp-hash is not a canonical L4 hash (at least with the netlink
> datapath).  If the datapath supports l4 symmetrical dp-hash use that one
> instead.
>
> Reported-at: https://github.com/ovn-org/ovn/issues/112
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>

Hi Dumitru,


> ---
>  include/ovn/features.h |  2 ++
>  lib/actions.c          |  6 ++++++
>  lib/features.c         | 49 +++++++++++++++++++++++++++++++++---------
>  3 files changed, 47 insertions(+), 10 deletions(-)
>
> diff --git a/include/ovn/features.h b/include/ovn/features.h
> index de8f1f5489..3bf536127f 100644
> --- a/include/ovn/features.h
> +++ b/include/ovn/features.h
> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>      OVS_DP_METER_SUPPORT_BIT,
>      OVS_CT_TUPLE_FLUSH_BIT,
> +    OVS_DP_HASH_L4_SYM_BIT,
>  };
>
>  enum ovs_feature_value {
>      OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>  };
>
>  void ovs_feature_support_destroy(void);
> diff --git a/lib/actions.c b/lib/actions.c
> index 037172e606..9d52cb75a8 100644
> --- a/lib/actions.c
> +++ b/lib/actions.c
> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select *select,
>      struct ds ds = DS_EMPTY_INITIALIZER;
>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>
> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
> +        /* Select dp-hash l4_symmetric by setting the upper 32bits of
> +         * selection_method_param to 1: */
>

This comment is a bit unfortunate, because it reads like you want to set
all bits of the upper half
to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to value
1 (1 << 32)." during merge, wdyt?


> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
> +    }
> +
>      struct mf_subfield sf = expr_resolve_field(&select->res_field);
>
>      for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) {
> diff --git a/lib/features.c b/lib/features.c
> index 97c9c86f00..d24e8f6c5c 100644
> --- a/lib/features.c
> +++ b/lib/features.c
> @@ -21,6 +21,7 @@
>  #include "lib/dirs.h"
>  #include "socket-util.h"
>  #include "lib/vswitch-idl.h"
> +#include "odp-netlink.h"
>  #include "openvswitch/vlog.h"
>  #include "openvswitch/ofpbuf.h"
>  #include "openvswitch/rconn.h"
> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>
>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>
> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
> + * type of capability is supported or not. */
> +typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities,
> +                                    const char *cap_name);
> +
>  struct ovs_feature {
>      enum ovs_feature_value value;
>      const char *name;
> +    ovs_feature_parse_func *parse;
>  };
>
> +static bool
> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name)
> +{
> +    return smap_get_bool(ovs_capabilities, cap_name, false);
> +}
> +
> +static bool
> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
> +                              const char *cap_name OVS_UNUSED)
> +{
> +    int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0);
> +
> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
> +}
> +
>  static struct ovs_feature all_ovs_features[] = {
>      {
>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
> -        .name = "ct_zero_snat"
> +        .name = "ct_zero_snat",
> +        .parse = bool_parser,
>      },
>      {
>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
> -        .name = "ct_flush"
> -    }
> +        .name = "ct_flush",
> +        .parse = bool_parser,
> +    },
> +    {
> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
> +        .name = "dp_hash_l4_sym_support",
> +        .parse = dp_hash_l4_sym_support_parser,
> +    },
>  };
>
>  /* A bitmap of OVS features that have been detected as 'supported'. */
> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature)
>      case OVS_CT_ZERO_SNAT_SUPPORT:
>      case OVS_DP_METER_SUPPORT:
>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>          return true;
>      default:
>          return false;
> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
> *ovs_capabilities,
>      }
>
>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
> -        enum ovs_feature_value value = all_ovs_features[i].value;
> -        const char *name = all_ovs_features[i].name;
> -        bool old_state = supported_ovs_features & value;
> -        bool new_state = smap_get_bool(ovs_capabilities, name, false);
> +        struct ovs_feature *feature = &all_ovs_features[i];
> +        bool old_state = supported_ovs_features & feature->value;
> +        bool new_state = feature->parse(ovs_capabilities, feature->name);
>          if (new_state != old_state) {
>              updated = true;
>              if (new_state) {
> -                supported_ovs_features |= value;
> +                supported_ovs_features |= feature->value;
>              } else {
> -                supported_ovs_features &= ~value;
> +                supported_ovs_features &= ~feature->value;
>              }
> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name,
>                           new_state ? "supported" : "not supported");
>          }
>      }
> --
> 2.31.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Other than that it looks good.

Acked-by: Ales Musil <amusil@redhat.com>
Dumitru Ceara July 17, 2023, 1:51 p.m. UTC | #2
On 7/14/23 08:41, Ales Musil wrote:
> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote:
> 
>> Regular dp-hash is not a canonical L4 hash (at least with the netlink
>> datapath).  If the datapath supports l4 symmetrical dp-hash use that one
>> instead.
>>
>> Reported-at: https://github.com/ovn-org/ovn/issues/112
>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>
> 
> Hi Dumitru,
> 
> 
>> ---
>>  include/ovn/features.h |  2 ++
>>  lib/actions.c          |  6 ++++++
>>  lib/features.c         | 49 +++++++++++++++++++++++++++++++++---------
>>  3 files changed, 47 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/ovn/features.h b/include/ovn/features.h
>> index de8f1f5489..3bf536127f 100644
>> --- a/include/ovn/features.h
>> +++ b/include/ovn/features.h
>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>>      OVS_DP_METER_SUPPORT_BIT,
>>      OVS_CT_TUPLE_FLUSH_BIT,
>> +    OVS_DP_HASH_L4_SYM_BIT,
>>  };
>>
>>  enum ovs_feature_value {
>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>>  };
>>
>>  void ovs_feature_support_destroy(void);
>> diff --git a/lib/actions.c b/lib/actions.c
>> index 037172e606..9d52cb75a8 100644
>> --- a/lib/actions.c
>> +++ b/lib/actions.c
>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select *select,
>>      struct ds ds = DS_EMPTY_INITIALIZER;
>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>>
>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits of
>> +         * selection_method_param to 1: */
>>
> 
> This comment is a bit unfortunate, because it reads like you want to set
> all bits of the upper half
> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to value
> 1 (1 << 32)." during merge, wdyt?
> 
> 

Makes sense, thanks for the suggestion!  I changed it accordingly.

>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
>> +    }
>> +
>>      struct mf_subfield sf = expr_resolve_field(&select->res_field);
>>
>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) {
>> diff --git a/lib/features.c b/lib/features.c
>> index 97c9c86f00..d24e8f6c5c 100644
>> --- a/lib/features.c
>> +++ b/lib/features.c
>> @@ -21,6 +21,7 @@
>>  #include "lib/dirs.h"
>>  #include "socket-util.h"
>>  #include "lib/vswitch-idl.h"
>> +#include "odp-netlink.h"
>>  #include "openvswitch/vlog.h"
>>  #include "openvswitch/ofpbuf.h"
>>  #include "openvswitch/rconn.h"
>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>>
>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>>
>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
>> + * type of capability is supported or not. */
>> +typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities,
>> +                                    const char *cap_name);
>> +
>>  struct ovs_feature {
>>      enum ovs_feature_value value;
>>      const char *name;
>> +    ovs_feature_parse_func *parse;
>>  };
>>
>> +static bool
>> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name)
>> +{
>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
>> +}
>> +
>> +static bool
>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
>> +                              const char *cap_name OVS_UNUSED)
>> +{
>> +    int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0);
>> +
>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
>> +}
>> +
>>  static struct ovs_feature all_ovs_features[] = {
>>      {
>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
>> -        .name = "ct_zero_snat"
>> +        .name = "ct_zero_snat",
>> +        .parse = bool_parser,
>>      },
>>      {
>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
>> -        .name = "ct_flush"
>> -    }
>> +        .name = "ct_flush",
>> +        .parse = bool_parser,
>> +    },
>> +    {
>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
>> +        .name = "dp_hash_l4_sym_support",
>> +        .parse = dp_hash_l4_sym_support_parser,
>> +    },
>>  };
>>
>>  /* A bitmap of OVS features that have been detected as 'supported'. */
>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature)
>>      case OVS_CT_ZERO_SNAT_SUPPORT:
>>      case OVS_DP_METER_SUPPORT:
>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>>          return true;
>>      default:
>>          return false;
>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
>> *ovs_capabilities,
>>      }
>>
>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
>> -        enum ovs_feature_value value = all_ovs_features[i].value;
>> -        const char *name = all_ovs_features[i].name;
>> -        bool old_state = supported_ovs_features & value;
>> -        bool new_state = smap_get_bool(ovs_capabilities, name, false);
>> +        struct ovs_feature *feature = &all_ovs_features[i];
>> +        bool old_state = supported_ovs_features & feature->value;
>> +        bool new_state = feature->parse(ovs_capabilities, feature->name);
>>          if (new_state != old_state) {
>>              updated = true;
>>              if (new_state) {
>> -                supported_ovs_features |= value;
>> +                supported_ovs_features |= feature->value;
>>              } else {
>> -                supported_ovs_features &= ~value;
>> +                supported_ovs_features &= ~feature->value;
>>              }
>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name,
>>                           new_state ? "supported" : "not supported");
>>          }
>>      }
>> --
>> 2.31.1
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
> Other than that it looks good.
> 
> Acked-by: Ales Musil <amusil@redhat.com>
> 

Thanks, I applied this to main.

I was wondering if we should backport this though.  It's not exactly a
bug fix but it does avoid using a broken datapath feature (when
possible) so it avoids a broken behavior.

The changes are quite contained.

CC-ing some of the maintainers explicitly to get some more eyes on this.

Regards,
Dumitru
Han Zhou July 18, 2023, 10:14 a.m. UTC | #3
On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 7/14/23 08:41, Ales Musil wrote:
> > On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote:
> >
> >> Regular dp-hash is not a canonical L4 hash (at least with the netlink
> >> datapath).  If the datapath supports l4 symmetrical dp-hash use that
one
> >> instead.
> >>
> >> Reported-at: https://github.com/ovn-org/ovn/issues/112
> >> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
> >> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> >>
> >
> > Hi Dumitru,
> >
> >
> >> ---
> >>  include/ovn/features.h |  2 ++
> >>  lib/actions.c          |  6 ++++++
> >>  lib/features.c         | 49 +++++++++++++++++++++++++++++++++---------
> >>  3 files changed, 47 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/include/ovn/features.h b/include/ovn/features.h
> >> index de8f1f5489..3bf536127f 100644
> >> --- a/include/ovn/features.h
> >> +++ b/include/ovn/features.h
> >> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
> >>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
> >>      OVS_DP_METER_SUPPORT_BIT,
> >>      OVS_CT_TUPLE_FLUSH_BIT,
> >> +    OVS_DP_HASH_L4_SYM_BIT,
> >>  };
> >>
> >>  enum ovs_feature_value {
> >>      OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
> >>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
> >>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
> >> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
> >>  };
> >>
> >>  void ovs_feature_support_destroy(void);
> >> diff --git a/lib/actions.c b/lib/actions.c
> >> index 037172e606..9d52cb75a8 100644
> >> --- a/lib/actions.c
> >> +++ b/lib/actions.c
> >> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
*select,
> >>      struct ds ds = DS_EMPTY_INITIALIZER;
> >>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
> >>
> >> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
> >> +        /* Select dp-hash l4_symmetric by setting the upper 32bits of
> >> +         * selection_method_param to 1: */
> >>
> >
> > This comment is a bit unfortunate, because it reads like you want to set
> > all bits of the upper half
> > to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
value
> > 1 (1 << 32)." during merge, wdyt?
> >
> >
>
> Makes sense, thanks for the suggestion!  I changed it accordingly.
>
> >> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
> >> +    }
> >> +
> >>      struct mf_subfield sf = expr_resolve_field(&select->res_field);
> >>
> >>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
bucket_id++) {
> >> diff --git a/lib/features.c b/lib/features.c
> >> index 97c9c86f00..d24e8f6c5c 100644
> >> --- a/lib/features.c
> >> +++ b/lib/features.c
> >> @@ -21,6 +21,7 @@
> >>  #include "lib/dirs.h"
> >>  #include "socket-util.h"
> >>  #include "lib/vswitch-idl.h"
> >> +#include "odp-netlink.h"
> >>  #include "openvswitch/vlog.h"
> >>  #include "openvswitch/ofpbuf.h"
> >>  #include "openvswitch/rconn.h"
> >> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
> >>
> >>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
> >>
> >> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
> >> + * type of capability is supported or not. */
> >> +typedef bool ovs_feature_parse_func(const struct smap
*ovs_capabilities,
> >> +                                    const char *cap_name);
> >> +
> >>  struct ovs_feature {
> >>      enum ovs_feature_value value;
> >>      const char *name;
> >> +    ovs_feature_parse_func *parse;
> >>  };
> >>
> >> +static bool
> >> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name)
> >> +{
> >> +    return smap_get_bool(ovs_capabilities, cap_name, false);
> >> +}
> >> +
> >> +static bool
> >> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
> >> +                              const char *cap_name OVS_UNUSED)
> >> +{
> >> +    int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg",
0);
> >> +
> >> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
> >> +}
> >> +
> >>  static struct ovs_feature all_ovs_features[] = {
> >>      {
> >>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
> >> -        .name = "ct_zero_snat"
> >> +        .name = "ct_zero_snat",
> >> +        .parse = bool_parser,
> >>      },
> >>      {
> >>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
> >> -        .name = "ct_flush"
> >> -    }
> >> +        .name = "ct_flush",
> >> +        .parse = bool_parser,
> >> +    },
> >> +    {
> >> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
> >> +        .name = "dp_hash_l4_sym_support",
> >> +        .parse = dp_hash_l4_sym_support_parser,
> >> +    },
> >>  };
> >>
> >>  /* A bitmap of OVS features that have been detected as 'supported'. */
> >> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature)
> >>      case OVS_CT_ZERO_SNAT_SUPPORT:
> >>      case OVS_DP_METER_SUPPORT:
> >>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
> >> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
> >>          return true;
> >>      default:
> >>          return false;
> >> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
> >> *ovs_capabilities,
> >>      }
> >>
> >>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
> >> -        enum ovs_feature_value value = all_ovs_features[i].value;
> >> -        const char *name = all_ovs_features[i].name;
> >> -        bool old_state = supported_ovs_features & value;
> >> -        bool new_state = smap_get_bool(ovs_capabilities, name, false);
> >> +        struct ovs_feature *feature = &all_ovs_features[i];
> >> +        bool old_state = supported_ovs_features & feature->value;
> >> +        bool new_state = feature->parse(ovs_capabilities,
feature->name);
> >>          if (new_state != old_state) {
> >>              updated = true;
> >>              if (new_state) {
> >> -                supported_ovs_features |= value;
> >> +                supported_ovs_features |= feature->value;
> >>              } else {
> >> -                supported_ovs_features &= ~value;
> >> +                supported_ovs_features &= ~feature->value;
> >>              }
> >> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
> >> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
feature->name,
> >>                           new_state ? "supported" : "not supported");
> >>          }
> >>      }
> >> --
> >> 2.31.1
> >>
> >> _______________________________________________
> >> dev mailing list
> >> dev@openvswitch.org
> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>
> >>
> > Other than that it looks good.
> >
> > Acked-by: Ales Musil <amusil@redhat.com>
> >
>
> Thanks, I applied this to main.
>
> I was wondering if we should backport this though.  It's not exactly a
> bug fix but it does avoid using a broken datapath feature (when
> possible) so it avoids a broken behavior.
>
> The changes are quite contained.
>
> CC-ing some of the maintainers explicitly to get some more eyes on this.
>
Thanks Dumitru. Sorry that I was following the reported issue. I tried to
look for some more details, but I am not authorized to access bug 2175716.
It is not quite clear to me what the impact of this problem is. The OVN
issue (112) seems to be a very common configuration of OVN ECMP and I
assume the problem is not easily reproduced because I have been
testing/deploying the feature and it worked as expected. Do you have more
details about the trigger? Are there any related fixes in OVS and kernel?

Regards,
Han

> Regards,
> Dumitru
>
>
>
>
Dumitru Ceara July 18, 2023, 1:16 p.m. UTC | #4
On 7/18/23 12:14, Han Zhou wrote:
> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> On 7/14/23 08:41, Ales Musil wrote:
>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com> wrote:
>>>
>>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink
>>>> datapath).  If the datapath supports l4 symmetrical dp-hash use that
> one
>>>> instead.
>>>>
>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>>
>>>
>>> Hi Dumitru,
>>>
>>>
>>>> ---
>>>>  include/ovn/features.h |  2 ++
>>>>  lib/actions.c          |  6 ++++++
>>>>  lib/features.c         | 49 +++++++++++++++++++++++++++++++++---------
>>>>  3 files changed, 47 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
>>>> index de8f1f5489..3bf536127f 100644
>>>> --- a/include/ovn/features.h
>>>> +++ b/include/ovn/features.h
>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>>>>      OVS_DP_METER_SUPPORT_BIT,
>>>>      OVS_CT_TUPLE_FLUSH_BIT,
>>>> +    OVS_DP_HASH_L4_SYM_BIT,
>>>>  };
>>>>
>>>>  enum ovs_feature_value {
>>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
>>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>>>>  };
>>>>
>>>>  void ovs_feature_support_destroy(void);
>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>> index 037172e606..9d52cb75a8 100644
>>>> --- a/lib/actions.c
>>>> +++ b/lib/actions.c
>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
> *select,
>>>>      struct ds ds = DS_EMPTY_INITIALIZER;
>>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>>>>
>>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
>>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits of
>>>> +         * selection_method_param to 1: */
>>>>
>>>
>>> This comment is a bit unfortunate, because it reads like you want to set
>>> all bits of the upper half
>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
> value
>>> 1 (1 << 32)." during merge, wdyt?
>>>
>>>
>>
>> Makes sense, thanks for the suggestion!  I changed it accordingly.
>>
>>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
>>>> +    }
>>>> +
>>>>      struct mf_subfield sf = expr_resolve_field(&select->res_field);
>>>>
>>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
> bucket_id++) {
>>>> diff --git a/lib/features.c b/lib/features.c
>>>> index 97c9c86f00..d24e8f6c5c 100644
>>>> --- a/lib/features.c
>>>> +++ b/lib/features.c
>>>> @@ -21,6 +21,7 @@
>>>>  #include "lib/dirs.h"
>>>>  #include "socket-util.h"
>>>>  #include "lib/vswitch-idl.h"
>>>> +#include "odp-netlink.h"
>>>>  #include "openvswitch/vlog.h"
>>>>  #include "openvswitch/ofpbuf.h"
>>>>  #include "openvswitch/rconn.h"
>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>>>>
>>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>>>>
>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
>>>> + * type of capability is supported or not. */
>>>> +typedef bool ovs_feature_parse_func(const struct smap
> *ovs_capabilities,
>>>> +                                    const char *cap_name);
>>>> +
>>>>  struct ovs_feature {
>>>>      enum ovs_feature_value value;
>>>>      const char *name;
>>>> +    ovs_feature_parse_func *parse;
>>>>  };
>>>>
>>>> +static bool
>>>> +bool_parser(const struct smap *ovs_capabilities, const char *cap_name)
>>>> +{
>>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
>>>> +}
>>>> +
>>>> +static bool
>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
>>>> +                              const char *cap_name OVS_UNUSED)
>>>> +{
>>>> +    int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg",
> 0);
>>>> +
>>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
>>>> +}
>>>> +
>>>>  static struct ovs_feature all_ovs_features[] = {
>>>>      {
>>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
>>>> -        .name = "ct_zero_snat"
>>>> +        .name = "ct_zero_snat",
>>>> +        .parse = bool_parser,
>>>>      },
>>>>      {
>>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
>>>> -        .name = "ct_flush"
>>>> -    }
>>>> +        .name = "ct_flush",
>>>> +        .parse = bool_parser,
>>>> +    },
>>>> +    {
>>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
>>>> +        .name = "dp_hash_l4_sym_support",
>>>> +        .parse = dp_hash_l4_sym_support_parser,
>>>> +    },
>>>>  };
>>>>
>>>>  /* A bitmap of OVS features that have been detected as 'supported'. */
>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value feature)
>>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
>>>>      case OVS_DP_METER_SUPPORT:
>>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
>>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>>>>          return true;
>>>>      default:
>>>>          return false;
>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
>>>> *ovs_capabilities,
>>>>      }
>>>>
>>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
>>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
>>>> -        const char *name = all_ovs_features[i].name;
>>>> -        bool old_state = supported_ovs_features & value;
>>>> -        bool new_state = smap_get_bool(ovs_capabilities, name, false);
>>>> +        struct ovs_feature *feature = &all_ovs_features[i];
>>>> +        bool old_state = supported_ovs_features & feature->value;
>>>> +        bool new_state = feature->parse(ovs_capabilities,
> feature->name);
>>>>          if (new_state != old_state) {
>>>>              updated = true;
>>>>              if (new_state) {
>>>> -                supported_ovs_features |= value;
>>>> +                supported_ovs_features |= feature->value;
>>>>              } else {
>>>> -                supported_ovs_features &= ~value;
>>>> +                supported_ovs_features &= ~feature->value;
>>>>              }
>>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
>>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
> feature->name,
>>>>                           new_state ? "supported" : "not supported");
>>>>          }
>>>>      }
>>>> --
>>>> 2.31.1
>>>>
>>>> _______________________________________________
>>>> dev mailing list
>>>> dev@openvswitch.org
>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>>
>>>>
>>> Other than that it looks good.
>>>
>>> Acked-by: Ales Musil <amusil@redhat.com>
>>>
>>
>> Thanks, I applied this to main.
>>
>> I was wondering if we should backport this though.  It's not exactly a
>> bug fix but it does avoid using a broken datapath feature (when
>> possible) so it avoids a broken behavior.
>>
>> The changes are quite contained.
>>
>> CC-ing some of the maintainers explicitly to get some more eyes on this.
>>
> Thanks Dumitru. Sorry that I was following the reported issue. I tried to
> look for some more details, but I am not authorized to access bug 2175716.

Sorry about that, that bug used to be public but some internal policies
made it private.

> It is not quite clear to me what the impact of this problem is. The OVN
> issue (112) seems to be a very common configuration of OVN ECMP and I
> assume the problem is not easily reproduced because I have been
> testing/deploying the feature and it worked as expected. Do you have more
> details about the trigger? Are there any related fixes in OVS and kernel?
> 

The original bug report came from OpenShift:
https://issues.redhat.com/browse/OCPBUGS-7406

CC-ing Patryk as he root caused the original issue.

The simplified topology is:
                              +--- GW1 ---+
client ---- cluster-router ---+           +--- (out of OVN) --- server
                              +--- GW2 ---+

On cluster-router there's an ECMP route to reach 'server' via:
- GW1
- GW2

On GW1 traffic going out of OVN is SNATed to IP1.
On GW2 traffic going out of OVN is SNATed to IP2.

The client initiates a TCP session towards the server; a path is chosen
based on dp-hash, e.g. GW2.

From this point on, during normal operation, traffic on this session
flows fine via GW2.

If the client closes the connection, then for the closing connection
there's an exception when computing the dp-hash if the socket is in
state TIME_WAIT, substate TCP_FIN_WAIT_2:
https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
In this last case we call:

tcp_v4_timewait_ack(sk, skb)
|
tcp_v4_send_ack(sk, skb, ...)

The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet:
https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925

	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is
computed from the packet contents:
https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540

static inline __u32 skb_get_hash(struct sk_buff *skb)
{
	if (!skb->l4_hash && !skb->sw_hash)
		__skb_get_hash(skb);

	return skb->hash;
}
[...]
void __skb_get_hash(struct sk_buff *skb)
{
	struct flow_keys keys;
	u32 hash;

	__flow_hash_secret_init();

	hash = ___skb_get_hash(skb, &keys, &hashrnd);

	__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}

But the original socket's txhash was actually a random hash:
https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128

static inline void sk_set_txhash(struct sock *sk)
{
	/* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
	WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

This means that the last ack might enter the ovs datapath with a
different dp_hash than all other packets sent on that session in that
direction. It's possible that this last ack gets hashed to a different
path of the ECMP route.

This specific issue was fixed in the kernel through:

https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4

However, while debugging we realized the problem is actually visible
with _any_ retransmit.  That's because a retransmit will trigger a rehash:

https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124

int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
	struct flowi fl;
	int res;

	/* Paired with WRITE_ONCE() in sock_setsockopt() */
	if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
		tcp_rsk(req)->txhash = net_tx_rndhash();
	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
				  NULL);
[...]

So suddenly all subsequent packets (skbs) will have a different 'hash'
value and might get hashed to a different path.

This last part was not fixed.  After a discussion on the netdev mailing
list the conclusion was to not change the hash behavior for locally
terminated TCP sessions:

https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd

Eventually the documentation change patch was dropped and a v2 of the
other fixes was accepted:
https://www.spinics.net/lists/netdev/msg906252.html

The rehash on retransmit problem is still present though making the
kernel's dp-hash implementation unusable.

During some internal discussions Aaron (in CC) pointed out that OVS
supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
already reports the maximum supported hashing algorithm through the
Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
that the kernel datapath didn't support L4 symmetric hashing.  But Aaron
posted a patch to implement that and the patch got accepted:

https://www.spinics.net/lists/netdev/msg911045.html

Then we moved OVN to use L4 symmetric hashing if available.  That's to
ensure correctness of the packet processing.  We did run performance
tests and the hit due to the extra hashing was acceptable, max 3%
throughput degradation in OVN scenarios that simulate common traffic
patterns (like ovn-kubernetes ones) - in most cases the performance
degradation was around 2%.

Sorry for the long story but I hope it provides all the required
context.  Thinking more about it I should've probably added some of this
to the original commit log, I apologize.

Regards,
Dumitru
Han Zhou Aug. 1, 2023, 10:51 a.m. UTC | #5
On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 7/18/23 12:14, Han Zhou wrote:
> > On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote:
> >>
> >> On 7/14/23 08:41, Ales Musil wrote:
> >>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com>
wrote:
> >>>
> >>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink
> >>>> datapath).  If the datapath supports l4 symmetrical dp-hash use that
> > one
> >>>> instead.
> >>>>
> >>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
> >>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
> >>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> >>>>
> >>>
> >>> Hi Dumitru,
> >>>
> >>>
> >>>> ---
> >>>>  include/ovn/features.h |  2 ++
> >>>>  lib/actions.c          |  6 ++++++
> >>>>  lib/features.c         | 49
+++++++++++++++++++++++++++++++++---------
> >>>>  3 files changed, 47 insertions(+), 10 deletions(-)
> >>>>
> >>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
> >>>> index de8f1f5489..3bf536127f 100644
> >>>> --- a/include/ovn/features.h
> >>>> +++ b/include/ovn/features.h
> >>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
> >>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
> >>>>      OVS_DP_METER_SUPPORT_BIT,
> >>>>      OVS_CT_TUPLE_FLUSH_BIT,
> >>>> +    OVS_DP_HASH_L4_SYM_BIT,
> >>>>  };
> >>>>
> >>>>  enum ovs_feature_value {
> >>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
> >>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
> >>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
> >>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
> >>>>  };
> >>>>
> >>>>  void ovs_feature_support_destroy(void);
> >>>> diff --git a/lib/actions.c b/lib/actions.c
> >>>> index 037172e606..9d52cb75a8 100644
> >>>> --- a/lib/actions.c
> >>>> +++ b/lib/actions.c
> >>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
> > *select,
> >>>>      struct ds ds = DS_EMPTY_INITIALIZER;
> >>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
> >>>>
> >>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
> >>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits
of
> >>>> +         * selection_method_param to 1: */
> >>>>
> >>>
> >>> This comment is a bit unfortunate, because it reads like you want to
set
> >>> all bits of the upper half
> >>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
> > value
> >>> 1 (1 << 32)." during merge, wdyt?
> >>>
> >>>
> >>
> >> Makes sense, thanks for the suggestion!  I changed it accordingly.
> >>
> >>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
> >>>> +    }
> >>>> +
> >>>>      struct mf_subfield sf = expr_resolve_field(&select->res_field);
> >>>>
> >>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
> > bucket_id++) {
> >>>> diff --git a/lib/features.c b/lib/features.c
> >>>> index 97c9c86f00..d24e8f6c5c 100644
> >>>> --- a/lib/features.c
> >>>> +++ b/lib/features.c
> >>>> @@ -21,6 +21,7 @@
> >>>>  #include "lib/dirs.h"
> >>>>  #include "socket-util.h"
> >>>>  #include "lib/vswitch-idl.h"
> >>>> +#include "odp-netlink.h"
> >>>>  #include "openvswitch/vlog.h"
> >>>>  #include "openvswitch/ofpbuf.h"
> >>>>  #include "openvswitch/rconn.h"
> >>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
> >>>>
> >>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
> >>>>
> >>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
> >>>> + * type of capability is supported or not. */
> >>>> +typedef bool ovs_feature_parse_func(const struct smap
> > *ovs_capabilities,
> >>>> +                                    const char *cap_name);
> >>>> +
> >>>>  struct ovs_feature {
> >>>>      enum ovs_feature_value value;
> >>>>      const char *name;
> >>>> +    ovs_feature_parse_func *parse;
> >>>>  };
> >>>>
> >>>> +static bool
> >>>> +bool_parser(const struct smap *ovs_capabilities, const char
*cap_name)
> >>>> +{
> >>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
> >>>> +}
> >>>> +
> >>>> +static bool
> >>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
> >>>> +                              const char *cap_name OVS_UNUSED)
> >>>> +{
> >>>> +    int max_hash_alg = smap_get_int(ovs_capabilities,
"max_hash_alg",
> > 0);
> >>>> +
> >>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
> >>>> +}
> >>>> +
> >>>>  static struct ovs_feature all_ovs_features[] = {
> >>>>      {
> >>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
> >>>> -        .name = "ct_zero_snat"
> >>>> +        .name = "ct_zero_snat",
> >>>> +        .parse = bool_parser,
> >>>>      },
> >>>>      {
> >>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
> >>>> -        .name = "ct_flush"
> >>>> -    }
> >>>> +        .name = "ct_flush",
> >>>> +        .parse = bool_parser,
> >>>> +    },
> >>>> +    {
> >>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
> >>>> +        .name = "dp_hash_l4_sym_support",
> >>>> +        .parse = dp_hash_l4_sym_support_parser,
> >>>> +    },
> >>>>  };
> >>>>
> >>>>  /* A bitmap of OVS features that have been detected as 'supported'.
*/
> >>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value
feature)
> >>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
> >>>>      case OVS_DP_METER_SUPPORT:
> >>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
> >>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
> >>>>          return true;
> >>>>      default:
> >>>>          return false;
> >>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
> >>>> *ovs_capabilities,
> >>>>      }
> >>>>
> >>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
> >>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
> >>>> -        const char *name = all_ovs_features[i].name;
> >>>> -        bool old_state = supported_ovs_features & value;
> >>>> -        bool new_state = smap_get_bool(ovs_capabilities, name,
false);
> >>>> +        struct ovs_feature *feature = &all_ovs_features[i];
> >>>> +        bool old_state = supported_ovs_features & feature->value;
> >>>> +        bool new_state = feature->parse(ovs_capabilities,
> > feature->name);
> >>>>          if (new_state != old_state) {
> >>>>              updated = true;
> >>>>              if (new_state) {
> >>>> -                supported_ovs_features |= value;
> >>>> +                supported_ovs_features |= feature->value;
> >>>>              } else {
> >>>> -                supported_ovs_features &= ~value;
> >>>> +                supported_ovs_features &= ~feature->value;
> >>>>              }
> >>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
> >>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
> > feature->name,
> >>>>                           new_state ? "supported" : "not supported");
> >>>>          }
> >>>>      }
> >>>> --
> >>>> 2.31.1
> >>>>
> >>>> _______________________________________________
> >>>> dev mailing list
> >>>> dev@openvswitch.org
> >>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>>>
> >>>>
> >>> Other than that it looks good.
> >>>
> >>> Acked-by: Ales Musil <amusil@redhat.com>
> >>>
> >>
> >> Thanks, I applied this to main.
> >>
> >> I was wondering if we should backport this though.  It's not exactly a
> >> bug fix but it does avoid using a broken datapath feature (when
> >> possible) so it avoids a broken behavior.
> >>
> >> The changes are quite contained.
> >>
> >> CC-ing some of the maintainers explicitly to get some more eyes on
this.
> >>
> > Thanks Dumitru. Sorry that I was following the reported issue. I tried
to
> > look for some more details, but I am not authorized to access bug
2175716.
>
> Sorry about that, that bug used to be public but some internal policies
> made it private.
>
> > It is not quite clear to me what the impact of this problem is. The OVN
> > issue (112) seems to be a very common configuration of OVN ECMP and I
> > assume the problem is not easily reproduced because I have been
> > testing/deploying the feature and it worked as expected. Do you have
more
> > details about the trigger? Are there any related fixes in OVS and
kernel?
> >
>
> The original bug report came from OpenShift:
> https://issues.redhat.com/browse/OCPBUGS-7406
>
> CC-ing Patryk as he root caused the original issue.
>
> The simplified topology is:
>                               +--- GW1 ---+
> client ---- cluster-router ---+           +--- (out of OVN) --- server
>                               +--- GW2 ---+
>
> On cluster-router there's an ECMP route to reach 'server' via:
> - GW1
> - GW2
>
> On GW1 traffic going out of OVN is SNATed to IP1.
> On GW2 traffic going out of OVN is SNATed to IP2.
>
> The client initiates a TCP session towards the server; a path is chosen
> based on dp-hash, e.g. GW2.
>
> From this point on, during normal operation, traffic on this session
> flows fine via GW2.
>
> If the client closes the connection, then for the closing connection
> there's an exception when computing the dp-hash if the socket is in
> state TIME_WAIT, substate TCP_FIN_WAIT_2:
> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
> In this last case we call:
>
> tcp_v4_timewait_ack(sk, skb)
> |
> tcp_v4_send_ack(sk, skb, ...)
>
> The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet:
> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925
>
>         ctl_sk = this_cpu_read(ipv4_tcp_sk);
>         sock_net_set(ctl_sk, net);
>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>                            inet_twsk(sk)->tw_mark : sk->sk_mark;
>         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
>                            inet_twsk(sk)->tw_priority : sk->sk_priority;
>         transmit_time = tcp_transmit_time(sk);
>         ip_send_unicast_reply(ctl_sk,
>                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
>                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>                               &arg, arg.iov[0].iov_len,
>                               transmit_time);
>
> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is
> computed from the packet contents:
>
https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540
>
> static inline __u32 skb_get_hash(struct sk_buff *skb)
> {
>         if (!skb->l4_hash && !skb->sw_hash)
>                 __skb_get_hash(skb);
>
>         return skb->hash;
> }
> [...]
> void __skb_get_hash(struct sk_buff *skb)
> {
>         struct flow_keys keys;
>         u32 hash;
>
>         __flow_hash_secret_init();
>
>         hash = ___skb_get_hash(skb, &keys, &hashrnd);
>
>         __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
> }
>
> But the original socket's txhash was actually a random hash:
> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128
>
> static inline void sk_set_txhash(struct sock *sk)
> {
>         /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
>         WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
> }
>
> This means that the last ack might enter the ovs datapath with a
> different dp_hash than all other packets sent on that session in that
> direction. It's possible that this last ack gets hashed to a different
> path of the ECMP route.
>
> This specific issue was fixed in the kernel through:
>
>
https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4
>
> However, while debugging we realized the problem is actually visible
> with _any_ retransmit.  That's because a retransmit will trigger a rehash:
>
> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124
>
> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
> {
>         const struct tcp_request_sock_ops *af_ops =
tcp_rsk(req)->af_specific;
>         struct flowi fl;
>         int res;
>
>         /* Paired with WRITE_ONCE() in sock_setsockopt() */
>         if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
>                 tcp_rsk(req)->txhash = net_tx_rndhash();
>         res = af_ops->send_synack(sk, NULL, &fl, req, NULL,
TCP_SYNACK_NORMAL,
>                                   NULL);
> [...]
>
> So suddenly all subsequent packets (skbs) will have a different 'hash'
> value and might get hashed to a different path.
>
> This last part was not fixed.  After a discussion on the netdev mailing
> list the conclusion was to not change the hash behavior for locally
> terminated TCP sessions:
>
>
https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd
>
> Eventually the documentation change patch was dropped and a v2 of the
> other fixes was accepted:
> https://www.spinics.net/lists/netdev/msg906252.html
>
> The rehash on retransmit problem is still present though making the
> kernel's dp-hash implementation unusable.
>
> During some internal discussions Aaron (in CC) pointed out that OVS
> supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
> already reports the maximum supported hashing algorithm through the
> Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
> that the kernel datapath didn't support L4 symmetric hashing.  But Aaron
> posted a patch to implement that and the patch got accepted:
>
> https://www.spinics.net/lists/netdev/msg911045.html
>
> Then we moved OVN to use L4 symmetric hashing if available.  That's to
> ensure correctness of the packet processing.  We did run performance
> tests and the hit due to the extra hashing was acceptable, max 3%
> throughput degradation in OVN scenarios that simulate common traffic
> patterns (like ovn-kubernetes ones) - in most cases the performance
> degradation was around 2%.
>
> Sorry for the long story but I hope it provides all the required
> context.  Thinking more about it I should've probably added some of this
> to the original commit log, I apologize.
>
> Regards,
> Dumitru
>

Thanks Dumitru for the details! It is now quite clear and I am not against
backporting it.
But I have a question regarding upgrading. For the existing traffic, does
this mean the packets may choose a different path after OVN upgrading?
Should this be called out?

Regards,
Han
Dumitru Ceara Aug. 11, 2023, 1:23 p.m. UTC | #6
On 8/1/23 12:51, Han Zhou wrote:
> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> On 7/18/23 12:14, Han Zhou wrote:
>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com> wrote:
>>>>
>>>> On 7/14/23 08:41, Ales Musil wrote:
>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com>
> wrote:
>>>>>
>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the netlink
>>>>>> datapath).  If the datapath supports l4 symmetrical dp-hash use that
>>> one
>>>>>> instead.
>>>>>>
>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>>>>
>>>>>
>>>>> Hi Dumitru,
>>>>>
>>>>>
>>>>>> ---
>>>>>>  include/ovn/features.h |  2 ++
>>>>>>  lib/actions.c          |  6 ++++++
>>>>>>  lib/features.c         | 49
> +++++++++++++++++++++++++++++++++---------
>>>>>>  3 files changed, 47 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
>>>>>> index de8f1f5489..3bf536127f 100644
>>>>>> --- a/include/ovn/features.h
>>>>>> +++ b/include/ovn/features.h
>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>>>>>>      OVS_DP_METER_SUPPORT_BIT,
>>>>>>      OVS_CT_TUPLE_FLUSH_BIT,
>>>>>> +    OVS_DP_HASH_L4_SYM_BIT,
>>>>>>  };
>>>>>>
>>>>>>  enum ovs_feature_value {
>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>>>>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>>>>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
>>>>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>>>>>>  };
>>>>>>
>>>>>>  void ovs_feature_support_destroy(void);
>>>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>>>> index 037172e606..9d52cb75a8 100644
>>>>>> --- a/lib/actions.c
>>>>>> +++ b/lib/actions.c
>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
>>> *select,
>>>>>>      struct ds ds = DS_EMPTY_INITIALIZER;
>>>>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>>>>>>
>>>>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
>>>>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits
> of
>>>>>> +         * selection_method_param to 1: */
>>>>>>
>>>>>
>>>>> This comment is a bit unfortunate, because it reads like you want to
> set
>>>>> all bits of the upper half
>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
>>> value
>>>>> 1 (1 << 32)." during merge, wdyt?
>>>>>
>>>>>
>>>>
>>>> Makes sense, thanks for the suggestion!  I changed it accordingly.
>>>>
>>>>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
>>>>>> +    }
>>>>>> +
>>>>>>      struct mf_subfield sf = expr_resolve_field(&select->res_field);
>>>>>>
>>>>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
>>> bucket_id++) {
>>>>>> diff --git a/lib/features.c b/lib/features.c
>>>>>> index 97c9c86f00..d24e8f6c5c 100644
>>>>>> --- a/lib/features.c
>>>>>> +++ b/lib/features.c
>>>>>> @@ -21,6 +21,7 @@
>>>>>>  #include "lib/dirs.h"
>>>>>>  #include "socket-util.h"
>>>>>>  #include "lib/vswitch-idl.h"
>>>>>> +#include "odp-netlink.h"
>>>>>>  #include "openvswitch/vlog.h"
>>>>>>  #include "openvswitch/ofpbuf.h"
>>>>>>  #include "openvswitch/rconn.h"
>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>>>>>>
>>>>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>>>>>>
>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
>>>>>> + * type of capability is supported or not. */
>>>>>> +typedef bool ovs_feature_parse_func(const struct smap
>>> *ovs_capabilities,
>>>>>> +                                    const char *cap_name);
>>>>>> +
>>>>>>  struct ovs_feature {
>>>>>>      enum ovs_feature_value value;
>>>>>>      const char *name;
>>>>>> +    ovs_feature_parse_func *parse;
>>>>>>  };
>>>>>>
>>>>>> +static bool
>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char
> *cap_name)
>>>>>> +{
>>>>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
>>>>>> +}
>>>>>> +
>>>>>> +static bool
>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
>>>>>> +                              const char *cap_name OVS_UNUSED)
>>>>>> +{
>>>>>> +    int max_hash_alg = smap_get_int(ovs_capabilities,
> "max_hash_alg",
>>> 0);
>>>>>> +
>>>>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
>>>>>> +}
>>>>>> +
>>>>>>  static struct ovs_feature all_ovs_features[] = {
>>>>>>      {
>>>>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
>>>>>> -        .name = "ct_zero_snat"
>>>>>> +        .name = "ct_zero_snat",
>>>>>> +        .parse = bool_parser,
>>>>>>      },
>>>>>>      {
>>>>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
>>>>>> -        .name = "ct_flush"
>>>>>> -    }
>>>>>> +        .name = "ct_flush",
>>>>>> +        .parse = bool_parser,
>>>>>> +    },
>>>>>> +    {
>>>>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
>>>>>> +        .name = "dp_hash_l4_sym_support",
>>>>>> +        .parse = dp_hash_l4_sym_support_parser,
>>>>>> +    },
>>>>>>  };
>>>>>>
>>>>>>  /* A bitmap of OVS features that have been detected as 'supported'.
> */
>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value
> feature)
>>>>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
>>>>>>      case OVS_DP_METER_SUPPORT:
>>>>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
>>>>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>>>>>>          return true;
>>>>>>      default:
>>>>>>          return false;
>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
>>>>>> *ovs_capabilities,
>>>>>>      }
>>>>>>
>>>>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
>>>>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
>>>>>> -        const char *name = all_ovs_features[i].name;
>>>>>> -        bool old_state = supported_ovs_features & value;
>>>>>> -        bool new_state = smap_get_bool(ovs_capabilities, name,
> false);
>>>>>> +        struct ovs_feature *feature = &all_ovs_features[i];
>>>>>> +        bool old_state = supported_ovs_features & feature->value;
>>>>>> +        bool new_state = feature->parse(ovs_capabilities,
>>> feature->name);
>>>>>>          if (new_state != old_state) {
>>>>>>              updated = true;
>>>>>>              if (new_state) {
>>>>>> -                supported_ovs_features |= value;
>>>>>> +                supported_ovs_features |= feature->value;
>>>>>>              } else {
>>>>>> -                supported_ovs_features &= ~value;
>>>>>> +                supported_ovs_features &= ~feature->value;
>>>>>>              }
>>>>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
>>>>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
>>> feature->name,
>>>>>>                           new_state ? "supported" : "not supported");
>>>>>>          }
>>>>>>      }
>>>>>> --
>>>>>> 2.31.1
>>>>>>
>>>>>> _______________________________________________
>>>>>> dev mailing list
>>>>>> dev@openvswitch.org
>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>>>>
>>>>>>
>>>>> Other than that it looks good.
>>>>>
>>>>> Acked-by: Ales Musil <amusil@redhat.com>
>>>>>
>>>>
>>>> Thanks, I applied this to main.
>>>>
>>>> I was wondering if we should backport this though.  It's not exactly a
>>>> bug fix but it does avoid using a broken datapath feature (when
>>>> possible) so it avoids a broken behavior.
>>>>
>>>> The changes are quite contained.
>>>>
>>>> CC-ing some of the maintainers explicitly to get some more eyes on
> this.
>>>>
>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried
> to
>>> look for some more details, but I am not authorized to access bug
> 2175716.
>>
>> Sorry about that, that bug used to be public but some internal policies
>> made it private.
>>
>>> It is not quite clear to me what the impact of this problem is. The OVN
>>> issue (112) seems to be a very common configuration of OVN ECMP and I
>>> assume the problem is not easily reproduced because I have been
>>> testing/deploying the feature and it worked as expected. Do you have
> more
>>> details about the trigger? Are there any related fixes in OVS and
> kernel?
>>>
>>
>> The original bug report came from OpenShift:
>> https://issues.redhat.com/browse/OCPBUGS-7406
>>
>> CC-ing Patryk as he root caused the original issue.
>>
>> The simplified topology is:
>>                               +--- GW1 ---+
>> client ---- cluster-router ---+           +--- (out of OVN) --- server
>>                               +--- GW2 ---+
>>
>> On cluster-router there's an ECMP route to reach 'server' via:
>> - GW1
>> - GW2
>>
>> On GW1 traffic going out of OVN is SNATed to IP1.
>> On GW2 traffic going out of OVN is SNATed to IP2.
>>
>> The client initiates a TCP session towards the server; a path is chosen
>> based on dp-hash, e.g. GW2.
>>
>> From this point on, during normal operation, traffic on this session
>> flows fine via GW2.
>>
>> If the client closes the connection, then for the closing connection
>> there's an exception when computing the dp-hash if the socket is in
>> state TIME_WAIT, substate TCP_FIN_WAIT_2:
>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
>> In this last case we call:
>>
>> tcp_v4_timewait_ack(sk, skb)
>> |
>> tcp_v4_send_ack(sk, skb, ...)
>>
>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK packet:
>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925
>>
>>         ctl_sk = this_cpu_read(ipv4_tcp_sk);
>>         sock_net_set(ctl_sk, net);
>>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>>                            inet_twsk(sk)->tw_mark : sk->sk_mark;
>>         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
>>                            inet_twsk(sk)->tw_priority : sk->sk_priority;
>>         transmit_time = tcp_transmit_time(sk);
>>         ip_send_unicast_reply(ctl_sk,
>>                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
>>                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>>                               &arg, arg.iov[0].iov_len,
>>                               transmit_time);
>>
>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash is
>> computed from the packet contents:
>>
> https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540
>>
>> static inline __u32 skb_get_hash(struct sk_buff *skb)
>> {
>>         if (!skb->l4_hash && !skb->sw_hash)
>>                 __skb_get_hash(skb);
>>
>>         return skb->hash;
>> }
>> [...]
>> void __skb_get_hash(struct sk_buff *skb)
>> {
>>         struct flow_keys keys;
>>         u32 hash;
>>
>>         __flow_hash_secret_init();
>>
>>         hash = ___skb_get_hash(skb, &keys, &hashrnd);
>>
>>         __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
>> }
>>
>> But the original socket's txhash was actually a random hash:
>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128
>>
>> static inline void sk_set_txhash(struct sock *sk)
>> {
>>         /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
>>         WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
>> }
>>
>> This means that the last ack might enter the ovs datapath with a
>> different dp_hash than all other packets sent on that session in that
>> direction. It's possible that this last ack gets hashed to a different
>> path of the ECMP route.
>>
>> This specific issue was fixed in the kernel through:
>>
>>
> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4
>>
>> However, while debugging we realized the problem is actually visible
>> with _any_ retransmit.  That's because a retransmit will trigger a rehash:
>>
>> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124
>>
>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
>> {
>>         const struct tcp_request_sock_ops *af_ops =
> tcp_rsk(req)->af_specific;
>>         struct flowi fl;
>>         int res;
>>
>>         /* Paired with WRITE_ONCE() in sock_setsockopt() */
>>         if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
>>                 tcp_rsk(req)->txhash = net_tx_rndhash();
>>         res = af_ops->send_synack(sk, NULL, &fl, req, NULL,
> TCP_SYNACK_NORMAL,
>>                                   NULL);
>> [...]
>>
>> So suddenly all subsequent packets (skbs) will have a different 'hash'
>> value and might get hashed to a different path.
>>
>> This last part was not fixed.  After a discussion on the netdev mailing
>> list the conclusion was to not change the hash behavior for locally
>> terminated TCP sessions:
>>
>>
> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd
>>
>> Eventually the documentation change patch was dropped and a v2 of the
>> other fixes was accepted:
>> https://www.spinics.net/lists/netdev/msg906252.html
>>
>> The rehash on retransmit problem is still present though making the
>> kernel's dp-hash implementation unusable.
>>
>> During some internal discussions Aaron (in CC) pointed out that OVS
>> supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
>> already reports the maximum supported hashing algorithm through the
>> Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
>> that the kernel datapath didn't support L4 symmetric hashing.  But Aaron
>> posted a patch to implement that and the patch got accepted:
>>
>> https://www.spinics.net/lists/netdev/msg911045.html
>>
>> Then we moved OVN to use L4 symmetric hashing if available.  That's to
>> ensure correctness of the packet processing.  We did run performance
>> tests and the hit due to the extra hashing was acceptable, max 3%
>> throughput degradation in OVN scenarios that simulate common traffic
>> patterns (like ovn-kubernetes ones) - in most cases the performance
>> degradation was around 2%.
>>
>> Sorry for the long story but I hope it provides all the required
>> context.  Thinking more about it I should've probably added some of this
>> to the original commit log, I apologize.
>>
>> Regards,
>> Dumitru
>>
> 
> Thanks Dumitru for the details! It is now quite clear and I am not against
> backporting it.
> But I have a question regarding upgrading. For the existing traffic, does
> this mean the packets may choose a different path after OVN upgrading?
> Should this be called out?
> 

Hi Han,

That's a good question.  They might and this affects all traffic that's
not locally originated for the kernel datapath and (potentially) all
traffic for the userspace datapath.  I can post a follow up patch to
update the NEWS file.  Did you have anything else in mind?

Regards,
Dumitru
Han Zhou Aug. 12, 2023, 5:19 a.m. UTC | #7
On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 8/1/23 12:51, Han Zhou wrote:
> > On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote:
> >>
> >> On 7/18/23 12:14, Han Zhou wrote:
> >>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com>
wrote:
> >>>>
> >>>> On 7/14/23 08:41, Ales Musil wrote:
> >>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com>
> > wrote:
> >>>>>
> >>>>>> Regular dp-hash is not a canonical L4 hash (at least with the
netlink
> >>>>>> datapath).  If the datapath supports l4 symmetrical dp-hash use
that
> >>> one
> >>>>>> instead.
> >>>>>>
> >>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
> >>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
> >>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> >>>>>>
> >>>>>
> >>>>> Hi Dumitru,
> >>>>>
> >>>>>
> >>>>>> ---
> >>>>>>  include/ovn/features.h |  2 ++
> >>>>>>  lib/actions.c          |  6 ++++++
> >>>>>>  lib/features.c         | 49
> > +++++++++++++++++++++++++++++++++---------
> >>>>>>  3 files changed, 47 insertions(+), 10 deletions(-)
> >>>>>>
> >>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
> >>>>>> index de8f1f5489..3bf536127f 100644
> >>>>>> --- a/include/ovn/features.h
> >>>>>> +++ b/include/ovn/features.h
> >>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
> >>>>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
> >>>>>>      OVS_DP_METER_SUPPORT_BIT,
> >>>>>>      OVS_CT_TUPLE_FLUSH_BIT,
> >>>>>> +    OVS_DP_HASH_L4_SYM_BIT,
> >>>>>>  };
> >>>>>>
> >>>>>>  enum ovs_feature_value {
> >>>>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 <<
OVS_CT_ZERO_SNAT_SUPPORT_BIT),
> >>>>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
> >>>>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
> >>>>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
> >>>>>>  };
> >>>>>>
> >>>>>>  void ovs_feature_support_destroy(void);
> >>>>>> diff --git a/lib/actions.c b/lib/actions.c
> >>>>>> index 037172e606..9d52cb75a8 100644
> >>>>>> --- a/lib/actions.c
> >>>>>> +++ b/lib/actions.c
> >>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
> >>> *select,
> >>>>>>      struct ds ds = DS_EMPTY_INITIALIZER;
> >>>>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
> >>>>>>
> >>>>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
> >>>>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits
> > of
> >>>>>> +         * selection_method_param to 1: */
> >>>>>>
> >>>>>
> >>>>> This comment is a bit unfortunate, because it reads like you want to
> > set
> >>>>> all bits of the upper half
> >>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
> >>> value
> >>>>> 1 (1 << 32)." during merge, wdyt?
> >>>>>
> >>>>>
> >>>>
> >>>> Makes sense, thanks for the suggestion!  I changed it accordingly.
> >>>>
> >>>>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
> >>>>>> +    }
> >>>>>> +
> >>>>>>      struct mf_subfield sf =
expr_resolve_field(&select->res_field);
> >>>>>>
> >>>>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
> >>> bucket_id++) {
> >>>>>> diff --git a/lib/features.c b/lib/features.c
> >>>>>> index 97c9c86f00..d24e8f6c5c 100644
> >>>>>> --- a/lib/features.c
> >>>>>> +++ b/lib/features.c
> >>>>>> @@ -21,6 +21,7 @@
> >>>>>>  #include "lib/dirs.h"
> >>>>>>  #include "socket-util.h"
> >>>>>>  #include "lib/vswitch-idl.h"
> >>>>>> +#include "odp-netlink.h"
> >>>>>>  #include "openvswitch/vlog.h"
> >>>>>>  #include "openvswitch/ofpbuf.h"
> >>>>>>  #include "openvswitch/rconn.h"
> >>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
> >>>>>>
> >>>>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
> >>>>>>
> >>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether
the
> >>>>>> + * type of capability is supported or not. */
> >>>>>> +typedef bool ovs_feature_parse_func(const struct smap
> >>> *ovs_capabilities,
> >>>>>> +                                    const char *cap_name);
> >>>>>> +
> >>>>>>  struct ovs_feature {
> >>>>>>      enum ovs_feature_value value;
> >>>>>>      const char *name;
> >>>>>> +    ovs_feature_parse_func *parse;
> >>>>>>  };
> >>>>>>
> >>>>>> +static bool
> >>>>>> +bool_parser(const struct smap *ovs_capabilities, const char
> > *cap_name)
> >>>>>> +{
> >>>>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static bool
> >>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
> >>>>>> +                              const char *cap_name OVS_UNUSED)
> >>>>>> +{
> >>>>>> +    int max_hash_alg = smap_get_int(ovs_capabilities,
> > "max_hash_alg",
> >>> 0);
> >>>>>> +
> >>>>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
> >>>>>> +}
> >>>>>> +
> >>>>>>  static struct ovs_feature all_ovs_features[] = {
> >>>>>>      {
> >>>>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
> >>>>>> -        .name = "ct_zero_snat"
> >>>>>> +        .name = "ct_zero_snat",
> >>>>>> +        .parse = bool_parser,
> >>>>>>      },
> >>>>>>      {
> >>>>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
> >>>>>> -        .name = "ct_flush"
> >>>>>> -    }
> >>>>>> +        .name = "ct_flush",
> >>>>>> +        .parse = bool_parser,
> >>>>>> +    },
> >>>>>> +    {
> >>>>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
> >>>>>> +        .name = "dp_hash_l4_sym_support",
> >>>>>> +        .parse = dp_hash_l4_sym_support_parser,
> >>>>>> +    },
> >>>>>>  };
> >>>>>>
> >>>>>>  /* A bitmap of OVS features that have been detected as
'supported'.
> > */
> >>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value
> > feature)
> >>>>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
> >>>>>>      case OVS_DP_METER_SUPPORT:
> >>>>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
> >>>>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
> >>>>>>          return true;
> >>>>>>      default:
> >>>>>>          return false;
> >>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
> >>>>>> *ovs_capabilities,
> >>>>>>      }
> >>>>>>
> >>>>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
> >>>>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
> >>>>>> -        const char *name = all_ovs_features[i].name;
> >>>>>> -        bool old_state = supported_ovs_features & value;
> >>>>>> -        bool new_state = smap_get_bool(ovs_capabilities, name,
> > false);
> >>>>>> +        struct ovs_feature *feature = &all_ovs_features[i];
> >>>>>> +        bool old_state = supported_ovs_features & feature->value;
> >>>>>> +        bool new_state = feature->parse(ovs_capabilities,
> >>> feature->name);
> >>>>>>          if (new_state != old_state) {
> >>>>>>              updated = true;
> >>>>>>              if (new_state) {
> >>>>>> -                supported_ovs_features |= value;
> >>>>>> +                supported_ovs_features |= feature->value;
> >>>>>>              } else {
> >>>>>> -                supported_ovs_features &= ~value;
> >>>>>> +                supported_ovs_features &= ~feature->value;
> >>>>>>              }
> >>>>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
> >>>>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
> >>> feature->name,
> >>>>>>                           new_state ? "supported" : "not
supported");
> >>>>>>          }
> >>>>>>      }
> >>>>>> --
> >>>>>> 2.31.1
> >>>>>>
> >>>>>> _______________________________________________
> >>>>>> dev mailing list
> >>>>>> dev@openvswitch.org
> >>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>>>>>
> >>>>>>
> >>>>> Other than that it looks good.
> >>>>>
> >>>>> Acked-by: Ales Musil <amusil@redhat.com>
> >>>>>
> >>>>
> >>>> Thanks, I applied this to main.
> >>>>
> >>>> I was wondering if we should backport this though.  It's not exactly
a
> >>>> bug fix but it does avoid using a broken datapath feature (when
> >>>> possible) so it avoids a broken behavior.
> >>>>
> >>>> The changes are quite contained.
> >>>>
> >>>> CC-ing some of the maintainers explicitly to get some more eyes on
> > this.
> >>>>
> >>> Thanks Dumitru. Sorry that I was following the reported issue. I tried
> > to
> >>> look for some more details, but I am not authorized to access bug
> > 2175716.
> >>
> >> Sorry about that, that bug used to be public but some internal policies
> >> made it private.
> >>
> >>> It is not quite clear to me what the impact of this problem is. The
OVN
> >>> issue (112) seems to be a very common configuration of OVN ECMP and I
> >>> assume the problem is not easily reproduced because I have been
> >>> testing/deploying the feature and it worked as expected. Do you have
> > more
> >>> details about the trigger? Are there any related fixes in OVS and
> > kernel?
> >>>
> >>
> >> The original bug report came from OpenShift:
> >> https://issues.redhat.com/browse/OCPBUGS-7406
> >>
> >> CC-ing Patryk as he root caused the original issue.
> >>
> >> The simplified topology is:
> >>                               +--- GW1 ---+
> >> client ---- cluster-router ---+           +--- (out of OVN) --- server
> >>                               +--- GW2 ---+
> >>
> >> On cluster-router there's an ECMP route to reach 'server' via:
> >> - GW1
> >> - GW2
> >>
> >> On GW1 traffic going out of OVN is SNATed to IP1.
> >> On GW2 traffic going out of OVN is SNATed to IP2.
> >>
> >> The client initiates a TCP session towards the server; a path is chosen
> >> based on dp-hash, e.g. GW2.
> >>
> >> From this point on, during normal operation, traffic on this session
> >> flows fine via GW2.
> >>
> >> If the client closes the connection, then for the closing connection
> >> there's an exception when computing the dp-hash if the socket is in
> >> state TIME_WAIT, substate TCP_FIN_WAIT_2:
> >>
https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
> >> In this last case we call:
> >>
> >> tcp_v4_timewait_ack(sk, skb)
> >> |
> >> tcp_v4_send_ack(sk, skb, ...)
> >>
> >> The latter uses a global, per-cpu "ctl_sk" to build the last ACK
packet:
> >> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925
> >>
> >>         ctl_sk = this_cpu_read(ipv4_tcp_sk);
> >>         sock_net_set(ctl_sk, net);
> >>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
> >>                            inet_twsk(sk)->tw_mark : sk->sk_mark;
> >>         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
> >>                            inet_twsk(sk)->tw_priority :
sk->sk_priority;
> >>         transmit_time = tcp_transmit_time(sk);
> >>         ip_send_unicast_reply(ctl_sk,
> >>                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
> >>                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
> >>                               &arg, arg.iov[0].iov_len,
> >>                               transmit_time);
> >>
> >> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash
is
> >> computed from the packet contents:
> >>
> >
https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540
> >>
> >> static inline __u32 skb_get_hash(struct sk_buff *skb)
> >> {
> >>         if (!skb->l4_hash && !skb->sw_hash)
> >>                 __skb_get_hash(skb);
> >>
> >>         return skb->hash;
> >> }
> >> [...]
> >> void __skb_get_hash(struct sk_buff *skb)
> >> {
> >>         struct flow_keys keys;
> >>         u32 hash;
> >>
> >>         __flow_hash_secret_init();
> >>
> >>         hash = ___skb_get_hash(skb, &keys, &hashrnd);
> >>
> >>         __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
> >> }
> >>
> >> But the original socket's txhash was actually a random hash:
> >> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128
> >>
> >> static inline void sk_set_txhash(struct sock *sk)
> >> {
> >>         /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
> >>         WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
> >> }
> >>
> >> This means that the last ack might enter the ovs datapath with a
> >> different dp_hash than all other packets sent on that session in that
> >> direction. It's possible that this last ack gets hashed to a different
> >> path of the ECMP route.
> >>
> >> This specific issue was fixed in the kernel through:
> >>
> >>
> >
https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4
> >>
> >> However, while debugging we realized the problem is actually visible
> >> with _any_ retransmit.  That's because a retransmit will trigger a
rehash:
> >>
> >>
https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124
> >>
> >> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
> >> {
> >>         const struct tcp_request_sock_ops *af_ops =
> > tcp_rsk(req)->af_specific;
> >>         struct flowi fl;
> >>         int res;
> >>
> >>         /* Paired with WRITE_ONCE() in sock_setsockopt() */
> >>         if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
> >>                 tcp_rsk(req)->txhash = net_tx_rndhash();
> >>         res = af_ops->send_synack(sk, NULL, &fl, req, NULL,
> > TCP_SYNACK_NORMAL,
> >>                                   NULL);
> >> [...]
> >>
> >> So suddenly all subsequent packets (skbs) will have a different 'hash'
> >> value and might get hashed to a different path.
> >>
> >> This last part was not fixed.  After a discussion on the netdev mailing
> >> list the conclusion was to not change the hash behavior for locally
> >> terminated TCP sessions:
> >>
> >>
> >
https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd
> >>
> >> Eventually the documentation change patch was dropped and a v2 of the
> >> other fixes was accepted:
> >> https://www.spinics.net/lists/netdev/msg906252.html
> >>
> >> The rehash on retransmit problem is still present though making the
> >> kernel's dp-hash implementation unusable.
> >>
> >> During some internal discussions Aaron (in CC) pointed out that OVS
> >> supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
> >> already reports the maximum supported hashing algorithm through the
> >> Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
> >> that the kernel datapath didn't support L4 symmetric hashing.  But
Aaron
> >> posted a patch to implement that and the patch got accepted:
> >>
> >> https://www.spinics.net/lists/netdev/msg911045.html
> >>
> >> Then we moved OVN to use L4 symmetric hashing if available.  That's to
> >> ensure correctness of the packet processing.  We did run performance
> >> tests and the hit due to the extra hashing was acceptable, max 3%
> >> throughput degradation in OVN scenarios that simulate common traffic
> >> patterns (like ovn-kubernetes ones) - in most cases the performance
> >> degradation was around 2%.
> >>
> >> Sorry for the long story but I hope it provides all the required
> >> context.  Thinking more about it I should've probably added some of
this
> >> to the original commit log, I apologize.
> >>
> >> Regards,
> >> Dumitru
> >>
> >
> > Thanks Dumitru for the details! It is now quite clear and I am not
against
> > backporting it.
> > But I have a question regarding upgrading. For the existing traffic,
does
> > this mean the packets may choose a different path after OVN upgrading?
> > Should this be called out?
> >
>
> Hi Han,
>
> That's a good question.  They might and this affects all traffic that's
> not locally originated for the kernel datapath and (potentially) all
> traffic for the userspace datapath.  I can post a follow up patch to
> update the NEWS file.  Did you have anything else in mind?
>
Sounds good. I don't have anything else.

Thanks,
Han

> Regards,
> Dumitru
>
Dumitru Ceara Aug. 14, 2023, 1:57 p.m. UTC | #8
On 8/12/23 07:19, Han Zhou wrote:
> On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> On 8/1/23 12:51, Han Zhou wrote:
>>> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>>>
>>>> On 7/18/23 12:14, Han Zhou wrote:
>>>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com>
> wrote:
>>>>>>
>>>>>> On 7/14/23 08:41, Ales Musil wrote:
>>>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com>
>>> wrote:
>>>>>>>
>>>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the
> netlink
>>>>>>>> datapath).  If the datapath supports l4 symmetrical dp-hash use
> that
>>>>> one
>>>>>>>> instead.
>>>>>>>>
>>>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
>>>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
>>>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>>>>>>
>>>>>>>
>>>>>>> Hi Dumitru,
>>>>>>>
>>>>>>>
>>>>>>>> ---
>>>>>>>>  include/ovn/features.h |  2 ++
>>>>>>>>  lib/actions.c          |  6 ++++++
>>>>>>>>  lib/features.c         | 49
>>> +++++++++++++++++++++++++++++++++---------
>>>>>>>>  3 files changed, 47 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
>>>>>>>> index de8f1f5489..3bf536127f 100644
>>>>>>>> --- a/include/ovn/features.h
>>>>>>>> +++ b/include/ovn/features.h
>>>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>>>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>>>>>>>>      OVS_DP_METER_SUPPORT_BIT,
>>>>>>>>      OVS_CT_TUPLE_FLUSH_BIT,
>>>>>>>> +    OVS_DP_HASH_L4_SYM_BIT,
>>>>>>>>  };
>>>>>>>>
>>>>>>>>  enum ovs_feature_value {
>>>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 <<
> OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>>>>>>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>>>>>>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
>>>>>>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>>>>>>>>  };
>>>>>>>>
>>>>>>>>  void ovs_feature_support_destroy(void);
>>>>>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>>>>>> index 037172e606..9d52cb75a8 100644
>>>>>>>> --- a/lib/actions.c
>>>>>>>> +++ b/lib/actions.c
>>>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
>>>>> *select,
>>>>>>>>      struct ds ds = DS_EMPTY_INITIALIZER;
>>>>>>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>>>>>>>>
>>>>>>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
>>>>>>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits
>>> of
>>>>>>>> +         * selection_method_param to 1: */
>>>>>>>>
>>>>>>>
>>>>>>> This comment is a bit unfortunate, because it reads like you want to
>>> set
>>>>>>> all bits of the upper half
>>>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
>>>>> value
>>>>>>> 1 (1 << 32)." during merge, wdyt?
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Makes sense, thanks for the suggestion!  I changed it accordingly.
>>>>>>
>>>>>>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>>      struct mf_subfield sf =
> expr_resolve_field(&select->res_field);
>>>>>>>>
>>>>>>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
>>>>> bucket_id++) {
>>>>>>>> diff --git a/lib/features.c b/lib/features.c
>>>>>>>> index 97c9c86f00..d24e8f6c5c 100644
>>>>>>>> --- a/lib/features.c
>>>>>>>> +++ b/lib/features.c
>>>>>>>> @@ -21,6 +21,7 @@
>>>>>>>>  #include "lib/dirs.h"
>>>>>>>>  #include "socket-util.h"
>>>>>>>>  #include "lib/vswitch-idl.h"
>>>>>>>> +#include "odp-netlink.h"
>>>>>>>>  #include "openvswitch/vlog.h"
>>>>>>>>  #include "openvswitch/ofpbuf.h"
>>>>>>>>  #include "openvswitch/rconn.h"
>>>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>>>>>>>>
>>>>>>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>>>>>>>>
>>>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether
> the
>>>>>>>> + * type of capability is supported or not. */
>>>>>>>> +typedef bool ovs_feature_parse_func(const struct smap
>>>>> *ovs_capabilities,
>>>>>>>> +                                    const char *cap_name);
>>>>>>>> +
>>>>>>>>  struct ovs_feature {
>>>>>>>>      enum ovs_feature_value value;
>>>>>>>>      const char *name;
>>>>>>>> +    ovs_feature_parse_func *parse;
>>>>>>>>  };
>>>>>>>>
>>>>>>>> +static bool
>>>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char
>>> *cap_name)
>>>>>>>> +{
>>>>>>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static bool
>>>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
>>>>>>>> +                              const char *cap_name OVS_UNUSED)
>>>>>>>> +{
>>>>>>>> +    int max_hash_alg = smap_get_int(ovs_capabilities,
>>> "max_hash_alg",
>>>>> 0);
>>>>>>>> +
>>>>>>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  static struct ovs_feature all_ovs_features[] = {
>>>>>>>>      {
>>>>>>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
>>>>>>>> -        .name = "ct_zero_snat"
>>>>>>>> +        .name = "ct_zero_snat",
>>>>>>>> +        .parse = bool_parser,
>>>>>>>>      },
>>>>>>>>      {
>>>>>>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
>>>>>>>> -        .name = "ct_flush"
>>>>>>>> -    }
>>>>>>>> +        .name = "ct_flush",
>>>>>>>> +        .parse = bool_parser,
>>>>>>>> +    },
>>>>>>>> +    {
>>>>>>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
>>>>>>>> +        .name = "dp_hash_l4_sym_support",
>>>>>>>> +        .parse = dp_hash_l4_sym_support_parser,
>>>>>>>> +    },
>>>>>>>>  };
>>>>>>>>
>>>>>>>>  /* A bitmap of OVS features that have been detected as
> 'supported'.
>>> */
>>>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value
>>> feature)
>>>>>>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
>>>>>>>>      case OVS_DP_METER_SUPPORT:
>>>>>>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
>>>>>>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>>>>>>>>          return true;
>>>>>>>>      default:
>>>>>>>>          return false;
>>>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
>>>>>>>> *ovs_capabilities,
>>>>>>>>      }
>>>>>>>>
>>>>>>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
>>>>>>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
>>>>>>>> -        const char *name = all_ovs_features[i].name;
>>>>>>>> -        bool old_state = supported_ovs_features & value;
>>>>>>>> -        bool new_state = smap_get_bool(ovs_capabilities, name,
>>> false);
>>>>>>>> +        struct ovs_feature *feature = &all_ovs_features[i];
>>>>>>>> +        bool old_state = supported_ovs_features & feature->value;
>>>>>>>> +        bool new_state = feature->parse(ovs_capabilities,
>>>>> feature->name);
>>>>>>>>          if (new_state != old_state) {
>>>>>>>>              updated = true;
>>>>>>>>              if (new_state) {
>>>>>>>> -                supported_ovs_features |= value;
>>>>>>>> +                supported_ovs_features |= feature->value;
>>>>>>>>              } else {
>>>>>>>> -                supported_ovs_features &= ~value;
>>>>>>>> +                supported_ovs_features &= ~feature->value;
>>>>>>>>              }
>>>>>>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
>>>>>>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
>>>>> feature->name,
>>>>>>>>                           new_state ? "supported" : "not
> supported");
>>>>>>>>          }
>>>>>>>>      }
>>>>>>>> --
>>>>>>>> 2.31.1
>>>>>>>>
>>>>>>>> _______________________________________________
>>>>>>>> dev mailing list
>>>>>>>> dev@openvswitch.org
>>>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>>>>>>
>>>>>>>>
>>>>>>> Other than that it looks good.
>>>>>>>
>>>>>>> Acked-by: Ales Musil <amusil@redhat.com>
>>>>>>>
>>>>>>
>>>>>> Thanks, I applied this to main.
>>>>>>
>>>>>> I was wondering if we should backport this though.  It's not exactly
> a
>>>>>> bug fix but it does avoid using a broken datapath feature (when
>>>>>> possible) so it avoids a broken behavior.
>>>>>>
>>>>>> The changes are quite contained.
>>>>>>
>>>>>> CC-ing some of the maintainers explicitly to get some more eyes on
>>> this.
>>>>>>
>>>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried
>>> to
>>>>> look for some more details, but I am not authorized to access bug
>>> 2175716.
>>>>
>>>> Sorry about that, that bug used to be public but some internal policies
>>>> made it private.
>>>>
>>>>> It is not quite clear to me what the impact of this problem is. The
> OVN
>>>>> issue (112) seems to be a very common configuration of OVN ECMP and I
>>>>> assume the problem is not easily reproduced because I have been
>>>>> testing/deploying the feature and it worked as expected. Do you have
>>> more
>>>>> details about the trigger? Are there any related fixes in OVS and
>>> kernel?
>>>>>
>>>>
>>>> The original bug report came from OpenShift:
>>>> https://issues.redhat.com/browse/OCPBUGS-7406
>>>>
>>>> CC-ing Patryk as he root caused the original issue.
>>>>
>>>> The simplified topology is:
>>>>                               +--- GW1 ---+
>>>> client ---- cluster-router ---+           +--- (out of OVN) --- server
>>>>                               +--- GW2 ---+
>>>>
>>>> On cluster-router there's an ECMP route to reach 'server' via:
>>>> - GW1
>>>> - GW2
>>>>
>>>> On GW1 traffic going out of OVN is SNATed to IP1.
>>>> On GW2 traffic going out of OVN is SNATed to IP2.
>>>>
>>>> The client initiates a TCP session towards the server; a path is chosen
>>>> based on dp-hash, e.g. GW2.
>>>>
>>>> From this point on, during normal operation, traffic on this session
>>>> flows fine via GW2.
>>>>
>>>> If the client closes the connection, then for the closing connection
>>>> there's an exception when computing the dp-hash if the socket is in
>>>> state TIME_WAIT, substate TCP_FIN_WAIT_2:
>>>>
> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
>>>> In this last case we call:
>>>>
>>>> tcp_v4_timewait_ack(sk, skb)
>>>> |
>>>> tcp_v4_send_ack(sk, skb, ...)
>>>>
>>>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK
> packet:
>>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925
>>>>
>>>>         ctl_sk = this_cpu_read(ipv4_tcp_sk);
>>>>         sock_net_set(ctl_sk, net);
>>>>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>>>>                            inet_twsk(sk)->tw_mark : sk->sk_mark;
>>>>         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
>>>>                            inet_twsk(sk)->tw_priority :
> sk->sk_priority;
>>>>         transmit_time = tcp_transmit_time(sk);
>>>>         ip_send_unicast_reply(ctl_sk,
>>>>                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
>>>>                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>>>>                               &arg, arg.iov[0].iov_len,
>>>>                               transmit_time);
>>>>
>>>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash
> is
>>>> computed from the packet contents:
>>>>
>>>
> https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540
>>>>
>>>> static inline __u32 skb_get_hash(struct sk_buff *skb)
>>>> {
>>>>         if (!skb->l4_hash && !skb->sw_hash)
>>>>                 __skb_get_hash(skb);
>>>>
>>>>         return skb->hash;
>>>> }
>>>> [...]
>>>> void __skb_get_hash(struct sk_buff *skb)
>>>> {
>>>>         struct flow_keys keys;
>>>>         u32 hash;
>>>>
>>>>         __flow_hash_secret_init();
>>>>
>>>>         hash = ___skb_get_hash(skb, &keys, &hashrnd);
>>>>
>>>>         __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
>>>> }
>>>>
>>>> But the original socket's txhash was actually a random hash:
>>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128
>>>>
>>>> static inline void sk_set_txhash(struct sock *sk)
>>>> {
>>>>         /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
>>>>         WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
>>>> }
>>>>
>>>> This means that the last ack might enter the ovs datapath with a
>>>> different dp_hash than all other packets sent on that session in that
>>>> direction. It's possible that this last ack gets hashed to a different
>>>> path of the ECMP route.
>>>>
>>>> This specific issue was fixed in the kernel through:
>>>>
>>>>
>>>
> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4
>>>>
>>>> However, while debugging we realized the problem is actually visible
>>>> with _any_ retransmit.  That's because a retransmit will trigger a
> rehash:
>>>>
>>>>
> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124
>>>>
>>>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
>>>> {
>>>>         const struct tcp_request_sock_ops *af_ops =
>>> tcp_rsk(req)->af_specific;
>>>>         struct flowi fl;
>>>>         int res;
>>>>
>>>>         /* Paired with WRITE_ONCE() in sock_setsockopt() */
>>>>         if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
>>>>                 tcp_rsk(req)->txhash = net_tx_rndhash();
>>>>         res = af_ops->send_synack(sk, NULL, &fl, req, NULL,
>>> TCP_SYNACK_NORMAL,
>>>>                                   NULL);
>>>> [...]
>>>>
>>>> So suddenly all subsequent packets (skbs) will have a different 'hash'
>>>> value and might get hashed to a different path.
>>>>
>>>> This last part was not fixed.  After a discussion on the netdev mailing
>>>> list the conclusion was to not change the hash behavior for locally
>>>> terminated TCP sessions:
>>>>
>>>>
>>>
> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd
>>>>
>>>> Eventually the documentation change patch was dropped and a v2 of the
>>>> other fixes was accepted:
>>>> https://www.spinics.net/lists/netdev/msg906252.html
>>>>
>>>> The rehash on retransmit problem is still present though making the
>>>> kernel's dp-hash implementation unusable.
>>>>
>>>> During some internal discussions Aaron (in CC) pointed out that OVS
>>>> supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
>>>> already reports the maximum supported hashing algorithm through the
>>>> Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
>>>> that the kernel datapath didn't support L4 symmetric hashing.  But
> Aaron
>>>> posted a patch to implement that and the patch got accepted:
>>>>
>>>> https://www.spinics.net/lists/netdev/msg911045.html
>>>>
>>>> Then we moved OVN to use L4 symmetric hashing if available.  That's to
>>>> ensure correctness of the packet processing.  We did run performance
>>>> tests and the hit due to the extra hashing was acceptable, max 3%
>>>> throughput degradation in OVN scenarios that simulate common traffic
>>>> patterns (like ovn-kubernetes ones) - in most cases the performance
>>>> degradation was around 2%.
>>>>
>>>> Sorry for the long story but I hope it provides all the required
>>>> context.  Thinking more about it I should've probably added some of
> this
>>>> to the original commit log, I apologize.
>>>>
>>>> Regards,
>>>> Dumitru
>>>>
>>>
>>> Thanks Dumitru for the details! It is now quite clear and I am not
> against
>>> backporting it.
>>> But I have a question regarding upgrading. For the existing traffic,
> does
>>> this mean the packets may choose a different path after OVN upgrading?
>>> Should this be called out?
>>>
>>
>> Hi Han,
>>
>> That's a good question.  They might and this affects all traffic that's
>> not locally originated for the kernel datapath and (potentially) all
>> traffic for the userspace datapath.  I can post a follow up patch to
>> update the NEWS file.  Did you have anything else in mind?
>>
> Sounds good. I don't have anything else.
> 

I posted
https://patchwork.ozlabs.org/project/ovn/patch/20230814135401.2677054-1-dceara@redhat.com/

Thanks,
Dumitru
Dumitru Ceara Aug. 15, 2023, 10:39 a.m. UTC | #9
On 8/14/23 15:57, Dumitru Ceara wrote:
> On 8/12/23 07:19, Han Zhou wrote:
>> On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>>
>>> On 8/1/23 12:51, Han Zhou wrote:
>>>> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>>>>
>>>>> On 7/18/23 12:14, Han Zhou wrote:
>>>>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com>
>> wrote:
>>>>>>>
>>>>>>> On 7/14/23 08:41, Ales Musil wrote:
>>>>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com>
>>>> wrote:
>>>>>>>>
>>>>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the
>> netlink
>>>>>>>>> datapath).  If the datapath supports l4 symmetrical dp-hash use
>> that
>>>>>> one
>>>>>>>>> instead.
>>>>>>>>>
>>>>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
>>>>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
>>>>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Dumitru,
>>>>>>>>
>>>>>>>>
>>>>>>>>> ---
>>>>>>>>>  include/ovn/features.h |  2 ++
>>>>>>>>>  lib/actions.c          |  6 ++++++
>>>>>>>>>  lib/features.c         | 49
>>>> +++++++++++++++++++++++++++++++++---------
>>>>>>>>>  3 files changed, 47 insertions(+), 10 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
>>>>>>>>> index de8f1f5489..3bf536127f 100644
>>>>>>>>> --- a/include/ovn/features.h
>>>>>>>>> +++ b/include/ovn/features.h
>>>>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>>>>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>>>>>>>>>      OVS_DP_METER_SUPPORT_BIT,
>>>>>>>>>      OVS_CT_TUPLE_FLUSH_BIT,
>>>>>>>>> +    OVS_DP_HASH_L4_SYM_BIT,
>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>>  enum ovs_feature_value {
>>>>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 <<
>> OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>>>>>>>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>>>>>>>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
>>>>>>>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>>  void ovs_feature_support_destroy(void);
>>>>>>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>>>>>>> index 037172e606..9d52cb75a8 100644
>>>>>>>>> --- a/lib/actions.c
>>>>>>>>> +++ b/lib/actions.c
>>>>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
>>>>>> *select,
>>>>>>>>>      struct ds ds = DS_EMPTY_INITIALIZER;
>>>>>>>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>>>>>>>>>
>>>>>>>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
>>>>>>>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits
>>>> of
>>>>>>>>> +         * selection_method_param to 1: */
>>>>>>>>>
>>>>>>>>
>>>>>>>> This comment is a bit unfortunate, because it reads like you want to
>>>> set
>>>>>>>> all bits of the upper half
>>>>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
>>>>>> value
>>>>>>>> 1 (1 << 32)." during merge, wdyt?
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>> Makes sense, thanks for the suggestion!  I changed it accordingly.
>>>>>>>
>>>>>>>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>>      struct mf_subfield sf =
>> expr_resolve_field(&select->res_field);
>>>>>>>>>
>>>>>>>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
>>>>>> bucket_id++) {
>>>>>>>>> diff --git a/lib/features.c b/lib/features.c
>>>>>>>>> index 97c9c86f00..d24e8f6c5c 100644
>>>>>>>>> --- a/lib/features.c
>>>>>>>>> +++ b/lib/features.c
>>>>>>>>> @@ -21,6 +21,7 @@
>>>>>>>>>  #include "lib/dirs.h"
>>>>>>>>>  #include "socket-util.h"
>>>>>>>>>  #include "lib/vswitch-idl.h"
>>>>>>>>> +#include "odp-netlink.h"
>>>>>>>>>  #include "openvswitch/vlog.h"
>>>>>>>>>  #include "openvswitch/ofpbuf.h"
>>>>>>>>>  #include "openvswitch/rconn.h"
>>>>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>>>>>>>>>
>>>>>>>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>>>>>>>>>
>>>>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether
>> the
>>>>>>>>> + * type of capability is supported or not. */
>>>>>>>>> +typedef bool ovs_feature_parse_func(const struct smap
>>>>>> *ovs_capabilities,
>>>>>>>>> +                                    const char *cap_name);
>>>>>>>>> +
>>>>>>>>>  struct ovs_feature {
>>>>>>>>>      enum ovs_feature_value value;
>>>>>>>>>      const char *name;
>>>>>>>>> +    ovs_feature_parse_func *parse;
>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>> +static bool
>>>>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char
>>>> *cap_name)
>>>>>>>>> +{
>>>>>>>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static bool
>>>>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
>>>>>>>>> +                              const char *cap_name OVS_UNUSED)
>>>>>>>>> +{
>>>>>>>>> +    int max_hash_alg = smap_get_int(ovs_capabilities,
>>>> "max_hash_alg",
>>>>>> 0);
>>>>>>>>> +
>>>>>>>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>  static struct ovs_feature all_ovs_features[] = {
>>>>>>>>>      {
>>>>>>>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
>>>>>>>>> -        .name = "ct_zero_snat"
>>>>>>>>> +        .name = "ct_zero_snat",
>>>>>>>>> +        .parse = bool_parser,
>>>>>>>>>      },
>>>>>>>>>      {
>>>>>>>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
>>>>>>>>> -        .name = "ct_flush"
>>>>>>>>> -    }
>>>>>>>>> +        .name = "ct_flush",
>>>>>>>>> +        .parse = bool_parser,
>>>>>>>>> +    },
>>>>>>>>> +    {
>>>>>>>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
>>>>>>>>> +        .name = "dp_hash_l4_sym_support",
>>>>>>>>> +        .parse = dp_hash_l4_sym_support_parser,
>>>>>>>>> +    },
>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>>  /* A bitmap of OVS features that have been detected as
>> 'supported'.
>>>> */
>>>>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value
>>>> feature)
>>>>>>>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
>>>>>>>>>      case OVS_DP_METER_SUPPORT:
>>>>>>>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
>>>>>>>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>>>>>>>>>          return true;
>>>>>>>>>      default:
>>>>>>>>>          return false;
>>>>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
>>>>>>>>> *ovs_capabilities,
>>>>>>>>>      }
>>>>>>>>>
>>>>>>>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
>>>>>>>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
>>>>>>>>> -        const char *name = all_ovs_features[i].name;
>>>>>>>>> -        bool old_state = supported_ovs_features & value;
>>>>>>>>> -        bool new_state = smap_get_bool(ovs_capabilities, name,
>>>> false);
>>>>>>>>> +        struct ovs_feature *feature = &all_ovs_features[i];
>>>>>>>>> +        bool old_state = supported_ovs_features & feature->value;
>>>>>>>>> +        bool new_state = feature->parse(ovs_capabilities,
>>>>>> feature->name);
>>>>>>>>>          if (new_state != old_state) {
>>>>>>>>>              updated = true;
>>>>>>>>>              if (new_state) {
>>>>>>>>> -                supported_ovs_features |= value;
>>>>>>>>> +                supported_ovs_features |= feature->value;
>>>>>>>>>              } else {
>>>>>>>>> -                supported_ovs_features &= ~value;
>>>>>>>>> +                supported_ovs_features &= ~feature->value;
>>>>>>>>>              }
>>>>>>>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
>>>>>>>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
>>>>>> feature->name,
>>>>>>>>>                           new_state ? "supported" : "not
>> supported");
>>>>>>>>>          }
>>>>>>>>>      }
>>>>>>>>> --
>>>>>>>>> 2.31.1
>>>>>>>>>
>>>>>>>>> _______________________________________________
>>>>>>>>> dev mailing list
>>>>>>>>> dev@openvswitch.org
>>>>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>>>>>>>
>>>>>>>>>
>>>>>>>> Other than that it looks good.
>>>>>>>>
>>>>>>>> Acked-by: Ales Musil <amusil@redhat.com>
>>>>>>>>
>>>>>>>
>>>>>>> Thanks, I applied this to main.
>>>>>>>
>>>>>>> I was wondering if we should backport this though.  It's not exactly
>> a
>>>>>>> bug fix but it does avoid using a broken datapath feature (when
>>>>>>> possible) so it avoids a broken behavior.
>>>>>>>
>>>>>>> The changes are quite contained.
>>>>>>>
>>>>>>> CC-ing some of the maintainers explicitly to get some more eyes on
>>>> this.
>>>>>>>
>>>>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried
>>>> to
>>>>>> look for some more details, but I am not authorized to access bug
>>>> 2175716.
>>>>>
>>>>> Sorry about that, that bug used to be public but some internal policies
>>>>> made it private.
>>>>>
>>>>>> It is not quite clear to me what the impact of this problem is. The
>> OVN
>>>>>> issue (112) seems to be a very common configuration of OVN ECMP and I
>>>>>> assume the problem is not easily reproduced because I have been
>>>>>> testing/deploying the feature and it worked as expected. Do you have
>>>> more
>>>>>> details about the trigger? Are there any related fixes in OVS and
>>>> kernel?
>>>>>>
>>>>>
>>>>> The original bug report came from OpenShift:
>>>>> https://issues.redhat.com/browse/OCPBUGS-7406
>>>>>
>>>>> CC-ing Patryk as he root caused the original issue.
>>>>>
>>>>> The simplified topology is:
>>>>>                               +--- GW1 ---+
>>>>> client ---- cluster-router ---+           +--- (out of OVN) --- server
>>>>>                               +--- GW2 ---+
>>>>>
>>>>> On cluster-router there's an ECMP route to reach 'server' via:
>>>>> - GW1
>>>>> - GW2
>>>>>
>>>>> On GW1 traffic going out of OVN is SNATed to IP1.
>>>>> On GW2 traffic going out of OVN is SNATed to IP2.
>>>>>
>>>>> The client initiates a TCP session towards the server; a path is chosen
>>>>> based on dp-hash, e.g. GW2.
>>>>>
>>>>> From this point on, during normal operation, traffic on this session
>>>>> flows fine via GW2.
>>>>>
>>>>> If the client closes the connection, then for the closing connection
>>>>> there's an exception when computing the dp-hash if the socket is in
>>>>> state TIME_WAIT, substate TCP_FIN_WAIT_2:
>>>>>
>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
>>>>> In this last case we call:
>>>>>
>>>>> tcp_v4_timewait_ack(sk, skb)
>>>>> |
>>>>> tcp_v4_send_ack(sk, skb, ...)
>>>>>
>>>>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK
>> packet:
>>>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925
>>>>>
>>>>>         ctl_sk = this_cpu_read(ipv4_tcp_sk);
>>>>>         sock_net_set(ctl_sk, net);
>>>>>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>>>>>                            inet_twsk(sk)->tw_mark : sk->sk_mark;
>>>>>         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
>>>>>                            inet_twsk(sk)->tw_priority :
>> sk->sk_priority;
>>>>>         transmit_time = tcp_transmit_time(sk);
>>>>>         ip_send_unicast_reply(ctl_sk,
>>>>>                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
>>>>>                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>>>>>                               &arg, arg.iov[0].iov_len,
>>>>>                               transmit_time);
>>>>>
>>>>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash
>> is
>>>>> computed from the packet contents:
>>>>>
>>>>
>> https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540
>>>>>
>>>>> static inline __u32 skb_get_hash(struct sk_buff *skb)
>>>>> {
>>>>>         if (!skb->l4_hash && !skb->sw_hash)
>>>>>                 __skb_get_hash(skb);
>>>>>
>>>>>         return skb->hash;
>>>>> }
>>>>> [...]
>>>>> void __skb_get_hash(struct sk_buff *skb)
>>>>> {
>>>>>         struct flow_keys keys;
>>>>>         u32 hash;
>>>>>
>>>>>         __flow_hash_secret_init();
>>>>>
>>>>>         hash = ___skb_get_hash(skb, &keys, &hashrnd);
>>>>>
>>>>>         __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
>>>>> }
>>>>>
>>>>> But the original socket's txhash was actually a random hash:
>>>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128
>>>>>
>>>>> static inline void sk_set_txhash(struct sock *sk)
>>>>> {
>>>>>         /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
>>>>>         WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
>>>>> }
>>>>>
>>>>> This means that the last ack might enter the ovs datapath with a
>>>>> different dp_hash than all other packets sent on that session in that
>>>>> direction. It's possible that this last ack gets hashed to a different
>>>>> path of the ECMP route.
>>>>>
>>>>> This specific issue was fixed in the kernel through:
>>>>>
>>>>>
>>>>
>> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4
>>>>>
>>>>> However, while debugging we realized the problem is actually visible
>>>>> with _any_ retransmit.  That's because a retransmit will trigger a
>> rehash:
>>>>>
>>>>>
>> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124
>>>>>
>>>>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
>>>>> {
>>>>>         const struct tcp_request_sock_ops *af_ops =
>>>> tcp_rsk(req)->af_specific;
>>>>>         struct flowi fl;
>>>>>         int res;
>>>>>
>>>>>         /* Paired with WRITE_ONCE() in sock_setsockopt() */
>>>>>         if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
>>>>>                 tcp_rsk(req)->txhash = net_tx_rndhash();
>>>>>         res = af_ops->send_synack(sk, NULL, &fl, req, NULL,
>>>> TCP_SYNACK_NORMAL,
>>>>>                                   NULL);
>>>>> [...]
>>>>>
>>>>> So suddenly all subsequent packets (skbs) will have a different 'hash'
>>>>> value and might get hashed to a different path.
>>>>>
>>>>> This last part was not fixed.  After a discussion on the netdev mailing
>>>>> list the conclusion was to not change the hash behavior for locally
>>>>> terminated TCP sessions:
>>>>>
>>>>>
>>>>
>> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd
>>>>>
>>>>> Eventually the documentation change patch was dropped and a v2 of the
>>>>> other fixes was accepted:
>>>>> https://www.spinics.net/lists/netdev/msg906252.html
>>>>>
>>>>> The rehash on retransmit problem is still present though making the
>>>>> kernel's dp-hash implementation unusable.
>>>>>
>>>>> During some internal discussions Aaron (in CC) pointed out that OVS
>>>>> supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
>>>>> already reports the maximum supported hashing algorithm through the
>>>>> Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
>>>>> that the kernel datapath didn't support L4 symmetric hashing.  But
>> Aaron
>>>>> posted a patch to implement that and the patch got accepted:
>>>>>
>>>>> https://www.spinics.net/lists/netdev/msg911045.html
>>>>>
>>>>> Then we moved OVN to use L4 symmetric hashing if available.  That's to
>>>>> ensure correctness of the packet processing.  We did run performance
>>>>> tests and the hit due to the extra hashing was acceptable, max 3%
>>>>> throughput degradation in OVN scenarios that simulate common traffic
>>>>> patterns (like ovn-kubernetes ones) - in most cases the performance
>>>>> degradation was around 2%.
>>>>>
>>>>> Sorry for the long story but I hope it provides all the required
>>>>> context.  Thinking more about it I should've probably added some of
>> this
>>>>> to the original commit log, I apologize.
>>>>>
>>>>> Regards,
>>>>> Dumitru
>>>>>
>>>>
>>>> Thanks Dumitru for the details! It is now quite clear and I am not
>> against
>>>> backporting it.
>>>> But I have a question regarding upgrading. For the existing traffic,
>> does
>>>> this mean the packets may choose a different path after OVN upgrading?
>>>> Should this be called out?
>>>>
>>>
>>> Hi Han,
>>>
>>> That's a good question.  They might and this affects all traffic that's
>>> not locally originated for the kernel datapath and (potentially) all
>>> traffic for the userspace datapath.  I can post a follow up patch to
>>> update the NEWS file.  Did you have anything else in mind?
>>>
>> Sounds good. I don't have anything else.
>>
> 
> I posted
> https://patchwork.ozlabs.org/project/ovn/patch/20230814135401.2677054-1-dceara@redhat.com/
> 

I backported the commit and the NEWS update to branches 23.06 and 23.03.
 It didn't apply cleanly on older branches but I think that's fine for
now.  We can backport it further later if users request it.

Regards,
Dumitru
Dumitru Ceara Sept. 27, 2023, 11:43 a.m. UTC | #10
On 8/15/23 12:39, Dumitru Ceara wrote:
> On 8/14/23 15:57, Dumitru Ceara wrote:
>> On 8/12/23 07:19, Han Zhou wrote:
>>> On Fri, Aug 11, 2023 at 6:23 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>>>
>>>> On 8/1/23 12:51, Han Zhou wrote:
>>>>> On Tue, Jul 18, 2023 at 6:16 AM Dumitru Ceara <dceara@redhat.com> wrote:
>>>>>>
>>>>>> On 7/18/23 12:14, Han Zhou wrote:
>>>>>>> On Mon, Jul 17, 2023 at 9:51 PM Dumitru Ceara <dceara@redhat.com>
>>> wrote:
>>>>>>>>
>>>>>>>> On 7/14/23 08:41, Ales Musil wrote:
>>>>>>>>> On Thu, Jul 13, 2023 at 4:39 PM Dumitru Ceara <dceara@redhat.com>
>>>>> wrote:
>>>>>>>>>
>>>>>>>>>> Regular dp-hash is not a canonical L4 hash (at least with the
>>> netlink
>>>>>>>>>> datapath).  If the datapath supports l4 symmetrical dp-hash use
>>> that
>>>>>>> one
>>>>>>>>>> instead.
>>>>>>>>>>
>>>>>>>>>> Reported-at: https://github.com/ovn-org/ovn/issues/112
>>>>>>>>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2188679
>>>>>>>>>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Hi Dumitru,
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> ---
>>>>>>>>>>  include/ovn/features.h |  2 ++
>>>>>>>>>>  lib/actions.c          |  6 ++++++
>>>>>>>>>>  lib/features.c         | 49
>>>>> +++++++++++++++++++++++++++++++++---------
>>>>>>>>>>  3 files changed, 47 insertions(+), 10 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/include/ovn/features.h b/include/ovn/features.h
>>>>>>>>>> index de8f1f5489..3bf536127f 100644
>>>>>>>>>> --- a/include/ovn/features.h
>>>>>>>>>> +++ b/include/ovn/features.h
>>>>>>>>>> @@ -34,12 +34,14 @@ enum ovs_feature_support_bits {
>>>>>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT_BIT,
>>>>>>>>>>      OVS_DP_METER_SUPPORT_BIT,
>>>>>>>>>>      OVS_CT_TUPLE_FLUSH_BIT,
>>>>>>>>>> +    OVS_DP_HASH_L4_SYM_BIT,
>>>>>>>>>>  };
>>>>>>>>>>
>>>>>>>>>>  enum ovs_feature_value {
>>>>>>>>>>      OVS_CT_ZERO_SNAT_SUPPORT = (1 <<
>>> OVS_CT_ZERO_SNAT_SUPPORT_BIT),
>>>>>>>>>>      OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
>>>>>>>>>>      OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
>>>>>>>>>> +    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
>>>>>>>>>>  };
>>>>>>>>>>
>>>>>>>>>>  void ovs_feature_support_destroy(void);
>>>>>>>>>> diff --git a/lib/actions.c b/lib/actions.c
>>>>>>>>>> index 037172e606..9d52cb75a8 100644
>>>>>>>>>> --- a/lib/actions.c
>>>>>>>>>> +++ b/lib/actions.c
>>>>>>>>>> @@ -1625,6 +1625,12 @@ encode_SELECT(const struct ovnact_select
>>>>>>> *select,
>>>>>>>>>>      struct ds ds = DS_EMPTY_INITIALIZER;
>>>>>>>>>>      ds_put_format(&ds, "type=select,selection_method=dp_hash");
>>>>>>>>>>
>>>>>>>>>> +    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
>>>>>>>>>> +        /* Select dp-hash l4_symmetric by setting the upper 32bits
>>>>> of
>>>>>>>>>> +         * selection_method_param to 1: */
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> This comment is a bit unfortunate, because it reads like you want to
>>>>> set
>>>>>>>>> all bits of the upper half
>>>>>>>>> to 1 e.g. 0xffffffff. Maybe change it to: "selection_method_param to
>>>>>>> value
>>>>>>>>> 1 (1 << 32)." during merge, wdyt?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>> Makes sense, thanks for the suggestion!  I changed it accordingly.
>>>>>>>>
>>>>>>>>>> +        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>>      struct mf_subfield sf =
>>> expr_resolve_field(&select->res_field);
>>>>>>>>>>
>>>>>>>>>>      for (size_t bucket_id = 0; bucket_id < select->n_dsts;
>>>>>>> bucket_id++) {
>>>>>>>>>> diff --git a/lib/features.c b/lib/features.c
>>>>>>>>>> index 97c9c86f00..d24e8f6c5c 100644
>>>>>>>>>> --- a/lib/features.c
>>>>>>>>>> +++ b/lib/features.c
>>>>>>>>>> @@ -21,6 +21,7 @@
>>>>>>>>>>  #include "lib/dirs.h"
>>>>>>>>>>  #include "socket-util.h"
>>>>>>>>>>  #include "lib/vswitch-idl.h"
>>>>>>>>>> +#include "odp-netlink.h"
>>>>>>>>>>  #include "openvswitch/vlog.h"
>>>>>>>>>>  #include "openvswitch/ofpbuf.h"
>>>>>>>>>>  #include "openvswitch/rconn.h"
>>>>>>>>>> @@ -33,20 +34,48 @@ VLOG_DEFINE_THIS_MODULE(features);
>>>>>>>>>>
>>>>>>>>>>  #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>>>>>>>>>>
>>>>>>>>>> +/* Parses 'cap_name' from 'ovs_capabilities' and returns whether
>>> the
>>>>>>>>>> + * type of capability is supported or not. */
>>>>>>>>>> +typedef bool ovs_feature_parse_func(const struct smap
>>>>>>> *ovs_capabilities,
>>>>>>>>>> +                                    const char *cap_name);
>>>>>>>>>> +
>>>>>>>>>>  struct ovs_feature {
>>>>>>>>>>      enum ovs_feature_value value;
>>>>>>>>>>      const char *name;
>>>>>>>>>> +    ovs_feature_parse_func *parse;
>>>>>>>>>>  };
>>>>>>>>>>
>>>>>>>>>> +static bool
>>>>>>>>>> +bool_parser(const struct smap *ovs_capabilities, const char
>>>>> *cap_name)
>>>>>>>>>> +{
>>>>>>>>>> +    return smap_get_bool(ovs_capabilities, cap_name, false);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static bool
>>>>>>>>>> +dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
>>>>>>>>>> +                              const char *cap_name OVS_UNUSED)
>>>>>>>>>> +{
>>>>>>>>>> +    int max_hash_alg = smap_get_int(ovs_capabilities,
>>>>> "max_hash_alg",
>>>>>>> 0);
>>>>>>>>>> +
>>>>>>>>>> +    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>  static struct ovs_feature all_ovs_features[] = {
>>>>>>>>>>      {
>>>>>>>>>>          .value = OVS_CT_ZERO_SNAT_SUPPORT,
>>>>>>>>>> -        .name = "ct_zero_snat"
>>>>>>>>>> +        .name = "ct_zero_snat",
>>>>>>>>>> +        .parse = bool_parser,
>>>>>>>>>>      },
>>>>>>>>>>      {
>>>>>>>>>>          .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
>>>>>>>>>> -        .name = "ct_flush"
>>>>>>>>>> -    }
>>>>>>>>>> +        .name = "ct_flush",
>>>>>>>>>> +        .parse = bool_parser,
>>>>>>>>>> +    },
>>>>>>>>>> +    {
>>>>>>>>>> +        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
>>>>>>>>>> +        .name = "dp_hash_l4_sym_support",
>>>>>>>>>> +        .parse = dp_hash_l4_sym_support_parser,
>>>>>>>>>> +    },
>>>>>>>>>>  };
>>>>>>>>>>
>>>>>>>>>>  /* A bitmap of OVS features that have been detected as
>>> 'supported'.
>>>>> */
>>>>>>>>>> @@ -65,6 +94,7 @@ ovs_feature_is_valid(enum ovs_feature_value
>>>>> feature)
>>>>>>>>>>      case OVS_CT_ZERO_SNAT_SUPPORT:
>>>>>>>>>>      case OVS_DP_METER_SUPPORT:
>>>>>>>>>>      case OVS_CT_TUPLE_FLUSH_SUPPORT:
>>>>>>>>>> +    case OVS_DP_HASH_L4_SYM_SUPPORT:
>>>>>>>>>>          return true;
>>>>>>>>>>      default:
>>>>>>>>>>          return false;
>>>>>>>>>> @@ -183,18 +213,17 @@ ovs_feature_support_run(const struct smap
>>>>>>>>>> *ovs_capabilities,
>>>>>>>>>>      }
>>>>>>>>>>
>>>>>>>>>>      for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
>>>>>>>>>> -        enum ovs_feature_value value = all_ovs_features[i].value;
>>>>>>>>>> -        const char *name = all_ovs_features[i].name;
>>>>>>>>>> -        bool old_state = supported_ovs_features & value;
>>>>>>>>>> -        bool new_state = smap_get_bool(ovs_capabilities, name,
>>>>> false);
>>>>>>>>>> +        struct ovs_feature *feature = &all_ovs_features[i];
>>>>>>>>>> +        bool old_state = supported_ovs_features & feature->value;
>>>>>>>>>> +        bool new_state = feature->parse(ovs_capabilities,
>>>>>>> feature->name);
>>>>>>>>>>          if (new_state != old_state) {
>>>>>>>>>>              updated = true;
>>>>>>>>>>              if (new_state) {
>>>>>>>>>> -                supported_ovs_features |= value;
>>>>>>>>>> +                supported_ovs_features |= feature->value;
>>>>>>>>>>              } else {
>>>>>>>>>> -                supported_ovs_features &= ~value;
>>>>>>>>>> +                supported_ovs_features &= ~feature->value;
>>>>>>>>>>              }
>>>>>>>>>> -            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
>>>>>>>>>> +            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s",
>>>>>>> feature->name,
>>>>>>>>>>                           new_state ? "supported" : "not
>>> supported");
>>>>>>>>>>          }
>>>>>>>>>>      }
>>>>>>>>>> --
>>>>>>>>>> 2.31.1
>>>>>>>>>>
>>>>>>>>>> _______________________________________________
>>>>>>>>>> dev mailing list
>>>>>>>>>> dev@openvswitch.org
>>>>>>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>> Other than that it looks good.
>>>>>>>>>
>>>>>>>>> Acked-by: Ales Musil <amusil@redhat.com>
>>>>>>>>>
>>>>>>>>
>>>>>>>> Thanks, I applied this to main.
>>>>>>>>
>>>>>>>> I was wondering if we should backport this though.  It's not exactly
>>> a
>>>>>>>> bug fix but it does avoid using a broken datapath feature (when
>>>>>>>> possible) so it avoids a broken behavior.
>>>>>>>>
>>>>>>>> The changes are quite contained.
>>>>>>>>
>>>>>>>> CC-ing some of the maintainers explicitly to get some more eyes on
>>>>> this.
>>>>>>>>
>>>>>>> Thanks Dumitru. Sorry that I was following the reported issue. I tried
>>>>> to
>>>>>>> look for some more details, but I am not authorized to access bug
>>>>> 2175716.
>>>>>>
>>>>>> Sorry about that, that bug used to be public but some internal policies
>>>>>> made it private.
>>>>>>
>>>>>>> It is not quite clear to me what the impact of this problem is. The
>>> OVN
>>>>>>> issue (112) seems to be a very common configuration of OVN ECMP and I
>>>>>>> assume the problem is not easily reproduced because I have been
>>>>>>> testing/deploying the feature and it worked as expected. Do you have
>>>>> more
>>>>>>> details about the trigger? Are there any related fixes in OVS and
>>>>> kernel?
>>>>>>>
>>>>>>
>>>>>> The original bug report came from OpenShift:
>>>>>> https://issues.redhat.com/browse/OCPBUGS-7406
>>>>>>
>>>>>> CC-ing Patryk as he root caused the original issue.
>>>>>>
>>>>>> The simplified topology is:
>>>>>>                               +--- GW1 ---+
>>>>>> client ---- cluster-router ---+           +--- (out of OVN) --- server
>>>>>>                               +--- GW2 ---+
>>>>>>
>>>>>> On cluster-router there's an ECMP route to reach 'server' via:
>>>>>> - GW1
>>>>>> - GW2
>>>>>>
>>>>>> On GW1 traffic going out of OVN is SNATed to IP1.
>>>>>> On GW2 traffic going out of OVN is SNATed to IP2.
>>>>>>
>>>>>> The client initiates a TCP session towards the server; a path is chosen
>>>>>> based on dp-hash, e.g. GW2.
>>>>>>
>>>>>> From this point on, during normal operation, traffic on this session
>>>>>> flows fine via GW2.
>>>>>>
>>>>>> If the client closes the connection, then for the closing connection
>>>>>> there's an exception when computing the dp-hash if the socket is in
>>>>>> state TIME_WAIT, substate TCP_FIN_WAIT_2:
>>>>>>
>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L2218
>>>>>> In this last case we call:
>>>>>>
>>>>>> tcp_v4_timewait_ack(sk, skb)
>>>>>> |
>>>>>> tcp_v4_send_ack(sk, skb, ...)
>>>>>>
>>>>>> The latter uses a global, per-cpu "ctl_sk" to build the last ACK
>>> packet:
>>>>>> https://elixir.bootlin.com/linux/v6.2.1/source/net/ipv4/tcp_ipv4.c#L925
>>>>>>
>>>>>>         ctl_sk = this_cpu_read(ipv4_tcp_sk);
>>>>>>         sock_net_set(ctl_sk, net);
>>>>>>         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>>>>>>                            inet_twsk(sk)->tw_mark : sk->sk_mark;
>>>>>>         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
>>>>>>                            inet_twsk(sk)->tw_priority :
>>> sk->sk_priority;
>>>>>>         transmit_time = tcp_transmit_time(sk);
>>>>>>         ip_send_unicast_reply(ctl_sk,
>>>>>>                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
>>>>>>                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>>>>>>                               &arg, arg.iov[0].iov_len,
>>>>>>                               transmit_time);
>>>>>>
>>>>>> As ctl_sk's sk_txhash is 0, when transmitting this last ack, its hash
>>> is
>>>>>> computed from the packet contents:
>>>>>>
>>>>>
>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/linux/skbuff.h#L1540
>>>>>>
>>>>>> static inline __u32 skb_get_hash(struct sk_buff *skb)
>>>>>> {
>>>>>>         if (!skb->l4_hash && !skb->sw_hash)
>>>>>>                 __skb_get_hash(skb);
>>>>>>
>>>>>>         return skb->hash;
>>>>>> }
>>>>>> [...]
>>>>>> void __skb_get_hash(struct sk_buff *skb)
>>>>>> {
>>>>>>         struct flow_keys keys;
>>>>>>         u32 hash;
>>>>>>
>>>>>>         __flow_hash_secret_init();
>>>>>>
>>>>>>         hash = ___skb_get_hash(skb, &keys, &hashrnd);
>>>>>>
>>>>>>         __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
>>>>>> }
>>>>>>
>>>>>> But the original socket's txhash was actually a random hash:
>>>>>> https://elixir.bootlin.com/linux/v6.2.1/source/include/net/sock.h#L2128
>>>>>>
>>>>>> static inline void sk_set_txhash(struct sock *sk)
>>>>>> {
>>>>>>         /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
>>>>>>         WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
>>>>>> }
>>>>>>
>>>>>> This means that the last ack might enter the ovs datapath with a
>>>>>> different dp_hash than all other packets sent on that session in that
>>>>>> direction. It's possible that this last ack gets hashed to a different
>>>>>> path of the ECMP route.
>>>>>>
>>>>>> This specific issue was fixed in the kernel through:
>>>>>>
>>>>>>
>>>>>
>>> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#mc84997c4cc755354e5527b41b7a15869536a21f4
>>>>>>
>>>>>> However, while debugging we realized the problem is actually visible
>>>>>> with _any_ retransmit.  That's because a retransmit will trigger a
>>> rehash:
>>>>>>
>>>>>>
>>> https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_output.c#L4124
>>>>>>
>>>>>> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
>>>>>> {
>>>>>>         const struct tcp_request_sock_ops *af_ops =
>>>>> tcp_rsk(req)->af_specific;
>>>>>>         struct flowi fl;
>>>>>>         int res;
>>>>>>
>>>>>>         /* Paired with WRITE_ONCE() in sock_setsockopt() */
>>>>>>         if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
>>>>>>                 tcp_rsk(req)->txhash = net_tx_rndhash();
>>>>>>         res = af_ops->send_synack(sk, NULL, &fl, req, NULL,
>>>>> TCP_SYNACK_NORMAL,
>>>>>>                                   NULL);
>>>>>> [...]
>>>>>>
>>>>>> So suddenly all subsequent packets (skbs) will have a different 'hash'
>>>>>> value and might get hashed to a different path.
>>>>>>
>>>>>> This last part was not fixed.  After a discussion on the netdev mailing
>>>>>> list the conclusion was to not change the hash behavior for locally
>>>>>> terminated TCP sessions:
>>>>>>
>>>>>>
>>>>>
>>> https://lore.kernel.org/all/20230511093456.672221-1-atenart@kernel.org/T/#m1fd8da6d912d5acead66c4c397396e6ac4b160dd
>>>>>>
>>>>>> Eventually the documentation change patch was dropped and a v2 of the
>>>>>> other fixes was accepted:
>>>>>> https://www.spinics.net/lists/netdev/msg906252.html
>>>>>>
>>>>>> The rehash on retransmit problem is still present though making the
>>>>>> kernel's dp-hash implementation unusable.
>>>>>>
>>>>>> During some internal discussions Aaron (in CC) pointed out that OVS
>>>>>> supports L4 symmetric hashing and that should be a stable L4 hash.  OVS
>>>>>> already reports the maximum supported hashing algorithm through the
>>>>>> Datapath.Capabilities.max_hash_alg ovsdb field.  The only problem was
>>>>>> that the kernel datapath didn't support L4 symmetric hashing.  But
>>> Aaron
>>>>>> posted a patch to implement that and the patch got accepted:
>>>>>>
>>>>>> https://www.spinics.net/lists/netdev/msg911045.html
>>>>>>
>>>>>> Then we moved OVN to use L4 symmetric hashing if available.  That's to
>>>>>> ensure correctness of the packet processing.  We did run performance
>>>>>> tests and the hit due to the extra hashing was acceptable, max 3%
>>>>>> throughput degradation in OVN scenarios that simulate common traffic
>>>>>> patterns (like ovn-kubernetes ones) - in most cases the performance
>>>>>> degradation was around 2%.
>>>>>>
>>>>>> Sorry for the long story but I hope it provides all the required
>>>>>> context.  Thinking more about it I should've probably added some of
>>> this
>>>>>> to the original commit log, I apologize.
>>>>>>
>>>>>> Regards,
>>>>>> Dumitru
>>>>>>
>>>>>
>>>>> Thanks Dumitru for the details! It is now quite clear and I am not
>>> against
>>>>> backporting it.
>>>>> But I have a question regarding upgrading. For the existing traffic,
>>> does
>>>>> this mean the packets may choose a different path after OVN upgrading?
>>>>> Should this be called out?
>>>>>
>>>>
>>>> Hi Han,
>>>>
>>>> That's a good question.  They might and this affects all traffic that's
>>>> not locally originated for the kernel datapath and (potentially) all
>>>> traffic for the userspace datapath.  I can post a follow up patch to
>>>> update the NEWS file.  Did you have anything else in mind?
>>>>
>>> Sounds good. I don't have anything else.
>>>
>>
>> I posted
>> https://patchwork.ozlabs.org/project/ovn/patch/20230814135401.2677054-1-dceara@redhat.com/
>>
> 
> I backported the commit and the NEWS update to branches 23.06 and 23.03.
>  It didn't apply cleanly on older branches but I think that's fine for
> now.  We can backport it further later if users request it.
> 

We actually got an internal request to backport this to 22.12 so I
resolved the conflicts (nothing major) and pushed these commits to
branch-22.12 too.

Regards,
Dumitru
diff mbox series

Patch

diff --git a/include/ovn/features.h b/include/ovn/features.h
index de8f1f5489..3bf536127f 100644
--- a/include/ovn/features.h
+++ b/include/ovn/features.h
@@ -34,12 +34,14 @@  enum ovs_feature_support_bits {
     OVS_CT_ZERO_SNAT_SUPPORT_BIT,
     OVS_DP_METER_SUPPORT_BIT,
     OVS_CT_TUPLE_FLUSH_BIT,
+    OVS_DP_HASH_L4_SYM_BIT,
 };
 
 enum ovs_feature_value {
     OVS_CT_ZERO_SNAT_SUPPORT = (1 << OVS_CT_ZERO_SNAT_SUPPORT_BIT),
     OVS_DP_METER_SUPPORT = (1 << OVS_DP_METER_SUPPORT_BIT),
     OVS_CT_TUPLE_FLUSH_SUPPORT = (1 << OVS_CT_TUPLE_FLUSH_BIT),
+    OVS_DP_HASH_L4_SYM_SUPPORT = (1 << OVS_DP_HASH_L4_SYM_BIT),
 };
 
 void ovs_feature_support_destroy(void);
diff --git a/lib/actions.c b/lib/actions.c
index 037172e606..9d52cb75a8 100644
--- a/lib/actions.c
+++ b/lib/actions.c
@@ -1625,6 +1625,12 @@  encode_SELECT(const struct ovnact_select *select,
     struct ds ds = DS_EMPTY_INITIALIZER;
     ds_put_format(&ds, "type=select,selection_method=dp_hash");
 
+    if (ovs_feature_is_supported(OVS_DP_HASH_L4_SYM_SUPPORT)) {
+        /* Select dp-hash l4_symmetric by setting the upper 32bits of
+         * selection_method_param to 1: */
+        ds_put_cstr(&ds, ",selection_method_param=0x100000000");
+    }
+
     struct mf_subfield sf = expr_resolve_field(&select->res_field);
 
     for (size_t bucket_id = 0; bucket_id < select->n_dsts; bucket_id++) {
diff --git a/lib/features.c b/lib/features.c
index 97c9c86f00..d24e8f6c5c 100644
--- a/lib/features.c
+++ b/lib/features.c
@@ -21,6 +21,7 @@ 
 #include "lib/dirs.h"
 #include "socket-util.h"
 #include "lib/vswitch-idl.h"
+#include "odp-netlink.h"
 #include "openvswitch/vlog.h"
 #include "openvswitch/ofpbuf.h"
 #include "openvswitch/rconn.h"
@@ -33,20 +34,48 @@  VLOG_DEFINE_THIS_MODULE(features);
 
 #define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
 
+/* Parses 'cap_name' from 'ovs_capabilities' and returns whether the
+ * type of capability is supported or not. */
+typedef bool ovs_feature_parse_func(const struct smap *ovs_capabilities,
+                                    const char *cap_name);
+
 struct ovs_feature {
     enum ovs_feature_value value;
     const char *name;
+    ovs_feature_parse_func *parse;
 };
 
+static bool
+bool_parser(const struct smap *ovs_capabilities, const char *cap_name)
+{
+    return smap_get_bool(ovs_capabilities, cap_name, false);
+}
+
+static bool
+dp_hash_l4_sym_support_parser(const struct smap *ovs_capabilities,
+                              const char *cap_name OVS_UNUSED)
+{
+    int max_hash_alg = smap_get_int(ovs_capabilities, "max_hash_alg", 0);
+
+    return max_hash_alg == OVS_HASH_ALG_SYM_L4;
+}
+
 static struct ovs_feature all_ovs_features[] = {
     {
         .value = OVS_CT_ZERO_SNAT_SUPPORT,
-        .name = "ct_zero_snat"
+        .name = "ct_zero_snat",
+        .parse = bool_parser,
     },
     {
         .value = OVS_CT_TUPLE_FLUSH_SUPPORT,
-        .name = "ct_flush"
-    }
+        .name = "ct_flush",
+        .parse = bool_parser,
+    },
+    {
+        .value = OVS_DP_HASH_L4_SYM_SUPPORT,
+        .name = "dp_hash_l4_sym_support",
+        .parse = dp_hash_l4_sym_support_parser,
+    },
 };
 
 /* A bitmap of OVS features that have been detected as 'supported'. */
@@ -65,6 +94,7 @@  ovs_feature_is_valid(enum ovs_feature_value feature)
     case OVS_CT_ZERO_SNAT_SUPPORT:
     case OVS_DP_METER_SUPPORT:
     case OVS_CT_TUPLE_FLUSH_SUPPORT:
+    case OVS_DP_HASH_L4_SYM_SUPPORT:
         return true;
     default:
         return false;
@@ -183,18 +213,17 @@  ovs_feature_support_run(const struct smap *ovs_capabilities,
     }
 
     for (size_t i = 0; i < ARRAY_SIZE(all_ovs_features); i++) {
-        enum ovs_feature_value value = all_ovs_features[i].value;
-        const char *name = all_ovs_features[i].name;
-        bool old_state = supported_ovs_features & value;
-        bool new_state = smap_get_bool(ovs_capabilities, name, false);
+        struct ovs_feature *feature = &all_ovs_features[i];
+        bool old_state = supported_ovs_features & feature->value;
+        bool new_state = feature->parse(ovs_capabilities, feature->name);
         if (new_state != old_state) {
             updated = true;
             if (new_state) {
-                supported_ovs_features |= value;
+                supported_ovs_features |= feature->value;
             } else {
-                supported_ovs_features &= ~value;
+                supported_ovs_features &= ~feature->value;
             }
-            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", name,
+            VLOG_INFO_RL(&rl, "OVS Feature: %s, state: %s", feature->name,
                          new_state ? "supported" : "not supported");
         }
     }