diff mbox series

[net-next,08/14] net/mlx5: E-Switch, Add chains and priorities

Message ID 20181018000859.16212-9-saeedm@mellanox.com
State Accepted, archived
Delegated to: David Miller
Headers show
Series [net-next,01/14] net/mlx5: E-Switch, Get counters for offloaded flows from callers | expand

Commit Message

Saeed Mahameed Oct. 18, 2018, 12:08 a.m. UTC
From: Paul Blakey <paulb@mellanox.com>

A chain is a group of priorities, so use the fdb parallel
sub namespaces to implement chains, and a flow table for each
priority in them.

Because these namespaces are parallel to one another, and each is in series
with the slow path fdb, the chains aren't connected to one another (only to
the slow path), and one must use an explicit goto action to reach a
different chain.

Flow tables for the priorities will be created on demand and destroyed
once they are no longer used.

The Firmware has four pools of tables for sizes XS/S/M/L (4k, 64k, 1m, 4m).
We maintain ghost copies of the pools' occupancy.

When a new table is to be created, we scan the pools from large to small
and pick the first table size that can currently be created. When a table
is destroyed, we update the relevant pool.

Multi chain/prio isn't enabled yet by this patch; for now all flows
will use the default chain 0 and prio 1.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |   3 +
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  32 +-
 .../mellanox/mlx5/core/eswitch_offloads.c     | 390 ++++++++++++++----
 3 files changed, 339 insertions(+), 86 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 7487bdd55f23..6c04e11f9a05 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -837,6 +837,9 @@  mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	struct mlx5e_priv *out_priv;
 	int err = 0, encap_err = 0;
 
+	/* keep the old behaviour, use same prio for all offloaded rules */
+	attr->prio = 1;
+
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
 		out_dev = __dev_get_by_index(dev_net(priv->netdev),
 					     attr->parse_attr->mirred_ifindex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 584e735bbad1..54215f4312fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -123,6 +123,13 @@  struct mlx5_vport {
 	u16                     enabled_events;
 };
 
+enum offloads_fdb_flags {
+	ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED = BIT(0),
+};
+
+extern const unsigned int ESW_POOLS[4];
+
+#define PRIO_LEVELS 2
 struct mlx5_eswitch_fdb {
 	union {
 		struct legacy_fdb {
@@ -133,16 +140,24 @@  struct mlx5_eswitch_fdb {
 		} legacy;
 
 		struct offloads_fdb {
-			struct mlx5_flow_table *fast_fdb;
-			struct mlx5_flow_table *fwd_fdb;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *miss_grp;
 			struct mlx5_flow_handle *miss_rule_uni;
 			struct mlx5_flow_handle *miss_rule_multi;
 			int vlan_push_pop_refcount;
+
+			struct {
+				struct mlx5_flow_table *fdb;
+				u32 num_rules;
+			} fdb_prio[FDB_MAX_CHAIN + 1][FDB_MAX_PRIO + 1][PRIO_LEVELS];
+			/* Protects fdb_prio table */
+			struct mutex fdb_prio_lock;
+
+			int fdb_left[ARRAY_SIZE(ESW_POOLS)];
 		} offloads;
 	};
+	u32 flags;
 };
 
 struct mlx5_esw_offload {
@@ -184,6 +199,7 @@  struct mlx5_eswitch {
 
 	struct mlx5_esw_offload offloads;
 	int                     mode;
+	int                     nvports;
 };
 
 void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports);
@@ -236,6 +252,15 @@  mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
 			  struct mlx5_flow_handle *rule,
 			  struct mlx5_esw_flow_attr *attr);
 
+bool
+mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw);
+
+u16
+mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw);
+
+u32
+mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw);
+
 struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport,
 				  struct mlx5_flow_destination *dest);
@@ -274,6 +299,9 @@  struct mlx5_esw_flow_attr {
 	u32	mod_hdr_id;
 	u8	match_level;
 	struct mlx5_fc *counter;
+	u32	chain;
+	u16	prio;
+	u32	dest_chain;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 983bb8a80f75..8501b6c31c02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -37,32 +37,59 @@ 
 #include <linux/mlx5/fs.h>
 #include "mlx5_core.h"
 #include "eswitch.h"
+#include "en.h"
+#include "fs_core.h"
 
 enum {
 	FDB_FAST_PATH = 0,
 	FDB_SLOW_PATH
 };
 
+#define fdb_prio_table(esw, chain, prio, level) \
+	(esw)->fdb_table.offloads.fdb_prio[(chain)][(prio)][(level)]
+
+static struct mlx5_flow_table *
+esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
+static void
+esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
+
+bool mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw)
+{
+	return (!!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED));
+}
+
+u32 mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw)
+{
+	if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
+		return FDB_MAX_CHAIN;
+
+	return 0;
+}
+
+u16 mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw)
+{
+	if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
+		return FDB_MAX_PRIO;
+
+	return U16_MAX;
+}
+
 struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+	bool mirror = !!(attr->mirror_count);
 	struct mlx5_flow_act flow_act = {0};
-	struct mlx5_flow_table *ft = NULL;
 	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_table *fdb;
 	int j, i = 0;
 	void *misc;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	if (attr->mirror_count)
-		ft = esw->fdb_table.offloads.fwd_fdb;
-	else
-		ft = esw->fdb_table.offloads.fast_fdb;
-
 	flow_act.action = attr->action;
 	/* if per flow vlan pop/push is emulated, don't set that into the firmware */
 	if (!mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
@@ -80,13 +107,28 @@  mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	}
 
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
-		for (j = attr->mirror_count; j < attr->out_count; j++) {
-			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-			dest[i].vport.num = attr->out_rep[j]->vport;
-			dest[i].vport.vhca_id =
-				MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
-			dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+		if (attr->dest_chain) {
+			struct mlx5_flow_table *ft;
+
+			ft = esw_get_prio_table(esw, attr->dest_chain, 1, 0);
+			if (IS_ERR(ft)) {
+				rule = ERR_CAST(ft);
+				goto err_create_goto_table;
+			}
+
+			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+			dest[i].ft = ft;
 			i++;
+		} else {
+			for (j = attr->mirror_count; j < attr->out_count; j++) {
+				dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+				dest[i].vport.num = attr->out_rep[j]->vport;
+				dest[i].vport.vhca_id =
+					MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
+				dest[i].vport.vhca_id_valid =
+					!!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+				i++;
+			}
 		}
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
@@ -124,13 +166,26 @@  mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
 		flow_act.reformat_id = attr->encap_id;
 
-	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
+	fdb = esw_get_prio_table(esw, attr->chain, attr->prio, !!mirror);
+	if (IS_ERR(fdb)) {
+		rule = ERR_CAST(fdb);
+		goto err_esw_get;
+	}
+
+	rule = mlx5_add_flow_rules(fdb, spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
-		goto out;
+		goto err_add_rule;
 	else
 		esw->offloads.num_flows++;
 
-out:
+	return rule;
+
+err_add_rule:
+	esw_put_prio_table(esw, attr->chain, attr->prio, !!mirror);
+err_esw_get:
+	if (attr->dest_chain)
+		esw_put_prio_table(esw, attr->dest_chain, 1, 0);
+err_create_goto_table:
 	return rule;
 }
 
@@ -141,10 +196,24 @@  mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
 	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_table *fast_fdb;
+	struct mlx5_flow_table *fwd_fdb;
 	struct mlx5_flow_handle *rule;
 	void *misc;
 	int i;
 
+	fast_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 0);
+	if (IS_ERR(fast_fdb)) {
+		rule = ERR_CAST(fast_fdb);
+		goto err_get_fast;
+	}
+
+	fwd_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 1);
+	if (IS_ERR(fwd_fdb)) {
+		rule = ERR_CAST(fwd_fdb);
+		goto err_get_fwd;
+	}
+
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	for (i = 0; i < attr->mirror_count; i++) {
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
@@ -154,7 +223,7 @@  mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 		dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
 	}
 	dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-	dest[i].ft = esw->fdb_table.offloads.fwd_fdb,
+	dest[i].ft = fwd_fdb,
 	i++;
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
@@ -177,21 +246,49 @@  mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
 					      MLX5_MATCH_MISC_PARAMETERS;
 
-	rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fast_fdb, spec, &flow_act, dest, i);
+	rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i);
 
-	if (!IS_ERR(rule))
-		esw->offloads.num_flows++;
+	if (IS_ERR(rule))
+		goto add_err;
 
+	esw->offloads.num_flows++;
+
+	return rule;
+add_err:
+	esw_put_prio_table(esw, attr->chain, attr->prio, 1);
+err_get_fwd:
+	esw_put_prio_table(esw, attr->chain, attr->prio, 0);
+err_get_fast:
 	return rule;
 }
 
+static void
+__mlx5_eswitch_del_rule(struct mlx5_eswitch *esw,
+			struct mlx5_flow_handle *rule,
+			struct mlx5_esw_flow_attr *attr,
+			bool fwd_rule)
+{
+	bool mirror = (attr->mirror_count > 0);
+
+	mlx5_del_flow_rules(rule);
+	esw->offloads.num_flows--;
+
+	if (fwd_rule)  {
+		esw_put_prio_table(esw, attr->chain, attr->prio, 1);
+		esw_put_prio_table(esw, attr->chain, attr->prio, 0);
+	} else {
+		esw_put_prio_table(esw, attr->chain, attr->prio, !!mirror);
+		if (attr->dest_chain)
+			esw_put_prio_table(esw, attr->dest_chain, 1, 0);
+	}
+}
+
 void
 mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_handle *rule,
 				struct mlx5_esw_flow_attr *attr)
 {
-	mlx5_del_flow_rules(rule);
-	esw->offloads.num_flows--;
+	__mlx5_eswitch_del_rule(esw, rule, attr, false);
 }
 
 void
@@ -199,7 +296,7 @@  mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
 			  struct mlx5_flow_handle *rule,
 			  struct mlx5_esw_flow_attr *attr)
 {
-	mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+	__mlx5_eswitch_del_rule(esw, rule, attr, true);
 }
 
 static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
@@ -288,7 +385,8 @@  int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
 
 	push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH);
 	pop  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
-	fwd  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+	fwd  = !!((attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
+		   !attr->dest_chain);
 
 	err = esw_add_vlan_action_check(attr, push, pop, fwd);
 	if (err)
@@ -495,75 +593,164 @@  static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 
 #define ESW_OFFLOADS_NUM_GROUPS  4
 
-static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+/* Firmware currently has 4 pools of 4 sizes that it supports (ESW_POOLS),
+ * and a virtual memory region of 16M (ESW_SIZE); this region is duplicated
+ * for each flow table pool. We can allocate up to 16M of each pool, and we
+ * keep track of how much is left via get_sz_from_pool()/put_sz_to_pool().
+ * Firmware doesn't report any of this for now.
+ * ESW_POOLS is expected to be sorted from large to small.
+ */
+#define ESW_SIZE (16 * 1024 * 1024)
+const unsigned int ESW_POOLS[4] = { 4 * 1024 * 1024, 1 * 1024 * 1024,
+				    64 * 1024, 4 * 1024 };
+
+static int
+get_sz_from_pool(struct mlx5_eswitch *esw)
+{
+	int sz = 0, i;
+
+	for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) {
+		if (esw->fdb_table.offloads.fdb_left[i]) {
+			--esw->fdb_table.offloads.fdb_left[i];
+			sz = ESW_POOLS[i];
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static void
+put_sz_to_pool(struct mlx5_eswitch *esw, int sz)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) {
+		if (sz >= ESW_POOLS[i]) {
+			++esw->fdb_table.offloads.fdb_left[i];
+			break;
+		}
+	}
+}
+
+static struct mlx5_flow_table *
+create_next_size_table(struct mlx5_eswitch *esw,
+		       struct mlx5_flow_namespace *ns,
+		       u16 table_prio,
+		       int level,
+		       u32 flags)
+{
+	struct mlx5_flow_table *fdb;
+	int sz;
+
+	sz = get_sz_from_pool(esw);
+	if (!sz)
+		return ERR_PTR(-ENOSPC);
+
+	fdb = mlx5_create_auto_grouped_flow_table(ns,
+						  table_prio,
+						  sz,
+						  ESW_OFFLOADS_NUM_GROUPS,
+						  level,
+						  flags);
+	if (IS_ERR(fdb)) {
+		esw_warn(esw->dev, "Failed to create FDB Table err %d (table prio: %d, level: %d, size: %d)\n",
+			 (int)PTR_ERR(fdb), table_prio, level, sz);
+		put_sz_to_pool(esw, sz);
+	}
+
+	return fdb;
+}
+
+static struct mlx5_flow_table *
+esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
 {
 	struct mlx5_core_dev *dev = esw->dev;
-	struct mlx5_flow_namespace *root_ns;
 	struct mlx5_flow_table *fdb = NULL;
-	int esw_size, err = 0;
+	struct mlx5_flow_namespace *ns;
+	int table_prio, l = 0;
 	u32 flags = 0;
-	u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
-				MLX5_CAP_GEN(dev, max_flow_counter_15_0);
 
-	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
-	if (!root_ns) {
-		esw_warn(dev, "Failed to get FDB flow namespace\n");
-		err = -EOPNOTSUPP;
-		goto out_namespace;
-	}
-
-	esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
-		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
-		  max_flow_counter, ESW_OFFLOADS_NUM_GROUPS);
+	mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
 
-	esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
-			 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+	fdb = fdb_prio_table(esw, chain, prio, level).fdb;
+	if (fdb) {
+		/* take ref on earlier levels as well */
+		while (level >= 0)
+			fdb_prio_table(esw, chain, prio, level--).num_rules++;
+		mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+		return fdb;
+	}
 
-	if (mlx5_esw_has_fwd_fdb(dev))
-		esw_size >>= 1;
+	ns = mlx5_get_fdb_sub_ns(dev, chain);
+	if (!ns) {
+		esw_warn(dev, "Failed to get FDB sub namespace\n");
+		mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
 
 	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
 		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
-	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
-						  esw_size,
-						  ESW_OFFLOADS_NUM_GROUPS, 0,
-						  flags);
-	if (IS_ERR(fdb)) {
-		err = PTR_ERR(fdb);
-		esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
-		goto out_namespace;
-	}
-	esw->fdb_table.offloads.fast_fdb = fdb;
+	table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
 
-	if (!mlx5_esw_has_fwd_fdb(dev))
-		goto out_namespace;
+	/* create earlier levels for correct fs_core lookup when
+	 * connecting tables
+	 */
+	for (l = 0; l <= level; l++) {
+		if (fdb_prio_table(esw, chain, prio, l).fdb) {
+			fdb_prio_table(esw, chain, prio, l).num_rules++;
+			continue;
+		}
 
-	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
-						  esw_size,
-						  ESW_OFFLOADS_NUM_GROUPS, 1,
-						  flags);
-	if (IS_ERR(fdb)) {
-		err = PTR_ERR(fdb);
-		esw_warn(dev, "Failed to create fwd table err %d\n", err);
-		goto out_ft;
+		fdb = create_next_size_table(esw, ns, table_prio, l, flags);
+		if (IS_ERR(fdb)) {
+			l--;
+			goto err_create_fdb;
+		}
+
+		fdb_prio_table(esw, chain, prio, l).fdb = fdb;
+		fdb_prio_table(esw, chain, prio, l).num_rules = 1;
 	}
-	esw->fdb_table.offloads.fwd_fdb = fdb;
 
-	return err;
+	mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+	return fdb;
 
-out_ft:
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
-out_namespace:
-	return err;
+err_create_fdb:
+	mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+	if (l >= 0)
+		esw_put_prio_table(esw, chain, prio, l);
+
+	return fdb;
 }
 
-static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+static void
+esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
 {
-	if (mlx5_esw_has_fwd_fdb(esw->dev))
-		mlx5_destroy_flow_table(esw->fdb_table.offloads.fwd_fdb);
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+	int l;
+
+	mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
+
+	for (l = level; l >= 0; l--) {
+		if (--(fdb_prio_table(esw, chain, prio, l).num_rules) > 0)
+			continue;
+
+		put_sz_to_pool(esw, fdb_prio_table(esw, chain, prio, l).fdb->max_fte);
+		mlx5_destroy_flow_table(fdb_prio_table(esw, chain, prio, l).fdb);
+		fdb_prio_table(esw, chain, prio, l).fdb = NULL;
+	}
+
+	mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+}
+
+static void esw_destroy_offloads_fast_fdb_tables(struct mlx5_eswitch *esw)
+{
+	/* If lazy creation isn't supported, deref the fast path tables */
+	if (!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)) {
+		esw_put_prio_table(esw, 0, 1, 1);
+		esw_put_prio_table(esw, 0, 1, 0);
+	}
 }
 
 #define MAX_PF_SQ 256
@@ -574,12 +761,13 @@  static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	struct mlx5_flow_table_attr ft_attr = {};
 	struct mlx5_core_dev *dev = esw->dev;
+	u32 *flow_group_in, max_flow_counter;
 	struct mlx5_flow_namespace *root_ns;
 	struct mlx5_flow_table *fdb = NULL;
-	int table_size, ix, err = 0;
+	int table_size, ix, err = 0, i;
 	struct mlx5_flow_group *g;
+	u32 flags = 0, fdb_max;
 	void *match_criteria;
-	u32 *flow_group_in;
 	u8 *dmac;
 
 	esw_debug(esw->dev, "Create offloads FDB Tables\n");
@@ -594,12 +782,29 @@  static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 		goto ns_err;
 	}
 
-	err = esw_create_offloads_fast_fdb_table(esw);
-	if (err)
-		goto fast_fdb_err;
+	max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
+			    MLX5_CAP_GEN(dev, max_flow_counter_15_0);
+	fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size);
+
+	esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d), groups(%d), max flow table size(2^%d))\n",
+		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
+		  max_flow_counter, ESW_OFFLOADS_NUM_GROUPS,
+		  fdb_max);
+
+	for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++)
+		esw->fdb_table.offloads.fdb_left[i] =
+			ESW_POOLS[i] <= fdb_max ? ESW_SIZE / ESW_POOLS[i] : 0;
 
 	table_size = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ + 2;
 
+	/* create the slow path fdb with encap set, so further table instances
+	 * can be created at run time while VFs are probed if the FW allows that.
+	 */
+	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
+		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
+			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
+
+	ft_attr.flags = flags;
 	ft_attr.max_fte = table_size;
 	ft_attr.prio = FDB_SLOW_PATH;
 
@@ -611,6 +816,18 @@  static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	}
 	esw->fdb_table.offloads.slow_fdb = fdb;
 
+	/* If lazy creation isn't supported, open the fast path tables now */
+	if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, multi_fdb_encap) &&
+	    esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) {
+		esw->fdb_table.flags &= ~ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
+		esw_warn(dev, "Lazy creation of flow tables isn't supported, ignoring priorities\n");
+		esw_get_prio_table(esw, 0, 1, 0);
+		esw_get_prio_table(esw, 0, 1, 1);
+	} else {
+		esw_debug(dev, "Lazy creation of flow tables supported, deferring table opening\n");
+		esw->fdb_table.flags |= ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
+	}
+
 	/* create send-to-vport group */
 	memset(flow_group_in, 0, inlen);
 	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
@@ -658,6 +875,7 @@  static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	if (err)
 		goto miss_rule_err;
 
+	esw->nvports = nvports;
 	kvfree(flow_group_in);
 	return 0;
 
@@ -666,10 +884,9 @@  static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 miss_err:
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 send_vport_err:
+	esw_destroy_offloads_fast_fdb_tables(esw);
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
-	esw_destroy_offloads_fast_fdb_table(esw);
-fast_fdb_err:
 ns_err:
 	kvfree(flow_group_in);
 	return err;
@@ -677,7 +894,7 @@  static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 
 static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 {
-	if (!esw->fdb_table.offloads.fast_fdb)
+	if (!esw->fdb_table.offloads.slow_fdb)
 		return;
 
 	esw_debug(esw->dev, "Destroy offloads FDB Tables\n");
@@ -687,7 +904,7 @@  static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
 
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
-	esw_destroy_offloads_fast_fdb_table(esw);
+	esw_destroy_offloads_fast_fdb_tables(esw);
 }
 
 static int esw_create_offloads_table(struct mlx5_eswitch *esw)
@@ -944,6 +1161,8 @@  int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
 {
 	int err;
 
+	mutex_init(&esw->fdb_table.offloads.fdb_prio_lock);
+
 	err = esw_create_offloads_fdb_tables(esw, nvports);
 	if (err)
 		return err;
@@ -1272,16 +1491,19 @@  int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
 		return -EOPNOTSUPP;
 	}
 
-	esw_destroy_offloads_fast_fdb_table(esw);
+	esw_destroy_offloads_fdb_tables(esw);
 
 	esw->offloads.encap = encap;
-	err = esw_create_offloads_fast_fdb_table(esw);
+
+	err = esw_create_offloads_fdb_tables(esw, esw->nvports);
+
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed re-creating fast FDB table");
 		esw->offloads.encap = !encap;
-		(void)esw_create_offloads_fast_fdb_table(esw);
+		(void)esw_create_offloads_fdb_tables(esw, esw->nvports);
 	}
+
 	return err;
 }