[net-next,02/14] net/mlx5e: RX, Support multiple outstanding UMR posts

Message ID 20190422223306.31568-3-saeedm@mellanox.com
State Changes Requested
Delegated to: David Miller
Series [net-next,01/14] net/mlx5e: RX, Add a prefetch command for small L1_CACHE_BYTES

Commit Message

Saeed Mahameed April 22, 2019, 10:32 p.m. UTC
From: Tariq Toukan <tariqt@mellanox.com>

The buffer mapping of the Multi-Packet WQEs (of Striding RQ)
is done via UMR posts, one UMR WQE per RX MPWQE.

A single MPWQE is capable of serving many incoming packets, usually
more than the budget of a single napi cycle.
Hence, posting a single UMR WQE per napi cycle (and handling its
completion in the next cycle) works fine in many common cases, but
not always.
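
For reference, a minimal sketch of the pre-patch flow, condensed from
the existing mlx5e_post_rx_mpwqes() (not verbatim; error/state checks
omitted):

  /* Pre-patch: at most one UMR WQE in flight per RQ. */
  bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
  {
          struct mlx5_wq_ll *wq = &rq->mpwqe.wq;

          mlx5e_poll_ico_cq(&rq->channel->icosq.cq, rq);

          if (mlx5_wq_ll_is_full(wq))
                  return false;

          if (!rq->mpwqe.umr_in_progress)
                  /* posts one UMR WQE and rings the doorbell */
                  mlx5e_alloc_rx_mpwqe(rq, wq->head);
          else
                  rq->stats->congst_umr += mlx5_wq_ll_missing(wq) > 2;

          return false;
  }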

When an XDP program is loaded, every MPWQE is capable of serving fewer
packets, to satisfy the packet-per-page requirement.
Thus, for the same number of packets, more MPWQEs (and UMR posts)
are needed (twice as many for the default MTU), leaving less latency
room for the UMR completions.
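
To make the "twice as many" figure concrete, here is a rough
back-of-the-envelope sketch. The constants are assumptions for a
common default setup (64 pages per MPWQE, 4KB pages, ~2KB strides at
the default MTU), chosen for illustration rather than read from the
driver:

  /* Illustration only: how many packets one MPWQE can serve. */
  #define EXAMPLE_PAGES_PER_WQE   64      /* cf. MLX5_MPWRQ_PAGES_PER_WQE */
  #define EXAMPLE_PAGE_SIZE       4096
  #define EXAMPLE_STRIDE_SIZE     2048    /* assumed default-MTU stride */

  static unsigned int example_pkts_per_mpwqe(bool xdp)
  {
          unsigned int wqe_bytes = EXAMPLE_PAGES_PER_WQE * EXAMPLE_PAGE_SIZE;
          /* XDP requires one packet per page; otherwise one per stride. */
          unsigned int bytes_per_pkt = xdp ? EXAMPLE_PAGE_SIZE
                                           : EXAMPLE_STRIDE_SIZE;

          return wqe_bytes / bytes_per_pkt;       /* 64 with XDP, 128 without */
  }

So the same packet budget consumes MPWQEs (and hence UMR posts) twice
as fast once XDP is enabled.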

In this patch, we add support for multiple outstanding UMR posts,
to allow faster gap closure between consuming MPWQEs and reposting
them back into the WQ.

For better SW and HW locality, we combine the UMR posts in bulks of
(at least) two.
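
Conceptually, the post path now tracks how many UMRs are outstanding,
posts the missing WQEs in bulks, and rings the ICOSQ doorbell once per
bulk. A condensed sketch of the new mlx5e_post_rx_mpwqes() (see the
en_rx.c hunk below for the exact code):

  #define UMR_WQE_BULK (2)

  bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
  {
          struct mlx5e_icosq *sq = &rq->channel->icosq;
          struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
          u8 missing, i;
          u16 head;

          mlx5e_poll_ico_cq(&sq->cq, rq); /* may complete several UMRs at once */

          /* Count only WQEs that are neither populated nor already in flight. */
          missing = mlx5_wq_ll_missing(wq) - rq->mpwqe.umr_in_progress;
          if (missing < UMR_WQE_BULK)
                  return false;

          head = rq->mpwqe.actual_wq_head;
          for (i = missing; i; i--) {
                  if (mlx5e_alloc_rx_mpwqe(rq, head)) /* stashes doorbell_cseg */
                          break;
                  head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
          }

          /* One doorbell for the whole bulk. */
          if (sq->doorbell_cseg) {
                  mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg);
                  sq->doorbell_cseg = NULL;
          }

          rq->mpwqe.umr_in_progress += missing - i;
          rq->mpwqe.actual_wq_head   = head;
          return false;
  }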

This is expected to improve the packet rate at high CPU scale.

Performance test:
As expected, a huge improvement in the large-scale case (48 cores).

xdp_redirect_map, 64B UDP multi-stream.
Redirect from ConnectX-5 100Gbps to ConnectX-6 100Gbps.
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz.

Before: Unstable, 7 to 30 Mpps
After:  Stable,   at 70.5 Mpps

No degradation in other tested scenarios.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  10 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  36 ++++-
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 130 ++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/wq.h  |  12 ++
 4 files changed, 136 insertions(+), 52 deletions(-)

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6147be23a9b9..6b9f661a0577 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -462,10 +462,10 @@  struct mlx5e_xdpsq {
 
 struct mlx5e_icosq {
 	/* data path */
+	u16                        cc;
+	u16                        pc;
 
-	/* dirtied @xmit */
-	u16                        pc ____cacheline_aligned_in_smp;
-
+	struct mlx5_wqe_ctrl_seg  *doorbell_cseg;
 	struct mlx5e_cq            cq;
 
 	/* write@xmit, read@completion */
@@ -563,8 +563,10 @@  struct mlx5e_rq {
 			struct mlx5e_mpw_info *info;
 			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
 			u16                    num_strides;
+			u16                    actual_wq_head;
 			u8                     log_stride_sz;
-			bool                   umr_in_progress;
+			u8                     umr_in_progress;
+			u8                     umr_last_bulk;
 		} mpwqe;
 	};
 	struct {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5c127fccad60..7ab195ac7299 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -903,10 +903,14 @@  static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 
 	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+		u16 head = wq->head;
+		int i;
 
-		/* UMR WQE (if in progress) is always at wq->head */
-		if (rq->mpwqe.umr_in_progress)
-			rq->dealloc_wqe(rq, wq->head);
+		/* Outstanding UMR WQEs (in progress) start at wq->head */
+		for (i = 0; i < rq->mpwqe.umr_in_progress; i++) {
+			rq->dealloc_wqe(rq, head);
+			head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
+		}
 
 		while (!mlx5_wq_ll_is_empty(wq)) {
 			struct mlx5e_rx_wqe_ll *wqe;
@@ -1092,7 +1096,7 @@  static void mlx5e_free_icosq_db(struct mlx5e_icosq *sq)
 
 static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa)
 {
-	u8 wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
+	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
 
 	sq->db.ico_wqe = kvzalloc_node(array_size(wq_sz,
 						  sizeof(*sq->db.ico_wqe)),
@@ -2108,6 +2112,13 @@  static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
 	return order_base_2(sz);
 }
 
+static u8 mlx5e_get_rq_log_wq_sz(void *rqc)
+{
+	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+
+	return MLX5_GET(wq, wq, log_wq_sz);
+}
+
 static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 				 struct mlx5e_params *params,
 				 struct mlx5e_rq_param *param)
@@ -2274,13 +2285,28 @@  static void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv,
 	param->is_mpw = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_XDP_TX_MPWQE);
 }
 
+static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5e_params *params,
+				      struct mlx5e_rq_param *rqp)
+{
+	switch (params->rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return order_base_2(MLX5E_UMR_WQEBBS) +
+			mlx5e_get_rq_log_wq_sz(rqp->rqc);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+	}
+}
+
 static void mlx5e_build_channel_param(struct mlx5e_priv *priv,
 				      struct mlx5e_params *params,
 				      struct mlx5e_channel_param *cparam)
 {
-	u8 icosq_log_wq_sz = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+	u8 icosq_log_wq_sz;
 
 	mlx5e_build_rq_param(priv, params, &cparam->rq);
+
+	icosq_log_wq_sz = mlx5e_build_icosq_log_wq_sz(params, &cparam->rq);
+
 	mlx5e_build_sq_param(priv, params, &cparam->sq);
 	mlx5e_build_xdpsq_param(priv, params, &cparam->xdp_sq);
 	mlx5e_build_icosq_param(priv, icosq_log_wq_sz, &cparam->icosq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 0148ca6ed4ae..0f3da3ae858f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -408,14 +408,15 @@  mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle
 			mlx5e_page_release(rq, &dma_info[i], recycle);
 }
 
-static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
+static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n)
 {
 	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
-	struct mlx5e_rx_wqe_ll *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
-	rq->mpwqe.umr_in_progress = false;
+	do {
+		u16 next_wqe_index = mlx5_wq_ll_get_wqe_next_ix(wq, wq->head);
 
-	mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
+		mlx5_wq_ll_push(wq, next_wqe_index);
+	} while (--n);
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
@@ -425,7 +426,7 @@  static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 
 static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
 {
-	return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+	return mlx5_wq_cyc_get_ctr_wrap_cnt(&sq->wq, sq->pc);
 }
 
 static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq,
@@ -477,8 +478,6 @@  static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 	wi->consumed_strides = 0;
 
-	rq->mpwqe.umr_in_progress = true;
-
 	umr_wqe->ctrl.opmod_idx_opcode =
 		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
 			    MLX5_OPCODE_UMR);
@@ -486,7 +485,8 @@  static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 	sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
 	sq->pc += MLX5E_UMR_WQEBBS;
-	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &umr_wqe->ctrl);
+
+	sq->doorbell_cseg = &umr_wqe->ctrl;
 
 	return 0;
 
@@ -541,37 +541,13 @@  bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	return !!err;
 }
 
-static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq,
-					     struct mlx5e_icosq *sq,
-					     struct mlx5e_rq *rq,
-					     struct mlx5_cqe64 *cqe)
-{
-	struct mlx5_wq_cyc *wq = &sq->wq;
-	u16 ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
-	struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
-
-	mlx5_cqwq_pop(&cq->wq);
-
-	if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
-		netdev_WARN_ONCE(cq->channel->netdev,
-				 "Bad OP in ICOSQ CQE: 0x%x\n", get_cqe_opcode(cqe));
-		return;
-	}
-
-	if (likely(icowi->opcode == MLX5_OPCODE_UMR)) {
-		mlx5e_post_rx_mpwqe(rq);
-		return;
-	}
-
-	if (unlikely(icowi->opcode != MLX5_OPCODE_NOP))
-		netdev_WARN_ONCE(cq->channel->netdev,
-				 "Bad OPCODE in ICOSQ WQE info: 0x%x\n", icowi->opcode);
-}
-
 static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
 {
 	struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
 	struct mlx5_cqe64 *cqe;
+	u8  completed_umr = 0;
+	u16 sqcc;
+	int i;
 
 	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
 		return;
@@ -580,28 +556,96 @@  static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
 	if (likely(!cqe))
 		return;
 
-	/* by design, there's only a single cqe */
-	mlx5e_poll_ico_single_cqe(cq, sq, rq, cqe);
+	/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
+	 * otherwise a cq overrun may occur
+	 */
+	sqcc = sq->cc;
+
+	i = 0;
+	do {
+		u16 wqe_counter;
+		bool last_wqe;
+
+		mlx5_cqwq_pop(&cq->wq);
+
+		wqe_counter = be16_to_cpu(cqe->wqe_counter);
+
+		if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
+			netdev_WARN_ONCE(cq->channel->netdev,
+					 "Bad OP in ICOSQ CQE: 0x%x\n", get_cqe_opcode(cqe));
+			break;
+		}
+		do {
+			struct mlx5e_sq_wqe_info *wi;
+			u16 ci;
+
+			last_wqe = (sqcc == wqe_counter);
+
+			ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc);
+			wi = &sq->db.ico_wqe[ci];
+
+			if (likely(wi->opcode == MLX5_OPCODE_UMR)) {
+				sqcc += MLX5E_UMR_WQEBBS;
+				completed_umr++;
+			} else if (likely(wi->opcode == MLX5_OPCODE_NOP)) {
+				sqcc++;
+			} else {
+				netdev_WARN_ONCE(cq->channel->netdev,
+						 "Bad OPCODE in ICOSQ WQE info: 0x%x\n",
+						 wi->opcode);
+			}
+
+		} while (!last_wqe);
+
+	} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
+
+	sq->cc = sqcc;
 
 	mlx5_cqwq_update_db_record(&cq->wq);
+
+	if (likely(completed_umr)) {
+		mlx5e_post_rx_mpwqe(rq, completed_umr);
+		rq->mpwqe.umr_in_progress -= completed_umr;
+	}
 }
 
 bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 {
+	struct mlx5e_icosq *sq = &rq->channel->icosq;
 	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+	u8  missing, i;
+	u16 head;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	mlx5e_poll_ico_cq(&rq->channel->icosq.cq, rq);
+	mlx5e_poll_ico_cq(&sq->cq, rq);
+
+	missing = mlx5_wq_ll_missing(wq) - rq->mpwqe.umr_in_progress;
 
-	if (mlx5_wq_ll_is_full(wq))
+	if (unlikely(rq->mpwqe.umr_in_progress > rq->mpwqe.umr_last_bulk))
+		rq->stats->congst_umr++;
+
+#define UMR_WQE_BULK (2)
+	if (likely(missing < UMR_WQE_BULK))
 		return false;
 
-	if (!rq->mpwqe.umr_in_progress)
-		mlx5e_alloc_rx_mpwqe(rq, wq->head);
-	else
-		rq->stats->congst_umr += mlx5_wq_ll_missing(wq) > 2;
+	head = rq->mpwqe.actual_wq_head;
+	i = missing;
+	do {
+		if (unlikely(mlx5e_alloc_rx_mpwqe(rq, head)))
+			break;
+		head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
+	} while (--i);
+
+	rq->mpwqe.umr_last_bulk    = missing - i;
+	if (sq->doorbell_cseg) {
+		mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg);
+		sq->doorbell_cseg = NULL;
+	}
+
+	rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
+	rq->mpwqe.actual_wq_head   = head;
 
 	return false;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index ea934a48c90a..1f87cce421e0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -134,6 +134,11 @@  static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
 	*wq->db = cpu_to_be32(wq->wqe_ctr);
 }
 
+static inline u16 mlx5_wq_cyc_get_ctr_wrap_cnt(struct mlx5_wq_cyc *wq, u16 ctr)
+{
+	return ctr >> wq->fbc.log_sz;
+}
+
 static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
 {
 	return ctr & wq->fbc.sz_m1;
@@ -243,6 +248,13 @@  static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
 }
 
+static inline u16 mlx5_wq_ll_get_wqe_next_ix(struct mlx5_wq_ll *wq, u16 ix)
+{
+	struct mlx5_wqe_srq_next_seg *wqe = mlx5_wq_ll_get_wqe(wq, ix);
+
+	return be16_to_cpu(wqe->next_wqe_index);
+}
+
 static inline void mlx5_wq_ll_push(struct mlx5_wq_ll *wq, u16 head_next)
 {
 	wq->head = head_next;