[for-next,9/9] RDMA/core: Provide RDMA DIM support for ULPs

Message ID 20190605232348.6452-10-saeedm@mellanox.com
State Not Applicable
Delegated to: David Miller
Series [for-next,1/9] linux/dim: Move logic to dim.h

Commit Message

Saeed Mahameed June 5, 2019, 11:24 p.m. UTC
From: Yamin Friedman <yaminf@mellanox.com>

Added an interface to the InfiniBand core that applies rdma_dim adaptive
interrupt moderation to completion queues. There is now a dedicated function
for allocating an ib_cq that uses rdma_dim.
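
For reference (not part of this patch), a ULP would opt in by calling the new
helper instead of ib_alloc_cq(); the ibdev, queue, nr_cqe and comp_vector
names below are illustrative placeholders:

	struct ib_cq *cq;

	/* Allocate a CQ whose interrupt moderation is tuned by rdma_dim. */
	cq = ib_alloc_cq_dim(ibdev, queue, nr_cqe, comp_vector,
			     IB_POLL_SOFTIRQ);
	if (IS_ERR(cq))
		return PTR_ERR(cq);

	/* ... post work requests, handle completions ... */

	ib_free_cq(cq);

If the provider does not implement ops.modify_cq, rdma_dim_init() fails and
the CQ transparently falls back to the existing non-DIM polling path.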

Performance improvement (ConnectX-5 100GbE, x86) running the FIO benchmark
over NVMf between two identical 56-core end-hosts across a Mellanox switch,
using a null_blk backing device:

IO READS without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.8GiB/s | 7.7M | 1401  usec               | 2442  usec
4k       | 7.0GiB/s | 1.8M | 4817  usec               | 6587  usec
64k      | 10.7GiB/s| 175k | 9896  usec               | 10028 usec

IO WRITES without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.6GiB/s | 7.5M | 1434  usec               | 2474  usec
4k       | 6.3GiB/s | 1.6M | 938   usec               | 1221  usec
64k      | 10.7GiB/s| 175k | 8979  usec               | 12780 usec

IO READS with DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 4GiB/s   | 8.2M | 816    usec              | 889   usec
4k       | 10.1GiB/s| 2.65M| 3359   usec              | 5080  usec
64k      | 10.7GiB/s| 175k | 9896   usec              | 10028 usec

IO WRITES with DIM:
blk size | BW       | IOPS  | 99th percentile latency | 99.99th latency
512B     | 3.9GiB/s | 8.1M  | 799   usec              | 922   usec
4k       | 9.6GiB/s | 2.5M  | 717   usec              | 1004  usec
64k      | 10.7GiB/s| 176k  | 8586  usec              | 12256 usec

The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/core/cq.c                  | 78 ++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/Kconfig    |  1 +
 .../net/ethernet/mellanox/mlx5/core/Kconfig   |  1 +
 include/rdma/ib_verbs.h                       | 27 ++++++-
 4 files changed, 102 insertions(+), 5 deletions(-)

Patch

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a4c81992267c..326d928d2763 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -14,6 +14,7 @@ 
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <rdma/ib_verbs.h>
+#include <linux/rdma_dim.h>
 
 /* # of WCs to poll for with a single call to ib_poll_cq */
 #define IB_POLL_BATCH			16
@@ -26,6 +27,32 @@ 
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+	struct dim *dim = container_of(w, struct dim, work);
+	struct ib_cq *cq = container_of(dim, struct ib_cq, dim);
+
+	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+	u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+	dim->state = DIM_START_MEASURE;
+
+	cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static bool rdma_dim_init(struct dim *dim, struct ib_cq *cq)
+{
+	if (!cq->device->ops.modify_cq)
+		return false;
+
+	memset(dim, 0, sizeof(*dim));
+	dim->state = DIM_START_MEASURE;
+	dim->tune_state = DIM_GOING_RIGHT;
+	dim->profile_ix = RDMA_DIM_START_PROFILE;
+
+	return true;
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 			   int batch)
 {
@@ -98,6 +125,24 @@  static int ib_poll_handler(struct irq_poll *iop, int budget)
 	return completed;
 }
 
+static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
+{
+	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	int completed;
+	struct dim *dim = &cq->dim;
+
+	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+	if (completed < budget) {
+		irq_poll_complete(&cq->iop);
+		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+			irq_poll_sched(&cq->iop);
+	}
+
+	rdma_dim(dim, completed);
+
+	return completed;
+}
+
 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 {
 	irq_poll_sched(&cq->iop);
@@ -105,14 +150,18 @@  static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 
 static void ib_cq_poll_work(struct work_struct *work)
 {
-	struct ib_cq *cq = container_of(work, struct ib_cq, work);
+	struct ib_cq *cq = container_of(work, struct ib_cq,
+					work);
 	int completed;
 
 	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
 				    IB_POLL_BATCH);
+
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
 		queue_work(cq->comp_wq, &cq->work);
+	else if (cq->dim_used)
+		rdma_dim(&cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -129,6 +178,7 @@  static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
  * @poll_ctx:		context to poll the CQ from.
  * @caller:		module owner name.
  * @udata:		Valid user data or NULL for kernel object
+ * @use_dim:		use dynamic interrupt moderation
  *
  * This is the proper interface to allocate a CQ for in-kernel users. A
  * CQ allocated with this interface will automatically be polled from the
@@ -138,7 +188,8 @@  static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 				 int nr_cqe, int comp_vector,
 				 enum ib_poll_context poll_ctx,
-				 const char *caller, struct ib_udata *udata)
+				 const char *caller, struct ib_udata *udata,
+				 bool use_dim)
 {
 	struct ib_cq_init_attr cq_attr = {
 		.cqe		= nr_cqe,
@@ -173,13 +224,30 @@  struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	case IB_POLL_SOFTIRQ:
 		cq->comp_handler = ib_cq_completion_softirq;
 
-		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+		if (use_dim)
+			cq->dim_used = rdma_dim_init(&cq->dim, cq);
+
+		if (cq->dim_used) {
+			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+				      ib_poll_dim_handler);
+			INIT_WORK(&cq->dim.work, ib_cq_rdma_dim_work);
+		} else {
+			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+				      ib_poll_handler);
+		}
+
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		break;
 	case IB_POLL_WORKQUEUE:
 	case IB_POLL_UNBOUND_WORKQUEUE:
 		cq->comp_handler = ib_cq_completion_workqueue;
 		INIT_WORK(&cq->work, ib_cq_poll_work);
+		if (use_dim)
+			cq->dim_used = rdma_dim_init(&cq->dim, cq);
+
+		if (cq->dim_used)
+			INIT_WORK(&cq->dim.work, ib_cq_rdma_dim_work);
+
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
 				ib_comp_wq : ib_comp_unbound_wq;
@@ -217,10 +285,14 @@  void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 		break;
 	case IB_POLL_SOFTIRQ:
 		irq_poll_disable(&cq->iop);
+		if (cq->dim_used)
+			cancel_work_sync(&cq->dim.work);
 		break;
 	case IB_POLL_WORKQUEUE:
 	case IB_POLL_UNBOUND_WORKQUEUE:
 		cancel_work_sync(&cq->work);
+		if (cq->dim_used)
+			cancel_work_sync(&cq->dim.work);
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
index e69c3c31e701..93cd25997b24 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -28,6 +28,7 @@  config MLX4_CORE
 	tristate
 	depends on PCI
 	select NET_DEVLINK
+	select DIMLIB
 	default n
 
 config MLX4_DEBUG
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 7845aa5bf6be..ef292fbb53c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -7,6 +7,7 @@  config MLX5_CORE
 	tristate "Mellanox 5th generation network adapters (ConnectX series) core driver"
 	depends on PCI
 	select NET_DEVLINK
+	select DIMLIB
 	imply PTP_1588_CLOCK
 	imply VXLAN
 	imply MLXFW
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0742095355f2..7b03fb3e4f0b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,7 @@ 
 #include <rdma/restrack.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
 #include <uapi/rdma/ib_user_ioctl_verbs.h>
+#include <linux/dim.h>
 
 #define IB_FW_VERSION_NAME_MAX	ETHTOOL_FWVERS_LEN
 
@@ -1638,6 +1639,8 @@  struct ib_cq {
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
 	struct rdma_restrack_entry res;
+	struct dim		dim;
+	bool			dim_used;
 };
 
 struct ib_srq {
@@ -3746,7 +3749,8 @@  static inline int ib_post_recv(struct ib_qp *qp,
 struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 				 int nr_cqe, int comp_vector,
 				 enum ib_poll_context poll_ctx,
-				 const char *caller, struct ib_udata *udata);
+				 const char *caller, struct ib_udata *udata,
+				 bool use_dim);
 
 /**
  * ib_alloc_cq_user: Allocate kernel/user CQ
@@ -3764,7 +3768,7 @@  static inline struct ib_cq *ib_alloc_cq_user(struct ib_device *dev,
 					     struct ib_udata *udata)
 {
 	return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
-				  KBUILD_MODNAME, udata);
+				  KBUILD_MODNAME, udata, false);
 }
 
 /**
@@ -3785,6 +3789,25 @@  static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 				NULL);
 }
 
+/**
+ * ib_alloc_cq_dim: Allocate kernel CQ with dynamic interrupt moderation
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @comp_vector: Completion vector used for the IRQs
+ * @poll_ctx: Context used for polling the CQ
+ *
+ * NOTE: for user cq use ib_alloc_cq_user with valid udata!
+ */
+static inline struct ib_cq *ib_alloc_cq_dim(struct ib_device *dev,
+					    void *private, int nr_cqe,
+					    int comp_vector,
+					    enum ib_poll_context poll_ctx)
+{
+	return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+				  KBUILD_MODNAME, NULL, true);
+}
+
 /**
  * ib_free_cq_user - Free kernel/user CQ
  * @cq: The CQ to free