diff mbox series

[for-next,V2,08/10] linux/dim: Implement rdma_dim

Message ID 20190625205701.17849-9-saeedm@mellanox.com
State Accepted
Delegated to: David Miller
Headers show
Series [for-next,V2,01/10] linux/dim: Move logic to dim.h | expand

Commit Message

Saeed Mahameed June 25, 2019, 8:57 p.m. UTC
From: Yamin Friedman <yaminf@mellanox.com>

rdma_dim implements a different algorithm than net_dim. It is based on
completions, which is how interrupt moderation can be implemented in RDMA.
The algorithm optimizes for the number of completions and for the ratio
between completions and events.
It also has a feature for fast reduction of the moderation level when the
traffic changes in such a way that high moderation is no longer required,
in order to avoid long latencies.

rdma_dim will be called from the ib_core module.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/dim.h |  36 ++++++++++++++
 lib/dim/Makefile    |   6 +--
 lib/dim/rdma_dim.c  | 112 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 150 insertions(+), 4 deletions(-)
 create mode 100644 lib/dim/rdma_dim.c

Comments

Sagi Grimberg June 25, 2019, 10:02 p.m. UTC | #1
> +void rdma_dim(struct dim *dim, u64 completions)
> +{
> +	struct dim_sample *curr_sample = &dim->measuring_sample;
> +	struct dim_stats curr_stats;
> +	u32 nevents;
> +
> +	dim_update_sample_with_comps(curr_sample->event_ctr + 1,
> +				     curr_sample->pkt_ctr,
> +				     curr_sample->byte_ctr,
> +				     curr_sample->comp_ctr + completions,
> +				     &dim->measuring_sample);

If this is the only caller, why add pkt_ctr and byte_ctr at all?
Or Gerlitz June 26, 2019, 11:57 a.m. UTC | #2
On Wed, Jun 26, 2019 at 1:03 AM Sagi Grimberg <sagi@grimberg.me> wrote:
>
> > +void rdma_dim(struct dim *dim, u64 completions)
> > +{
> > +     struct dim_sample *curr_sample = &dim->measuring_sample;
> > +     struct dim_stats curr_stats;
> > +     u32 nevents;
> > +
> > +     dim_update_sample_with_comps(curr_sample->event_ctr + 1,
> > +                                  curr_sample->pkt_ctr,
> > +                                  curr_sample->byte_ctr,
> > +                                  curr_sample->comp_ctr + completions,
> > +                                  &dim->measuring_sample);
>
> If this is the only caller, why add pkt_ctr and byte_ctr at all?

Hi Sagi,

Thanks for the fast review and feedback. Other than the default
per-ib/rdma-device setup for rdma dim / adaptive moderation, which Idan
commented on (let's discuss it there, please), the rest of the
comments seem fine, and Yamin will respond to / address them in the
coming days.

Or.
Yamin Friedman June 27, 2019, 5:25 a.m. UTC | #3
On 6/26/2019 1:02 AM, Sagi Grimberg wrote:
>
>> +void rdma_dim(struct dim *dim, u64 completions)
>> +{
>> +    struct dim_sample *curr_sample = &dim->measuring_sample;
>> +    struct dim_stats curr_stats;
>> +    u32 nevents;
>> +
>> +    dim_update_sample_with_comps(curr_sample->event_ctr + 1,
>> +                     curr_sample->pkt_ctr,
>> +                     curr_sample->byte_ctr,
>> +                     curr_sample->comp_ctr + completions,
>> +                     &dim->measuring_sample);
>
> If this is the only caller, why add pkt_ctr and byte_ctr at all?


We wanted to keep the API general enough that if someone wants to 
implement a different algorithm using the dim library they will be able 
to use all the possible statistics. I agree though that in the rdma_dim 
function there is no point in making it seem like they are relevant 
parameters.
diff mbox series

Patch

diff --git a/include/linux/dim.h b/include/linux/dim.h
index aa9bdd47a648..1ae32835723a 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -82,6 +82,7 @@  struct dim_stats {
  * @prev_stats: Measured rates from previous iteration (for comparison)
  * @start_sample: Sampled data at start of current iteration
  * @work: Work to perform on action required
+ * @dim_owner: A pointer to the struct that points to dim
  * @profile_ix: Current moderation profile
  * @mode: CQ period count mode
  * @tune_state: Algorithm tuning state (see below)
@@ -95,6 +96,7 @@  struct dim {
 	struct dim_sample start_sample;
 	struct dim_sample measuring_sample;
 	struct work_struct work;
+	void *dim_owner;
 	u8 profile_ix;
 	u8 mode;
 	u8 tune_state;
@@ -363,4 +365,38 @@  struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  */
 void net_dim(struct dim *dim, struct dim_sample end_sample);
 
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profiles: rdma_dim_prof[] must hold RDMA_DIM_PARAMS_NUM_PROFILES
+ * entries. NOTE(review): static table in a header; each includer gets a copy.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0}, /* presumably {usec, pkts, comps, cq_period_mode} - confirm */
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim() counts as one new event and adds @completions
+ * to the completion count of the sample being measured.
+ * Once DIM_NEVENTS events have been collected, the algorithm decides
+ * whether to switch to a new moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
+
 #endif /* DIM_H */
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
index 160afe288df0..1d6858a108cb 100644
--- a/lib/dim/Makefile
+++ b/lib/dim/Makefile
@@ -2,8 +2,6 @@ 
 # DIM Dynamic Interrupt Moderation library
 #
 
-obj-$(CONFIG_DIMLIB) = net_dim.o
+obj-$(CONFIG_DIMLIB) += dim.o
 
-net_dim-y = \
-	dim.o		\
-	net_dim.o
+dim-y := dim.o net_dim.o rdma_dim.o
diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c
new file mode 100644
index 000000000000..1bfe8f546a20
--- /dev/null
+++ b/lib/dim/rdma_dim.c
@@ -0,0 +1,112 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+static int rdma_dim_step(struct dim *dim)
+{ /* Move profile_ix one step in the tune_state direction, unless at an edge. */
+	if (dim->tune_state == DIM_GOING_RIGHT) {
+		if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1))
+			return DIM_ON_EDGE; /* already at the last profile */
+		dim->profile_ix++;
+		dim->steps_right++;
+	}
+	if (dim->tune_state == DIM_GOING_LEFT) {
+		if (dim->profile_ix == 0)
+			return DIM_ON_EDGE; /* already at the first profile */
+		dim->profile_ix--;
+		dim->steps_left++;
+	}
+
+	return DIM_STEPPED; /* also returned, unchanged, for any other tune_state */
+}
+
+static int rdma_dim_stats_compare(struct dim_stats *curr,
+				  struct dim_stats *prev)
+{ /* Classify curr against prev as DIM_STATS_BETTER, _WORSE or _SAME. */
+	/* first stat: prev->cpms is zero, so there is nothing to compare against */
+	if (!prev->cpms)
+		return DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms)) /* cpms is checked first */
+		return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio)) /* then cpe_ratio */
+		return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	return DIM_STATS_SAME; /* neither metric differs significantly */
+}
+
+static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{ /* Pick the next profile from curr_stats vs. dim->prev_stats. */
+	int prev_ix = dim->profile_ix; /* kept to detect a profile change */
+	u8 state = dim->tune_state;
+	int stats_res;
+	int step_res;
+
+	if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) { /* no stepping while parked */
+		stats_res = rdma_dim_stats_compare(curr_stats,
+						   &dim->prev_stats);
+
+		switch (stats_res) {
+		case DIM_STATS_SAME:
+			if (curr_stats->cpe_ratio <= 50 * prev_ix) /* threshold scales with the profile index */
+				dim->profile_ix = 0; /* fast reduction: jump to the lowest profile */
+			break;
+		case DIM_STATS_WORSE:
+			dim_turn(dim); /* defined elsewhere; presumably reverses tune_state - confirm */
+			/* fall through */
+		case DIM_STATS_BETTER:
+			step_res = rdma_dim_step(dim);
+			if (step_res == DIM_ON_EDGE) /* could not step past the first/last profile */
+				dim_turn(dim);
+			break;
+		}
+	}
+
+	dim->prev_stats = *curr_stats; /* recorded even when parked or not stepped */
+
+	return dim->profile_ix != prev_ix; /* true iff the profile index changed */
+}
+
+void rdma_dim(struct dim *dim, u64 completions)
+{ /* Account one event with @completions, then advance the dim->state machine. */
+	struct dim_sample *curr_sample = &dim->measuring_sample;
+	struct dim_stats curr_stats;
+	u32 nevents; /* events since start_sample; NOTE(review): plain subtraction - verify event_ctr wraparound */
+
+	dim_update_sample_with_comps(curr_sample->event_ctr + 1,
+				     curr_sample->pkt_ctr, /* passed through unchanged */
+				     curr_sample->byte_ctr, /* passed through unchanged */
+				     curr_sample->comp_ctr + completions,
+				     &dim->measuring_sample);
+
+	switch (dim->state) {
+	case DIM_MEASURE_IN_PROGRESS:
+		nevents = curr_sample->event_ctr - dim->start_sample.event_ctr;
+		if (nevents < DIM_NEVENTS) /* not enough events yet; keep measuring */
+			break;
+		dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats);
+		if (rdma_dim_decision(&curr_stats, dim)) { /* profile index changed */
+			dim->state = DIM_APPLY_NEW_PROFILE;
+			schedule_work(&dim->work); /* the work handler is not set up in this file */
+			break;
+		}
+		/* fall through */
+	case DIM_START_MEASURE: /* (re)start: the current sample becomes the baseline */
+		dim->state = DIM_MEASURE_IN_PROGRESS;
+		dim_update_sample_with_comps(curr_sample->event_ctr,
+					     curr_sample->pkt_ctr,
+					     curr_sample->byte_ctr,
+					     curr_sample->comp_ctr,
+					     &dim->start_sample);
+		break;
+	case DIM_APPLY_NEW_PROFILE: /* only the sample update above happens; nothing here leaves this state */
+		break;
+	}
+}
+EXPORT_SYMBOL(rdma_dim);