[for-next,V2,10/10] RDMA/core: Provide RDMA DIM support for ULPs

Message ID 20190625205701.17849-11-saeedm@mellanox.com
State Accepted
Delegated to: David Miller
Series [for-next,V2,01/10] linux/dim: Move logic to dim.h

Commit Message

Saeed Mahameed June 25, 2019, 8:57 p.m. UTC
From: Yamin Friedman <yaminf@mellanox.com>

Add an interface in the InfiniBand core driver that applies rdma_dim
adaptive moderation. CQ allocation now has a dedicated setup path that
attaches rdma_dim to the ib_cq for devices that opt in.
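
From the ULP's point of view the CQ allocation call itself does not
change; a minimal sketch (dev, private, nr_cqe and comp_vector are
placeholders for the ULP's own values):

	struct ib_cq *cq;

	/* DIM is set up inside CQ allocation when the device has use_cq_dim
	 * set and the polling context is not IB_POLL_DIRECT; the ULP simply
	 * allocates the CQ as before.
	 */
	cq = ib_alloc_cq(dev, private, nr_cqe, comp_vector, IB_POLL_SOFTIRQ);
	if (IS_ERR(cq))
		return PTR_ERR(cq);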

Performance improvement (ConnectX-5 100GbE, x86) running FIO benchmark over
NVMf between two equal end-hosts with 56 cores across a Mellanox switch
using null_blk device:

IO READS without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.8GiB/s | 7.7M | 1401  usec               | 2442  usec
4k       | 7.0GiB/s | 1.8M | 4817  usec               | 6587  usec
64k      | 10.7GiB/s| 175k | 9896  usec               | 10028 usec

IO WRITES without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.6GiB/s | 7.5M | 1434  usec               | 2474  usec
4k       | 6.3GiB/s | 1.6M | 938   usec               | 1221  usec
64k      | 10.7GiB/s| 175k | 8979  usec               | 12780 usec

IO READS with DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 4GiB/s   | 8.2M | 816    usec              | 889   usec
4k       | 10.1GiB/s| 2.65M| 3359   usec              | 5080  usec
64k      | 10.7GiB/s| 175k | 9896   usec              | 10028 usec

IO WRITES with DIM:
blk size | BW       | IOPS  | 99th percentile latency | 99.99th latency
512B     | 3.9GiB/s | 8.1M  | 799   usec              | 922   usec
4k       | 9.6GiB/s | 2.5M  | 717   usec              | 1004  usec
64k      | 10.7GiB/s| 176k  | 8586  usec              | 12256 usec

The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.
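
The algorithm only consumes the number of completions each polling round
processed and how that completion rate changes from round to round, which
is what keeps it protocol-agnostic. A simplified, self-contained sketch of
the decision loop (illustrative profile values and names; not the kernel's
rdma_dim implementation):

	#include <stdbool.h>
	#include <stdint.h>

	/* Illustrative moderation profiles (completions to coalesce, usecs
	 * to wait); the kernel's rdma_dim_prof table uses different values.
	 */
	struct moder { uint16_t comps; uint16_t usec; };
	static const struct moder profiles[] = {
		{ 1, 0 }, { 2, 2 }, { 8, 8 }, { 32, 16 }, { 64, 32 },
	};

	struct toy_dim {
		unsigned int ix;	/* current profile index */
		bool going_right;	/* direction of the previous step */
		uint64_t prev_rate;	/* completions/sec in the previous round */
	};

	/* Feed one polling round ('completed' events over 'elapsed_ns') and
	 * step the profile index; returns true when the CQ should be
	 * reprogrammed with profiles[d->ix].comps / profiles[d->ix].usec.
	 */
	static bool toy_dim_round(struct toy_dim *d, int completed,
				  uint64_t elapsed_ns)
	{
		uint64_t rate = elapsed_ns ?
			completed * 1000000000ULL / elapsed_ns : 0;
		unsigned int old = d->ix;

		/* Keep moving in the same direction while the completion
		 * rate improves, otherwise turn around; clamp at the edges.
		 */
		if (rate < d->prev_rate)
			d->going_right = !d->going_right;
		if (d->going_right &&
		    d->ix + 1 < sizeof(profiles) / sizeof(profiles[0]))
			d->ix++;
		else if (!d->going_right && d->ix > 0)
			d->ix--;

		d->prev_rate = rate;
		return d->ix != old;
	}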

rdma_dim is configured to be the default option based on performance
improvement seen after extensive tests.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/core/cq.c      | 71 ++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx5/main.c |  2 +
 2 files changed, 71 insertions(+), 2 deletions(-)

Comments

Sagi Grimberg June 25, 2019, 9:14 p.m. UTC | #1
> +static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
> +{
> +	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
> +	struct dim *dim = cq->dim;
> +	int completed;
> +
> +	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
> +	if (completed < budget) {
> +		irq_poll_complete(&cq->iop);
> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +			irq_poll_sched(&cq->iop);
> +	}
> +
> +	rdma_dim(dim, completed);

Why duplicate the entire thing for a one-liner?

> +
> +	return completed;
> +}
> +
>   static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
>   {
>   	irq_poll_sched(&cq->iop);
> @@ -105,14 +157,18 @@ static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
>   
>   static void ib_cq_poll_work(struct work_struct *work)
>   {
> -	struct ib_cq *cq = container_of(work, struct ib_cq, work);
> +	struct ib_cq *cq = container_of(work, struct ib_cq,
> +					work);

Why was that changed?

>   	int completed;
>   
>   	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
>   				    IB_POLL_BATCH);
> +

newline?

>   	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>   	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>   		queue_work(cq->comp_wq, &cq->work);
> +	else if (cq->dim)
> +		rdma_dim(cq->dim, completed);
>   }
>   
>   static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> @@ -166,6 +222,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
>   	rdma_restrack_set_task(&cq->res, caller);
>   	rdma_restrack_kadd(&cq->res);
>   
> +	rdma_dim_init(cq);
> +
>   	switch (cq->poll_ctx) {
>   	case IB_POLL_DIRECT:
>   		cq->comp_handler = ib_cq_completion_direct;
> @@ -173,7 +231,13 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
>   	case IB_POLL_SOFTIRQ:
>   		cq->comp_handler = ib_cq_completion_softirq;
>   
> -		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
> +		if (cq->dim) {
> +			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
> +				      ib_poll_dim_handler);
> +		} else
> +			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
> +				      ib_poll_handler);
> +
>   		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
>   		break;
>   	case IB_POLL_WORKQUEUE:
> @@ -226,6 +290,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
>   		WARN_ON_ONCE(1);
>   	}
>   
> +	if (cq->dim)
> +		cancel_work_sync(&cq->dim->work);
> +	kfree(cq->dim);
>   	kfree(cq->wc);
>   	rdma_restrack_del(&cq->res);
>   	ret = cq->device->ops.destroy_cq(cq, udata);
> diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
> index abac70ad5c7c..b1b45dbe24a5 100644
> --- a/drivers/infiniband/hw/mlx5/main.c
> +++ b/drivers/infiniband/hw/mlx5/main.c
> @@ -6305,6 +6305,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
>   	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
>   		mutex_init(&dev->lb.mutex);
>   
> +	dev->ib_dev.use_cq_dim = true;
> +

Please don't. This is a bad choice to opt it in by default.
Idan Burstein June 26, 2019, 7:56 a.m. UTC | #2
" Please don't. This is a bad choice to opt it in by default."

I disagree here. I'd prefer Linux to have a good out-of-the-box experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the default parameters, especially since Yamin has shown it is beneficial / not hurting in terms of performance for a variety of use cases. The whole concept of DIM is that it adapts to the workload requirements in terms of bandwidth and latency.

Moreover, net-dim is enabled by default, I don't see why RDMA is different.


Yamin Friedman June 27, 2019, 5:28 a.m. UTC | #3
On 6/26/2019 12:14 AM, Sagi Grimberg wrote:
>
>
>> +static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
>> +{
>> +    struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
>> +    struct dim *dim = cq->dim;
>> +    int completed;
>> +
>> +    completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>> +    if (completed < budget) {
>> +        irq_poll_complete(&cq->iop);
>> +        if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>> +            irq_poll_sched(&cq->iop);
>> +    }
>> +
>> +    rdma_dim(dim, completed);
>
> Why duplicate the entire thing for a one-liner?
You are right, this was leftover from a previous version where there 
were more significant changes. I will remove the extra function.
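For reference, one possible way to fold the one-liner into the existing
handler (a sketch of the intended change, not the code in this patch):

	static int ib_poll_handler(struct irq_poll *iop, int budget)
	{
		struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
		struct dim *dim = cq->dim;
		int completed;

		completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
		if (completed < budget) {
			irq_poll_complete(&cq->iop);
			if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
				irq_poll_sched(&cq->iop);
		}

		/* cq->dim is NULL when DIM is not enabled for this CQ, so
		 * this is a no-op for the existing users.
		 */
		if (dim)
			rdma_dim(dim, completed);

		return completed;
	}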
>
>> +
>> +    return completed;
>> +}
>> +
>>   static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
>>   {
>>       irq_poll_sched(&cq->iop);
>> @@ -105,14 +157,18 @@ static void ib_cq_completion_softirq(struct 
>> ib_cq *cq, void *private)
>>     static void ib_cq_poll_work(struct work_struct *work)
>>   {
>> -    struct ib_cq *cq = container_of(work, struct ib_cq, work);
>> +    struct ib_cq *cq = container_of(work, struct ib_cq,
>> +                    work);
>
> Why was that changed?

I will fix this.

>
>>       int completed;
>>         completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, 
>> cq->wc,
>>                       IB_POLL_BATCH);
>> +
>
> newline?

Same as above.

>
>>       if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>>           ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>>           queue_work(cq->comp_wq, &cq->work);
>> +    else if (cq->dim)
>> +        rdma_dim(cq->dim, completed);
>>   }
>>     static void ib_cq_completion_workqueue(struct ib_cq *cq, void 
>> *private)
>> @@ -166,6 +222,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device 
>> *dev, void *private,
>>       rdma_restrack_set_task(&cq->res, caller);
>>       rdma_restrack_kadd(&cq->res);
>>   +    rdma_dim_init(cq);
>> +
>>       switch (cq->poll_ctx) {
>>       case IB_POLL_DIRECT:
>>           cq->comp_handler = ib_cq_completion_direct;
>> @@ -173,7 +231,13 @@ struct ib_cq *__ib_alloc_cq_user(struct 
>> ib_device *dev, void *private,
>>       case IB_POLL_SOFTIRQ:
>>           cq->comp_handler = ib_cq_completion_softirq;
>>   -        irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
>> +        if (cq->dim) {
>> +            irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
>> +                      ib_poll_dim_handler);
>> +        } else
>> +            irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
>> +                      ib_poll_handler);
>> +
>>           ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
>>           break;
>>       case IB_POLL_WORKQUEUE:
>> @@ -226,6 +290,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct 
>> ib_udata *udata)
>>           WARN_ON_ONCE(1);
>>       }
>>   +    if (cq->dim)
>> +        cancel_work_sync(&cq->dim->work);
>> +    kfree(cq->dim);
>>       kfree(cq->wc);
>>       rdma_restrack_del(&cq->res);
>>       ret = cq->device->ops.destroy_cq(cq, udata);
>> diff --git a/drivers/infiniband/hw/mlx5/main.c 
>> b/drivers/infiniband/hw/mlx5/main.c
>> index abac70ad5c7c..b1b45dbe24a5 100644
>> --- a/drivers/infiniband/hw/mlx5/main.c
>> +++ b/drivers/infiniband/hw/mlx5/main.c
>> @@ -6305,6 +6305,8 @@ static int mlx5_ib_stage_caps_init(struct 
>> mlx5_ib_dev *dev)
>>            MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
>>           mutex_init(&dev->lb.mutex);
>>   +    dev->ib_dev.use_cq_dim = true;
>> +
>
> Please don't. This is a bad choice to opt it in by default.
Sagi Grimberg July 2, 2019, 5:36 a.m. UTC | #4
Hey Idan,

> " Please don't. This is a bad choice to opt it in by default."
> 
> I disagree here. I'd prefer Linux to have a good out-of-the-box experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the default parameters, especially since Yamin has shown it is beneficial / not hurting in terms of performance for a variety of use cases. The whole concept of DIM is that it adapts to the workload requirements in terms of bandwidth and latency.

Well, it's a Mellanox device driver after all.

But do note that by far, the vast majority of users are not saturating
100G of 4K I/O. The absolute vast majority of users are primarily
sensitive to synchronous QD=1 I/O latency, and their workloads are much
more dynamic than the synthetic 100%/50%/0% read mix.

As much as I'm a fan (IIRC I was the one giving a first pass at this),
the dim default opt-in is not only not beneficial, but potentially
harmful to the majority of users' out-of-the-box experience.

Given that this is fresh code with almost no exposure, and that it was
not tested outside of Yamin running limited performance testing, I think
it would be a mistake to add it as a default opt-in; that can come as an
incremental stage.

Obviously, I cannot tell what Mellanox should/shouldn't do in its own
device driver of course, but I just wanted to emphasize that I think
this is a mistake.

> Moreover, net-dim is enabled by default, I don't see why RDMA is different.

Very different animals.
Leon Romanovsky July 2, 2019, 6:41 a.m. UTC | #5
On Mon, Jul 01, 2019 at 10:36:41PM -0700, Sagi Grimberg wrote:
> Hey Idan,
>
> > " Please don't. This is a bad choice to opt it in by default."
> >
> > I disagree here. I'd prefer Linux to have a good out-of-the-box experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the default parameters, especially since Yamin has shown it is beneficial / not hurting in terms of performance for a variety of use cases. The whole concept of DIM is that it adapts to the workload requirements in terms of bandwidth and latency.
>
> Well, it's a Mellanox device driver after all.
>
> But do note that by far, the vast majority of users are not saturating
> 100G of 4K I/O. The absolute vast majority of users are primarily
> sensitive to synchronous QD=1 I/O latency, and their workloads are much
> more dynamic than the synthetic 100%/50%/0% read mix.
>
> As much as I'm a fan (IIRC I was the one giving a first pass at this),
> the dim default opt-in is not only not beneficial, but potentially
> harmful to the majority of users' out-of-the-box experience.
>
> Given that this is fresh code with almost no exposure, and that it was
> not tested outside of Yamin running limited performance testing, I think
> it would be a mistake to add it as a default opt-in; that can come as an
> incremental stage.
>
> Obviously, I cannot tell what Mellanox should/shouldn't do in its own
> device driver of course, but I just wanted to emphasize that I think
> this is a mistake.

Hi Sagi,

I'm not sharing your worries about bad out-of-the-box experience for a
number of reasons.

First of all, this code is part of the upstream kernel and it will take
time till users actually start to use it as is and not as part of some
distro backports or MOFED packages.

Second, Yamin did extensive testing and worked very closely with Or G.,
and I have very high confidence in the results of their team work.

Third (an outcome of the first), actually the opposite is true: setting
this option as the default will give us more time to fix/adjust the code
if needed, before users see any potential degradation.

>
> > Moreover, net-dim is enabled by default, I don't see why RDMA is different.
>
> Very different animals.

Yes and no, the logic behind them is the same and both solutions have
the same throughput vs. latency constraints.

Thanks
Sagi Grimberg July 3, 2019, 6:56 p.m. UTC | #6
> Hi Sagi,
> 
> I'm not sharing your worries about bad out-of-the-box experience for a
> number of reasons.
> 
> First of all, this code is part of the upstream kernel and it will take
> time till users actually start to use it as is and not as part of some
> distro backports or MOFED packages.

True, but I am still saying that this feature is damaging sync IO, which
represents the majority of users. It might not be an extreme impact but
it is still a degradation (from very limited testing I did this morning
I'm seeing a consistent 5%-10% latency increase for low QD workloads,
which is consistent with what Yamin reported AFAIR).

But having said that, the call is for you guys to make as this is a
Mellanox device. I absolutely think that this is useful (as I said
before), I just don't think it's necessarily a good idea to opt it in by
default given that only a limited set of users would take full advantage
of it while the rest would see a negative impact (even if it's 10%).

I don't have a hard objection here, I just wanted to give you my
opinion on this because mlx5 is an important driver for rdma
users.

> Second, Yamin did extensive testing and worked very closely with Or G.,
> and I have very high confidence in the results of their team work.

Has anyone tested other RDMA ulps? NFS/RDMA or SRP/iSER?

Would be interesting to understand how other subsystems with different
characteristics behave with this.
Leon Romanovsky July 4, 2019, 7:51 a.m. UTC | #7
On Wed, Jul 03, 2019 at 11:56:04AM -0700, Sagi Grimberg wrote:
>
> > Hi Sagi,
> >
> > I'm not sharing your worries about bad out-of-the-box experience for a
> > number of reasons.
> >
> > First of all, this code is part of the upstream kernel and it will take
> > time till users actually start to use it as is and not as part of some
> > distro backports or MOFED packages.
>
> True, but I am still saying that this feature is damaging sync IO, which
> represents the majority of users. It might not be an extreme impact but
> it is still a degradation (from very limited testing I did this morning
> I'm seeing a consistent 5%-10% latency increase for low QD workloads,
> which is consistent with what Yamin reported AFAIR).
>
> But having said that, the call is for you guys to make as this is a
> Mellanox device. I absolutely think that this is useful (as I said
> before), I just don't think it's necessarily a good idea to opt it in by
> default given that only a limited set of users would take full advantage
> of it while the rest would see a negative impact (even if it's 10%).
>
> I don't have a hard objection here, I just wanted to give you my
> opinion on this because mlx5 is an important driver for rdma
> users.

Your opinion is very valuable to us and we have started an internal
thread to challenge this "enable by default"; it just takes time, and I
prefer to enable this code to get test coverage as wide as possible.

>
> > Second, Yamin did extensive testing and worked very closely with Or G.,
> > and I have very high confidence in the results of their team work.
>
> Has anyone tested other RDMA ulps? NFS/RDMA or SRP/iSER?
>
> Would be interesting to understand how other subsystems with different
> characteristics behave with this.

Me too, and I'll revert this default if needed.

Thanks
Idan Burstein July 4, 2019, 12:30 p.m. UTC | #8
The essence of the dynamic in DIM is that it adapts to the workload running on the cores, so that users do not have to trade bandwidth/cpu% against latency with a module parameter they don't know how to configure. If DIM consistently hurts latency of latency-critical workloads we should debug and fix it.

This is where we should go: the end goal is no configuration, with out-of-the-box performance in terms of both bandwidth/cpu% and latency.

We could take several steps in this direction if we are not mature enough today, but let's define them (e.g. tests on different ULPs).


Patch

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a4c81992267c..d8a8c466d897 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -26,6 +26,40 @@ 
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+	struct dim *dim = container_of(w, struct dim, work);
+	struct ib_cq *cq = (struct ib_cq *)dim->dim_owner;
+
+	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+	u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+	dim->state = DIM_START_MEASURE;
+
+	cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+	struct dim *dim;
+
+	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+	    cq->poll_ctx == IB_POLL_DIRECT)
+		return;
+
+	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+	if (!dim)
+		return;
+
+	dim->state = DIM_START_MEASURE;
+	dim->tune_state = DIM_GOING_RIGHT;
+	dim->profile_ix = RDMA_DIM_START_PROFILE;
+	dim->dim_owner = cq;
+	cq->dim = dim;
+
+	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 			   int batch)
 {
@@ -98,6 +132,24 @@  static int ib_poll_handler(struct irq_poll *iop, int budget)
 	return completed;
 }
 
+static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
+{
+	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	struct dim *dim = cq->dim;
+	int completed;
+
+	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+	if (completed < budget) {
+		irq_poll_complete(&cq->iop);
+		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+			irq_poll_sched(&cq->iop);
+	}
+
+	rdma_dim(dim, completed);
+
+	return completed;
+}
+
 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 {
 	irq_poll_sched(&cq->iop);
@@ -105,14 +157,18 @@  static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 
 static void ib_cq_poll_work(struct work_struct *work)
 {
-	struct ib_cq *cq = container_of(work, struct ib_cq, work);
+	struct ib_cq *cq = container_of(work, struct ib_cq,
+					work);
 	int completed;
 
 	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
 				    IB_POLL_BATCH);
+
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
 		queue_work(cq->comp_wq, &cq->work);
+	else if (cq->dim)
+		rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -166,6 +222,8 @@  struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	rdma_restrack_set_task(&cq->res, caller);
 	rdma_restrack_kadd(&cq->res);
 
+	rdma_dim_init(cq);
+
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
 		cq->comp_handler = ib_cq_completion_direct;
@@ -173,7 +231,13 @@  struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	case IB_POLL_SOFTIRQ:
 		cq->comp_handler = ib_cq_completion_softirq;
 
-		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+		if (cq->dim) {
+			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+				      ib_poll_dim_handler);
+		} else
+			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+				      ib_poll_handler);
+
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		break;
 	case IB_POLL_WORKQUEUE:
@@ -226,6 +290,9 @@  void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 		WARN_ON_ONCE(1);
 	}
 
+	if (cq->dim)
+		cancel_work_sync(&cq->dim->work);
+	kfree(cq->dim);
 	kfree(cq->wc);
 	rdma_restrack_del(&cq->res);
 	ret = cq->device->ops.destroy_cq(cq, udata);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abac70ad5c7c..b1b45dbe24a5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6305,6 +6305,8 @@  static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
 		mutex_init(&dev->lb.mutex);
 
+	dev->ib_dev.use_cq_dim = true;
+
 	return 0;
 }