[v2] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
diff mbox series

Message ID 20190728163027.3637.70740.stgit@manet.1015granger.net
State New
Headers show
Series
  • [v2] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
Related show

Commit Message

Chuck Lever July 28, 2019, 4:30 p.m. UTC
Send and Receive completion is handled on a single CPU selected at
the time each Completion Queue is allocated. Typically this is when
an initiator instantiates an RDMA transport, or when a target
accepts an RDMA connection.

Some ULPs cannot open a connection per CPU to spread completion
workload across available CPUs and MSI vectors. For such ULPs,
provide an API that allows the RDMA core to select a completion
vector based on the device's complement of available comp_vecs.

ULPs that invoke ib_alloc_cq() with only comp_vector 0 are converted
to use the new API so that their completion workloads interfere less
with each other.

Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: <linux-cifs@vger.kernel.org>
Cc: <v9fs-developer@lists.sourceforge.net>
---
 drivers/infiniband/core/cq.c             |   29 +++++++++++++++++++++++++++++
 drivers/infiniband/ulp/srpt/ib_srpt.c    |    4 ++--
 fs/cifs/smbdirect.c                      |   10 ++++++----
 include/rdma/ib_verbs.h                  |   19 +++++++++++++++++++
 net/9p/trans_rdma.c                      |    6 +++---
 net/sunrpc/xprtrdma/svc_rdma_transport.c |    8 ++++----
 net/sunrpc/xprtrdma/verbs.c              |   13 ++++++-------
 7 files changed, 69 insertions(+), 20 deletions(-)

Comments

Leon Romanovsky July 29, 2019, 5:49 a.m. UTC | #1
On Sun, Jul 28, 2019 at 12:30:27PM -0400, Chuck Lever wrote:
> Send and Receive completion is handled on a single CPU selected at
> the time each Completion Queue is allocated. Typically this is when
> an initiator instantiates an RDMA transport, or when a target
> accepts an RDMA connection.
>
> Some ULPs cannot open a connection per CPU to spread completion
> workload across available CPUs and MSI vectors. For such ULPs,
> provide an API that allows the RDMA core to select a completion
> vector based on the device's complement of available comp_vecs.
>
> ULPs that invoke ib_alloc_cq() with only comp_vector 0 are converted
> to use the new API so that their completion workloads interfere less
> with each other.
>
> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> Cc: <linux-cifs@vger.kernel.org>
> Cc: <v9fs-developer@lists.sourceforge.net>
> ---
>  drivers/infiniband/core/cq.c             |   29 +++++++++++++++++++++++++++++
>  drivers/infiniband/ulp/srpt/ib_srpt.c    |    4 ++--
>  fs/cifs/smbdirect.c                      |   10 ++++++----
>  include/rdma/ib_verbs.h                  |   19 +++++++++++++++++++
>  net/9p/trans_rdma.c                      |    6 +++---
>  net/sunrpc/xprtrdma/svc_rdma_transport.c |    8 ++++----
>  net/sunrpc/xprtrdma/verbs.c              |   13 ++++++-------
>  7 files changed, 69 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> index 7c59987..ea3bb0d 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -253,6 +253,35 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
>  EXPORT_SYMBOL(__ib_alloc_cq_user);
>
>  /**
> + * __ib_alloc_cq_any - allocate a completion queue
> + * @dev:		device to allocate the CQ for
> + * @private:		driver private data, accessible from cq->cq_context
> + * @nr_cqe:		number of CQEs to allocate
> + * @poll_ctx:		context to poll the CQ from.
> + * @caller:		module owner name.
> + *
> + * Attempt to spread ULP Completion Queues over each device's interrupt
> + * vectors.
> + */
> +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
> +				int nr_cqe, enum ib_poll_context poll_ctx,
> +				const char *caller)
> +{
> +	static atomic_t counter;
> +	int comp_vector;

int comp_vector = 0;

> +
> +	comp_vector = 0;

This assignment is better to be part of initialization.

> +	if (dev->num_comp_vectors > 1)
> +		comp_vector =
> +			atomic_inc_return(&counter) %

Don't we need to manage a "free list" of comp_vectors? Otherwise we can find
ourselves providing an already "taken" comp_vector.

Just as an example:
dev->num_comp_vectors = 2
num_online_cpus = 4

The following combination will give us the same comp_vector:
1. ib_alloc_cq_any()
2. ib_alloc_cq_any()
3. ib_free_cq()
4. goto 2

> +			min_t(int, dev->num_comp_vectors, num_online_cpus());
> +
> +	return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
> +				  caller, NULL);
> +}
> +EXPORT_SYMBOL(__ib_alloc_cq_any);
> +
> +/**
>   * ib_free_cq_user - free a completion queue
>   * @cq:		completion queue to free.
>   * @udata:	User data or NULL for kernel object
> diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
> index 1a039f1..e25c70a 100644
> --- a/drivers/infiniband/ulp/srpt/ib_srpt.c
> +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
> @@ -1767,8 +1767,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
>  		goto out;
>
>  retry:
> -	ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size,
> -			0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
> +	ch->cq = ib_alloc_cq_any(sdev->device, ch, ch->rq_size + sq_size,
> +				 IB_POLL_WORKQUEUE);
>  	if (IS_ERR(ch->cq)) {
>  		ret = PTR_ERR(ch->cq);
>  		pr_err("failed to create CQ cqe= %d ret= %d\n",
> diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
> index cd07e53..3c91fa9 100644
> --- a/fs/cifs/smbdirect.c
> +++ b/fs/cifs/smbdirect.c
> @@ -1654,15 +1654,17 @@ static struct smbd_connection *_smbd_get_connection(
>
>  	info->send_cq = NULL;
>  	info->recv_cq = NULL;
> -	info->send_cq = ib_alloc_cq(info->id->device, info,
> -			info->send_credit_target, 0, IB_POLL_SOFTIRQ);
> +	info->send_cq =
> +		ib_alloc_cq_any(info->id->device, info,
> +				info->send_credit_target, IB_POLL_SOFTIRQ);
>  	if (IS_ERR(info->send_cq)) {
>  		info->send_cq = NULL;
>  		goto alloc_cq_failed;
>  	}
>
> -	info->recv_cq = ib_alloc_cq(info->id->device, info,
> -			info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
> +	info->recv_cq =
> +		ib_alloc_cq_any(info->id->device, info,
> +				info->receive_credit_max, IB_POLL_SOFTIRQ);
>  	if (IS_ERR(info->recv_cq)) {
>  		info->recv_cq = NULL;
>  		goto alloc_cq_failed;
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index c5f8a9f..2a1523cc 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -3711,6 +3711,25 @@ static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
>  				NULL);
>  }
>
> +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
> +				int nr_cqe, enum ib_poll_context poll_ctx,
> +				const char *caller);
> +
> +/**
> + * ib_alloc_cq_any: Allocate kernel CQ
> + * @dev: The IB device
> + * @private: Private data attached to the CQE
> + * @nr_cqe: Number of CQEs in the CQ
> + * @poll_ctx: Context used for polling the CQ
> + */
> +static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
> +					    void *private, int nr_cqe,
> +					    enum ib_poll_context poll_ctx)
> +{
> +	return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx,
> +				 KBUILD_MODNAME);
> +}

This should be a macro and not an inline function, because the compiler can be
instructed not to inline functions (there is a kconfig option for that),
which would cause the wrong KBUILD_MODNAME to be inserted (ib_core
instead of the ulp's).

And yes, commit c4367a26357b ("IB: Pass uverbs_attr_bundle down ib_x
destroy path") did it completely wrong.

> +
>  /**
>   * ib_free_cq_user - Free kernel/user CQ
>   * @cq: The CQ to free
> diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
> index bac8dad..b21c3c2 100644
> --- a/net/9p/trans_rdma.c
> +++ b/net/9p/trans_rdma.c
> @@ -685,9 +685,9 @@ static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
>  		goto error;
>
>  	/* Create the Completion Queue */
> -	rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
> -			opts.sq_depth + opts.rq_depth + 1,
> -			0, IB_POLL_SOFTIRQ);
> +	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
> +				   opts.sq_depth + opts.rq_depth + 1,
> +				   IB_POLL_SOFTIRQ);
>  	if (IS_ERR(rdma->cq))
>  		goto error;
>
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 3fe6651..4d3db6e 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -454,14 +454,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  		dprintk("svcrdma: error creating PD for connect request\n");
>  		goto errout;
>  	}
> -	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
> -					0, IB_POLL_WORKQUEUE);
> +	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
> +					    IB_POLL_WORKQUEUE);
>  	if (IS_ERR(newxprt->sc_sq_cq)) {
>  		dprintk("svcrdma: error creating SQ CQ for connect request\n");
>  		goto errout;
>  	}
> -	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth,
> -					0, IB_POLL_WORKQUEUE);
> +	newxprt->sc_rq_cq =
> +		ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
>  	if (IS_ERR(newxprt->sc_rq_cq)) {
>  		dprintk("svcrdma: error creating RQ CQ for connect request\n");
>  		goto errout;
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 805b1f35..b10aa16 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -521,18 +521,17 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
>  	init_waitqueue_head(&ep->rep_connect_wait);
>  	ep->rep_receive_count = 0;
>
> -	sendcq = ib_alloc_cq(ia->ri_id->device, NULL,
> -			     ep->rep_attr.cap.max_send_wr + 1,
> -			     ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0,
> -			     IB_POLL_WORKQUEUE);
> +	sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
> +				 ep->rep_attr.cap.max_send_wr + 1,
> +				 IB_POLL_WORKQUEUE);
>  	if (IS_ERR(sendcq)) {
>  		rc = PTR_ERR(sendcq);
>  		goto out1;
>  	}
>
> -	recvcq = ib_alloc_cq(ia->ri_id->device, NULL,
> -			     ep->rep_attr.cap.max_recv_wr + 1,
> -			     0, IB_POLL_WORKQUEUE);
> +	recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
> +				 ep->rep_attr.cap.max_recv_wr + 1,
> +				 IB_POLL_WORKQUEUE);
>  	if (IS_ERR(recvcq)) {
>  		rc = PTR_ERR(recvcq);
>  		goto out2;
>
Jason Gunthorpe July 29, 2019, 2:19 p.m. UTC | #2
On Mon, Jul 29, 2019 at 08:49:33AM +0300, Leon Romanovsky wrote:
> > +/**
> > + * ib_alloc_cq_any: Allocate kernel CQ
> > + * @dev: The IB device
> > + * @private: Private data attached to the CQE
> > + * @nr_cqe: Number of CQEs in the CQ
> > + * @poll_ctx: Context used for polling the CQ
> > + */
> > +static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
> > +					    void *private, int nr_cqe,
> > +					    enum ib_poll_context poll_ctx)
> > +{
> > +	return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx,
> > +				 KBUILD_MODNAME);
> > +}
> 
> This should be macro and not inline function, because compiler can be
> instructed do not inline functions (there is kconfig option for that)
> and it will cause that wrong KBUILD_MODNAME will be inserted (ib_core
> instead of ulp).

No, it can't, a static inline can only be de-inlined within the same
compilation unit, so KBUILD_MODNAME can never be mixed up.

You only need to use macros if the value changes within the
compilation unit.

Jason
Chuck Lever July 29, 2019, 2:24 p.m. UTC | #3
> On Jul 29, 2019, at 1:49 AM, Leon Romanovsky <leon@kernel.org> wrote:
> 
> On Sun, Jul 28, 2019 at 12:30:27PM -0400, Chuck Lever wrote:
>> Send and Receive completion is handled on a single CPU selected at
>> the time each Completion Queue is allocated. Typically this is when
>> an initiator instantiates an RDMA transport, or when a target
>> accepts an RDMA connection.
>> 
>> Some ULPs cannot open a connection per CPU to spread completion
>> workload across available CPUs and MSI vectors. For such ULPs,
>> provide an API that allows the RDMA core to select a completion
>> vector based on the device's complement of available comp_vecs.
>> 
>> ULPs that invoke ib_alloc_cq() with only comp_vector 0 are converted
>> to use the new API so that their completion workloads interfere less
>> with each other.
>> 
>> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> Cc: <linux-cifs@vger.kernel.org>
>> Cc: <v9fs-developer@lists.sourceforge.net>
>> ---
>> drivers/infiniband/core/cq.c             |   29 +++++++++++++++++++++++++++++
>> drivers/infiniband/ulp/srpt/ib_srpt.c    |    4 ++--
>> fs/cifs/smbdirect.c                      |   10 ++++++----
>> include/rdma/ib_verbs.h                  |   19 +++++++++++++++++++
>> net/9p/trans_rdma.c                      |    6 +++---
>> net/sunrpc/xprtrdma/svc_rdma_transport.c |    8 ++++----
>> net/sunrpc/xprtrdma/verbs.c              |   13 ++++++-------
>> 7 files changed, 69 insertions(+), 20 deletions(-)
>> 
>> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
>> index 7c59987..ea3bb0d 100644
>> --- a/drivers/infiniband/core/cq.c
>> +++ b/drivers/infiniband/core/cq.c
>> @@ -253,6 +253,35 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
>> EXPORT_SYMBOL(__ib_alloc_cq_user);
>> 
>> /**
>> + * __ib_alloc_cq_any - allocate a completion queue
>> + * @dev:		device to allocate the CQ for
>> + * @private:		driver private data, accessible from cq->cq_context
>> + * @nr_cqe:		number of CQEs to allocate
>> + * @poll_ctx:		context to poll the CQ from.
>> + * @caller:		module owner name.
>> + *
>> + * Attempt to spread ULP Completion Queues over each device's interrupt
>> + * vectors.
>> + */
>> +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
>> +				int nr_cqe, enum ib_poll_context poll_ctx,
>> +				const char *caller)
>> +{
>> +	static atomic_t counter;
>> +	int comp_vector;
> 
> int comp_vector = 0;
> 
>> +
>> +	comp_vector = 0;
> 
> This assignment is better to be part of initialization.
> 
>> +	if (dev->num_comp_vectors > 1)
>> +		comp_vector =
>> +			atomic_inc_return(&counter) %
> 
> Don't we need manage "free list" of comp_vectors? Otherwise we can find
> ourselves providing already "taken" comp_vector.

Many ULPs use only comp_vector 0 today. It is obviously harmless
to have more than one ULP using the same comp_vector.

The point of this patch is best effort spreading. This algorithm
has been proposed repeatedly for several years on this list, and
each time the consensus has been this is simple and good enough.


> Just as an example:
> dev->num_comp_vectors = 2
> num_online_cpus = 4
> 
> The following combination will give us same comp_vector:
> 1. ib_alloc_cq_any()
> 2. ib_alloc_cq_any()
> 3. ib_free_cq()
> 4. goto 2
> 
>> +			min_t(int, dev->num_comp_vectors, num_online_cpus());
>> +
>> +	return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
>> +				  caller, NULL);
>> +}
>> +EXPORT_SYMBOL(__ib_alloc_cq_any);
>> +
>> +/**
>>  * ib_free_cq_user - free a completion queue
>>  * @cq:		completion queue to free.
>>  * @udata:	User data or NULL for kernel object
>> diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
>> index 1a039f1..e25c70a 100644
>> --- a/drivers/infiniband/ulp/srpt/ib_srpt.c
>> +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
>> @@ -1767,8 +1767,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
>> 		goto out;
>> 
>> retry:
>> -	ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size,
>> -			0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
>> +	ch->cq = ib_alloc_cq_any(sdev->device, ch, ch->rq_size + sq_size,
>> +				 IB_POLL_WORKQUEUE);
>> 	if (IS_ERR(ch->cq)) {
>> 		ret = PTR_ERR(ch->cq);
>> 		pr_err("failed to create CQ cqe= %d ret= %d\n",
>> diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
>> index cd07e53..3c91fa9 100644
>> --- a/fs/cifs/smbdirect.c
>> +++ b/fs/cifs/smbdirect.c
>> @@ -1654,15 +1654,17 @@ static struct smbd_connection *_smbd_get_connection(
>> 
>> 	info->send_cq = NULL;
>> 	info->recv_cq = NULL;
>> -	info->send_cq = ib_alloc_cq(info->id->device, info,
>> -			info->send_credit_target, 0, IB_POLL_SOFTIRQ);
>> +	info->send_cq =
>> +		ib_alloc_cq_any(info->id->device, info,
>> +				info->send_credit_target, IB_POLL_SOFTIRQ);
>> 	if (IS_ERR(info->send_cq)) {
>> 		info->send_cq = NULL;
>> 		goto alloc_cq_failed;
>> 	}
>> 
>> -	info->recv_cq = ib_alloc_cq(info->id->device, info,
>> -			info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
>> +	info->recv_cq =
>> +		ib_alloc_cq_any(info->id->device, info,
>> +				info->receive_credit_max, IB_POLL_SOFTIRQ);
>> 	if (IS_ERR(info->recv_cq)) {
>> 		info->recv_cq = NULL;
>> 		goto alloc_cq_failed;
>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>> index c5f8a9f..2a1523cc 100644
>> --- a/include/rdma/ib_verbs.h
>> +++ b/include/rdma/ib_verbs.h
>> @@ -3711,6 +3711,25 @@ static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
>> 				NULL);
>> }
>> 
>> +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
>> +				int nr_cqe, enum ib_poll_context poll_ctx,
>> +				const char *caller);
>> +
>> +/**
>> + * ib_alloc_cq_any: Allocate kernel CQ
>> + * @dev: The IB device
>> + * @private: Private data attached to the CQE
>> + * @nr_cqe: Number of CQEs in the CQ
>> + * @poll_ctx: Context used for polling the CQ
>> + */
>> +static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
>> +					    void *private, int nr_cqe,
>> +					    enum ib_poll_context poll_ctx)
>> +{
>> +	return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx,
>> +				 KBUILD_MODNAME);
>> +}
> 
> This should be macro and not inline function, because compiler can be
> instructed do not inline functions (there is kconfig option for that)
> and it will cause that wrong KBUILD_MODNAME will be inserted (ib_core
> instead of ulp).
> 
> And yes, commit c4367a26357b ("IB: Pass uverbs_attr_bundle down ib_x
> destroy path") did it completely wrong.

My understanding is the same as Jason's.


>> +
>> /**
>>  * ib_free_cq_user - Free kernel/user CQ
>>  * @cq: The CQ to free
>> diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
>> index bac8dad..b21c3c2 100644
>> --- a/net/9p/trans_rdma.c
>> +++ b/net/9p/trans_rdma.c
>> @@ -685,9 +685,9 @@ static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
>> 		goto error;
>> 
>> 	/* Create the Completion Queue */
>> -	rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
>> -			opts.sq_depth + opts.rq_depth + 1,
>> -			0, IB_POLL_SOFTIRQ);
>> +	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
>> +				   opts.sq_depth + opts.rq_depth + 1,
>> +				   IB_POLL_SOFTIRQ);
>> 	if (IS_ERR(rdma->cq))
>> 		goto error;
>> 
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> index 3fe6651..4d3db6e 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> @@ -454,14 +454,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>> 		dprintk("svcrdma: error creating PD for connect request\n");
>> 		goto errout;
>> 	}
>> -	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
>> -					0, IB_POLL_WORKQUEUE);
>> +	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
>> +					    IB_POLL_WORKQUEUE);
>> 	if (IS_ERR(newxprt->sc_sq_cq)) {
>> 		dprintk("svcrdma: error creating SQ CQ for connect request\n");
>> 		goto errout;
>> 	}
>> -	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth,
>> -					0, IB_POLL_WORKQUEUE);
>> +	newxprt->sc_rq_cq =
>> +		ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
>> 	if (IS_ERR(newxprt->sc_rq_cq)) {
>> 		dprintk("svcrdma: error creating RQ CQ for connect request\n");
>> 		goto errout;
>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
>> index 805b1f35..b10aa16 100644
>> --- a/net/sunrpc/xprtrdma/verbs.c
>> +++ b/net/sunrpc/xprtrdma/verbs.c
>> @@ -521,18 +521,17 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
>> 	init_waitqueue_head(&ep->rep_connect_wait);
>> 	ep->rep_receive_count = 0;
>> 
>> -	sendcq = ib_alloc_cq(ia->ri_id->device, NULL,
>> -			     ep->rep_attr.cap.max_send_wr + 1,
>> -			     ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0,
>> -			     IB_POLL_WORKQUEUE);
>> +	sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
>> +				 ep->rep_attr.cap.max_send_wr + 1,
>> +				 IB_POLL_WORKQUEUE);
>> 	if (IS_ERR(sendcq)) {
>> 		rc = PTR_ERR(sendcq);
>> 		goto out1;
>> 	}
>> 
>> -	recvcq = ib_alloc_cq(ia->ri_id->device, NULL,
>> -			     ep->rep_attr.cap.max_recv_wr + 1,
>> -			     0, IB_POLL_WORKQUEUE);
>> +	recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
>> +				 ep->rep_attr.cap.max_recv_wr + 1,
>> +				 IB_POLL_WORKQUEUE);
>> 	if (IS_ERR(recvcq)) {
>> 		rc = PTR_ERR(recvcq);
>> 		goto out2;

--
Chuck Lever
Leon Romanovsky July 29, 2019, 5:10 p.m. UTC | #4
On Mon, Jul 29, 2019 at 11:19:45AM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 29, 2019 at 08:49:33AM +0300, Leon Romanovsky wrote:
> > > +/**
> > > + * ib_alloc_cq_any: Allocate kernel CQ
> > > + * @dev: The IB device
> > > + * @private: Private data attached to the CQE
> > > + * @nr_cqe: Number of CQEs in the CQ
> > > + * @poll_ctx: Context used for polling the CQ
> > > + */
> > > +static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
> > > +					    void *private, int nr_cqe,
> > > +					    enum ib_poll_context poll_ctx)
> > > +{
> > > +	return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx,
> > > +				 KBUILD_MODNAME);
> > > +}
> >
> > This should be macro and not inline function, because compiler can be
> > instructed do not inline functions (there is kconfig option for that)
> > and it will cause that wrong KBUILD_MODNAME will be inserted (ib_core
> > instead of ulp).
>
> No, it can't, a static inline can only be de-inlined within the same
> compilation unit, so KBUILD_MODNAME can never be mixed up.
>
> You only need to use macros of the value changes within the
> compilation unit.

Thanks, good to know.

>
> Jason
Leon Romanovsky July 29, 2019, 5:12 p.m. UTC | #5
On Mon, Jul 29, 2019 at 10:24:12AM -0400, Chuck Lever wrote:
>
>
> > On Jul 29, 2019, at 1:49 AM, Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Sun, Jul 28, 2019 at 12:30:27PM -0400, Chuck Lever wrote:
> >> Send and Receive completion is handled on a single CPU selected at
> >> the time each Completion Queue is allocated. Typically this is when
> >> an initiator instantiates an RDMA transport, or when a target
> >> accepts an RDMA connection.
> >>
> >> Some ULPs cannot open a connection per CPU to spread completion
> >> workload across available CPUs and MSI vectors. For such ULPs,
> >> provide an API that allows the RDMA core to select a completion
> >> vector based on the device's complement of available comp_vecs.
> >>
> >> ULPs that invoke ib_alloc_cq() with only comp_vector 0 are converted
> >> to use the new API so that their completion workloads interfere less
> >> with each other.
> >>
> >> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> Cc: <linux-cifs@vger.kernel.org>
> >> Cc: <v9fs-developer@lists.sourceforge.net>
> >> ---
> >> drivers/infiniband/core/cq.c             |   29 +++++++++++++++++++++++++++++
> >> drivers/infiniband/ulp/srpt/ib_srpt.c    |    4 ++--
> >> fs/cifs/smbdirect.c                      |   10 ++++++----
> >> include/rdma/ib_verbs.h                  |   19 +++++++++++++++++++
> >> net/9p/trans_rdma.c                      |    6 +++---
> >> net/sunrpc/xprtrdma/svc_rdma_transport.c |    8 ++++----
> >> net/sunrpc/xprtrdma/verbs.c              |   13 ++++++-------
> >> 7 files changed, 69 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> >> index 7c59987..ea3bb0d 100644
> >> --- a/drivers/infiniband/core/cq.c
> >> +++ b/drivers/infiniband/core/cq.c
> >> @@ -253,6 +253,35 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
> >> EXPORT_SYMBOL(__ib_alloc_cq_user);
> >>
> >> /**
> >> + * __ib_alloc_cq_any - allocate a completion queue
> >> + * @dev:		device to allocate the CQ for
> >> + * @private:		driver private data, accessible from cq->cq_context
> >> + * @nr_cqe:		number of CQEs to allocate
> >> + * @poll_ctx:		context to poll the CQ from.
> >> + * @caller:		module owner name.
> >> + *
> >> + * Attempt to spread ULP Completion Queues over each device's interrupt
> >> + * vectors.
> >> + */
> >> +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
> >> +				int nr_cqe, enum ib_poll_context poll_ctx,
> >> +				const char *caller)
> >> +{
> >> +	static atomic_t counter;
> >> +	int comp_vector;
> >
> > int comp_vector = 0;
> >
> >> +
> >> +	comp_vector = 0;
> >
> > This assignment is better to be part of initialization.
> >
> >> +	if (dev->num_comp_vectors > 1)
> >> +		comp_vector =
> >> +			atomic_inc_return(&counter) %
> >
> > Don't we need manage "free list" of comp_vectors? Otherwise we can find
> > ourselves providing already "taken" comp_vector.
>
> Many ULPs use only comp_vector 0 today. It is obviously harmless
> to have more than one ULP using the same comp_vector.
>
> The point of this patch is best effort spreading. This algorithm
> has been proposed repeatedly for several years on this list, and
> each time the consensus has been this is simple and good enough.

Agree, it is better than current implementation.

Reviewed-by: Leon Romanovsky <leonro@mellanox.com>

Thanks

Patch
diff mbox series

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 7c59987..ea3bb0d 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -253,6 +253,35 @@  struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 EXPORT_SYMBOL(__ib_alloc_cq_user);
 
 /**
+ * __ib_alloc_cq_any - allocate a completion queue
+ * @dev:		device to allocate the CQ for
+ * @private:		driver private data, accessible from cq->cq_context
+ * @nr_cqe:		number of CQEs to allocate
+ * @poll_ctx:		context to poll the CQ from.
+ * @caller:		module owner name.
+ *
+ * Attempt to spread ULP Completion Queues over each device's interrupt
+ * vectors.
+ */
+struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
+				int nr_cqe, enum ib_poll_context poll_ctx,
+				const char *caller)
+{
+	static atomic_t counter;
+	int comp_vector;
+
+	comp_vector = 0;
+	if (dev->num_comp_vectors > 1)
+		comp_vector =
+			atomic_inc_return(&counter) %
+			min_t(int, dev->num_comp_vectors, num_online_cpus());
+
+	return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+				  caller, NULL);
+}
+EXPORT_SYMBOL(__ib_alloc_cq_any);
+
+/**
  * ib_free_cq_user - free a completion queue
  * @cq:		completion queue to free.
  * @udata:	User data or NULL for kernel object
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 1a039f1..e25c70a 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -1767,8 +1767,8 @@  static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
 		goto out;
 
 retry:
-	ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size,
-			0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
+	ch->cq = ib_alloc_cq_any(sdev->device, ch, ch->rq_size + sq_size,
+				 IB_POLL_WORKQUEUE);
 	if (IS_ERR(ch->cq)) {
 		ret = PTR_ERR(ch->cq);
 		pr_err("failed to create CQ cqe= %d ret= %d\n",
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index cd07e53..3c91fa9 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1654,15 +1654,17 @@  static struct smbd_connection *_smbd_get_connection(
 
 	info->send_cq = NULL;
 	info->recv_cq = NULL;
-	info->send_cq = ib_alloc_cq(info->id->device, info,
-			info->send_credit_target, 0, IB_POLL_SOFTIRQ);
+	info->send_cq =
+		ib_alloc_cq_any(info->id->device, info,
+				info->send_credit_target, IB_POLL_SOFTIRQ);
 	if (IS_ERR(info->send_cq)) {
 		info->send_cq = NULL;
 		goto alloc_cq_failed;
 	}
 
-	info->recv_cq = ib_alloc_cq(info->id->device, info,
-			info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
+	info->recv_cq =
+		ib_alloc_cq_any(info->id->device, info,
+				info->receive_credit_max, IB_POLL_SOFTIRQ);
 	if (IS_ERR(info->recv_cq)) {
 		info->recv_cq = NULL;
 		goto alloc_cq_failed;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c5f8a9f..2a1523cc 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3711,6 +3711,25 @@  static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 				NULL);
 }
 
+struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
+				int nr_cqe, enum ib_poll_context poll_ctx,
+				const char *caller);
+
+/**
+ * ib_alloc_cq_any: Allocate kernel CQ
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @poll_ctx: Context used for polling the CQ
+ */
+static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
+					    void *private, int nr_cqe,
+					    enum ib_poll_context poll_ctx)
+{
+	return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx,
+				 KBUILD_MODNAME);
+}
+
 /**
  * ib_free_cq_user - Free kernel/user CQ
  * @cq: The CQ to free
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index bac8dad..b21c3c2 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -685,9 +685,9 @@  static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
 		goto error;
 
 	/* Create the Completion Queue */
-	rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
-			opts.sq_depth + opts.rq_depth + 1,
-			0, IB_POLL_SOFTIRQ);
+	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
+				   opts.sq_depth + opts.rq_depth + 1,
+				   IB_POLL_SOFTIRQ);
 	if (IS_ERR(rdma->cq))
 		goto error;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3fe6651..4d3db6e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -454,14 +454,14 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		dprintk("svcrdma: error creating PD for connect request\n");
 		goto errout;
 	}
-	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-					0, IB_POLL_WORKQUEUE);
+	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
+					    IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_sq_cq)) {
 		dprintk("svcrdma: error creating SQ CQ for connect request\n");
 		goto errout;
 	}
-	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth,
-					0, IB_POLL_WORKQUEUE);
+	newxprt->sc_rq_cq =
+		ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_rq_cq)) {
 		dprintk("svcrdma: error creating RQ CQ for connect request\n");
 		goto errout;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 805b1f35..b10aa16 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -521,18 +521,17 @@  int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	init_waitqueue_head(&ep->rep_connect_wait);
 	ep->rep_receive_count = 0;
 
-	sendcq = ib_alloc_cq(ia->ri_id->device, NULL,
-			     ep->rep_attr.cap.max_send_wr + 1,
-			     ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0,
-			     IB_POLL_WORKQUEUE);
+	sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
+				 ep->rep_attr.cap.max_send_wr + 1,
+				 IB_POLL_WORKQUEUE);
 	if (IS_ERR(sendcq)) {
 		rc = PTR_ERR(sendcq);
 		goto out1;
 	}
 
-	recvcq = ib_alloc_cq(ia->ri_id->device, NULL,
-			     ep->rep_attr.cap.max_recv_wr + 1,
-			     0, IB_POLL_WORKQUEUE);
+	recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
+				 ep->rep_attr.cap.max_recv_wr + 1,
+				 IB_POLL_WORKQUEUE);
 	if (IS_ERR(recvcq)) {
 		rc = PTR_ERR(recvcq);
 		goto out2;