[v10,08/22] IB/hns: Add icm support

Message ID 1466087730-54856-9-git-send-email-oulijun@huawei.com
State Not Applicable, archived
Delegated to: David Miller

Commit Message

oulijun June 16, 2016, 2:35 p.m. UTC
This patch adds ICM support for RoCE. It initializes the ICM,
which manages the memory blocks used by RoCE; the RoCE data
structures, for example the CQ table, QP table, and MTPT table,
are located in it.

Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
PATCH v9/v8/v7/v6:
- No change over the PATCH v5

PATCH v5:
- The initial patch which was redesigned based on the second patch
  in PATCH v4
---
---
 drivers/infiniband/hw/hns/hns_roce_common.h |  19 ++
 drivers/infiniband/hw/hns/hns_roce_device.h |  30 ++
 drivers/infiniband/hw/hns/hns_roce_icm.c    | 460 ++++++++++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_icm.h    | 119 +++++++
 drivers/infiniband/hw/hns/hns_roce_main.c   |  84 +++++
 5 files changed, 712 insertions(+)
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_icm.c
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_icm.h
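
For orientation before the review thread: the new hns_roce_init_icm()
(shown in full in the diff below) creates one ICM table per resource
type (MTT, MTPT, QPC, IRRL, CQC), each backed by 128 KB chunks and torn
down in reverse order on failure. Condensed from the diff, the
per-table call pattern is:

	/* One ICM table per resource type; the QPC table shown here. */
	ret = hns_roce_init_icm_table(hr_dev,
				      (void *)&hr_dev->qp_table.qp_table,
				      ICM_TYPE_QPC, hr_dev->caps.qpc_entry_sz,
				      hr_dev->caps.num_qps, 0, 1, 0);
	if (ret) {
		dev_err(dev, "Failed to map QP context memory, aborting.\n");
		goto err_unmap_dmpt;
	}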

Comments

Leon Romanovsky June 17, 2016, 9:58 a.m. UTC | #1
On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
> This patch mainly added icm support for RoCE. It initializes icm
> which managers the relative memory blocks for RoCE. The data
> structures of RoCE will be located in it. For example, CQ table,
> QP table and MTPT table so on.
> 
> Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
> Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
> Signed-off-by: Lijun Ou <oulijun@huawei.com>
> ---

<...>

> +
> +static int hns_roce_alloc_icm_pages(struct scatterlist *mem, int order,
> +				    gfp_t gfp_mask)
> +{
> +	struct page *page;
> +
> +	page = alloc_pages(gfp_mask, order);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	sg_set_page(mem, page, PAGE_SIZE << order, 0);
> +
> +	return 0;
> +}
> +
> +static int hns_roce_alloc_icm_coherent(struct device *dev,
> +				       struct scatterlist *mem, int order,
> +				       gfp_t gfp_mask)
> +{
> +	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
> +				       &sg_dma_address(mem), gfp_mask);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	sg_set_buf(mem, buf, PAGE_SIZE << order);
> +	WARN_ON(mem->offset);
> +	sg_dma_len(mem) = PAGE_SIZE << order;
> +	return 0;
> +}
> +

<...>

> +
> +static void hns_roce_free_icm_pages(struct hns_roce_dev *hr_dev,
> +				    struct hns_roce_icm_chunk *chunk)
> +{
> +	int i;
> +
> +	if (chunk->nsg > 0)
> +		dma_unmap_sg(&hr_dev->pdev->dev, chunk->mem, chunk->npages,
> +			     DMA_BIDIRECTIONAL);
> +
> +	for (i = 0; i < chunk->npages; ++i)
> +		__free_pages(sg_page(&chunk->mem[i]),
> +			     get_order(chunk->mem[i].length));

You used alloc_pages for this allocation, so why are you using
__free_pages instead of free_pages?
Leon Romanovsky June 20, 2016, 6:06 a.m. UTC | #2
On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
> 
> 
> On 2016/6/17 17:58, Leon Romanovsky wrote:
> >On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
> >>This patch mainly added icm support for RoCE. It initializes icm
> >>which managers the relative memory blocks for RoCE. The data
> >>structures of RoCE will be located in it. For example, CQ table,
> >>QP table and MTPT table so on.
> >>
> >>Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
> >>Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
> >>Signed-off-by: Lijun Ou <oulijun@huawei.com>
> >>---
> ><...>
> >
> >>+
> >>+static int hns_roce_alloc_icm_pages(struct scatterlist *mem, int order,
> >>+				    gfp_t gfp_mask)
> >>+{
> >>+	struct page *page;
> >>+
> >>+	page = alloc_pages(gfp_mask, order);
> >>+	if (!page)
> >>+		return -ENOMEM;
> >>+
> >>+	sg_set_page(mem, page, PAGE_SIZE << order, 0);
> >>+
> >>+	return 0;
> >>+}
> >>+
> >>+static int hns_roce_alloc_icm_coherent(struct device *dev,
> >>+				       struct scatterlist *mem, int order,
> >>+				       gfp_t gfp_mask)
> >>+{
> >>+	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
> >>+				       &sg_dma_address(mem), gfp_mask);
> >>+	if (!buf)
> >>+		return -ENOMEM;
> >>+
> >>+	sg_set_buf(mem, buf, PAGE_SIZE << order);
> >>+	WARN_ON(mem->offset);
> >>+	sg_dma_len(mem) = PAGE_SIZE << order;
> >>+	return 0;
> >>+}
> >>+
> ><...>
> >
> >>+
> >>+static void hns_roce_free_icm_pages(struct hns_roce_dev *hr_dev,
> >>+				    struct hns_roce_icm_chunk *chunk)
> >>+{
> >>+	int i;
> >>+
> >>+	if (chunk->nsg > 0)
> >>+		dma_unmap_sg(&hr_dev->pdev->dev, chunk->mem, chunk->npages,
> >>+			     DMA_BIDIRECTIONAL);
> >>+
> >>+	for (i = 0; i < chunk->npages; ++i)
> >>+		__free_pages(sg_page(&chunk->mem[i]),
> >>+			     get_order(chunk->mem[i].length));
> >You used alloc_pages for this allocation, so why are you using
> >__free_pages instead of free_pages?
> Hi, Leon
>     The function prototypes of these functions are as below:
>         static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int
> order);
>         void free_pages(unsigned long addr, unsigned int order);
>         void __free_pages(struct page *page, unsigned int order);
> 
>     The type of the first parameter of __free_pages is the same as the
> type of the return value of alloc_pages.
>     So it seems better to call __free_pages to release memory that was
> allocated by calling alloc_pages.

OK, I see.
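
A minimal sketch of the pairing Wei Hu describes (illustration only,
not part of the patch): alloc_pages() hands back a struct page * that
__free_pages() consumes, while __get_free_pages()/free_pages() work
with kernel virtual addresses instead:

#include <linux/gfp.h>

static int pairing_demo(void)
{
	struct page *page;
	unsigned long addr;

	/* alloc_pages() returns a struct page *; pair it with
	 * __free_pages(). */
	page = alloc_pages(GFP_KERNEL, 2);	/* 4 pages */
	if (!page)
		return -ENOMEM;
	__free_pages(page, 2);

	/* __get_free_pages() returns an unsigned long address; pair it
	 * with free_pages(). */
	addr = __get_free_pages(GFP_KERNEL, 2);
	if (!addr)
		return -ENOMEM;
	free_pages(addr, 2);

	return 0;
}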

Another question which you didn't answer [1].

"I wonder if you have the same needs for ICM as it is in mlx4 device.
Do you have firmware?"

[1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2

> 
> Regards
> Wei Hu
> 
> >
>
Wei Hu(Xavier) June 20, 2016, 7:49 a.m. UTC | #3
On 2016/6/20 14:06, Leon Romanovsky wrote:
> On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
>>
>> On 2016/6/17 17:58, Leon Romanovsky wrote:
>>> On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
>>>> This patch mainly added icm support for RoCE. It initializes icm
>>>> which managers the relative memory blocks for RoCE. The data
>>>> structures of RoCE will be located in it. For example, CQ table,
>>>> QP table and MTPT table so on.
>>>>
>>>> Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
>>>> Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
>>>> Signed-off-by: Lijun Ou <oulijun@huawei.com>
>>>> ---
>>> <...>
>>>
>>>> +
>>>> +static int hns_roce_alloc_icm_pages(struct scatterlist *mem, int order,
>>>> +				    gfp_t gfp_mask)
>>>> +{
>>>> +	struct page *page;
>>>> +
>>>> +	page = alloc_pages(gfp_mask, order);
>>>> +	if (!page)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	sg_set_page(mem, page, PAGE_SIZE << order, 0);
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static int hns_roce_alloc_icm_coherent(struct device *dev,
>>>> +				       struct scatterlist *mem, int order,
>>>> +				       gfp_t gfp_mask)
>>>> +{
>>>> +	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
>>>> +				       &sg_dma_address(mem), gfp_mask);
>>>> +	if (!buf)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	sg_set_buf(mem, buf, PAGE_SIZE << order);
>>>> +	WARN_ON(mem->offset);
>>>> +	sg_dma_len(mem) = PAGE_SIZE << order;
>>>> +	return 0;
>>>> +}
>>>> +
>>> <...>
>>>
>>>> +
>>>> +static void hns_roce_free_icm_pages(struct hns_roce_dev *hr_dev,
>>>> +				    struct hns_roce_icm_chunk *chunk)
>>>> +{
>>>> +	int i;
>>>> +
>>>> +	if (chunk->nsg > 0)
>>>> +		dma_unmap_sg(&hr_dev->pdev->dev, chunk->mem, chunk->npages,
>>>> +			     DMA_BIDIRECTIONAL);
>>>> +
>>>> +	for (i = 0; i < chunk->npages; ++i)
>>>> +		__free_pages(sg_page(&chunk->mem[i]),
>>>> +			     get_order(chunk->mem[i].length));
>>> You used alloc_pages for this allocation, so why are you using
>>> __free_pages instead of free_pages?
>> Hi, Leon
>>      The function prototype of these functions as below:
>>          static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int
>> order);
>>          void free_pages(unsigned long addr, unsigned int order);
>>          void __free_pages(struct page *page, unsigned int order);
>>
>>      The type of the first parameter of free_pages is same with the type of
>> return value of alloc_pages.
>>      Maybe it is better to call __free_pages to release memory that allocated
>> by calling alloc_pages.
> OK, I see.
>
> Another question which you didn't answer [1].
>
> "I wonder if you have the same needs for ICM as it is in mlx4 device.
> Do you have firmware?"
>
> [1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
Hi, Leon
     We don't have firmware at the moment.
     But the hardware still needs memory for QPC\CQC\MTPT\mtt etc.

Thanks
Wei Hu
>> Regards
>> Wei Hu
>>
Leon Romanovsky June 20, 2016, 9:27 a.m. UTC | #4
On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:
> 
> 
> On 2016/6/20 14:06, Leon Romanovsky wrote:
> >On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
> >>
> >>On 2016/6/17 17:58, Leon Romanovsky wrote:
> >>>On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
> >>>>This patch mainly added icm support for RoCE. It initializes icm
> >>>>which managers the relative memory blocks for RoCE. The data
> >>>>structures of RoCE will be located in it. For example, CQ table,
> >>>>QP table and MTPT table so on.
> >>>>
> >>>>Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
> >>>>Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
> >>>>Signed-off-by: Lijun Ou <oulijun@huawei.com>
> >>>>---
> >>><...>
> >>>
> >>>>+
> >Another question which you didn't answer [1].
> >
> >"I wonder if you have the same needs for ICM as it is in mlx4 device.
> >Do you have firmware?"
> >
> >[1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
> Hi, Leon
>     Now we haven't firmware.
>     But hardware still need memory for QPC\CQC\MTPT\mtt etc.

ICM stands for InfiniHost (Interconnect) Context Memory; it is a
specific memory area shared between host <-> FW and host <-> HW when
the HW is aware of specific structures.

I assume that in your case, it is enough to allocate a memory region
and supply it to HW. Am I right?

> 
> Thanks
> Wei Hu
> >>Regards
> >>Wei Hu
> >>
> 
>
Wei Hu(Xavier) June 20, 2016, 9:48 a.m. UTC | #5
On 2016/6/20 17:27, Leon Romanovsky wrote:
> On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:
>>
>> On 2016/6/20 14:06, Leon Romanovsky wrote:
>>> On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
>>>> On 2016/6/17 17:58, Leon Romanovsky wrote:
>>>>> On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
>>>>>> This patch mainly added icm support for RoCE. It initializes icm
>>>>>> which managers the relative memory blocks for RoCE. The data
>>>>>> structures of RoCE will be located in it. For example, CQ table,
>>>>>> QP table and MTPT table so on.
>>>>>>
>>>>>> Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
>>>>>> Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
>>>>>> Signed-off-by: Lijun Ou <oulijun@huawei.com>
>>>>>> ---
>>>>> <...>
>>>>>
>>>>>> +
>>> Another question which you didn't answer [1].
>>>
>>> "I wonder if you have the same needs for ICM as it is in mlx4 device.
>>> Do you have firmware?"
>>>
>>> [1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
>> Hi, Leon
>>      Now we haven't firmware.
>>      But hardware still need memory for QPC\CQC\MTPT\mtt etc.
> ICM stands for InfiniHost (Interconnect) Context Memory is a specific
> memory place to share between host <-> FW and host <-> HW if HW is
> aware of specific structures.
>
> I assume that in your case, it is enough to allocate memory region and
> supply it to HW. Am I right?
For our hardware:
1. ICM provides a memory management method that works very well for
QPC\CQC\MTPT\mtt etc.; we need it.
2. The memory for QPC\CQC\MTPT\mtt is used only by the RoCE hardware
and driver; we don't want to use an MR.
3. We don't have firmware now; maybe we will need it in a next version.

Thanks
Wei Hu.
>> Thanks
>> Wei Hu
>>>> Regards
>>>> Wei Hu
>>>>
>>
Leon Romanovsky June 20, 2016, 1:04 p.m. UTC | #6
On Mon, Jun 20, 2016 at 05:48:15PM +0800, Wei Hu (Xavier) wrote:
> 
> 
> On 2016/6/20 17:27, Leon Romanovsky wrote:
> >On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:
> >>
> >>On 2016/6/20 14:06, Leon Romanovsky wrote:
> >>>On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
> >>>>On 2016/6/17 17:58, Leon Romanovsky wrote:
> >>>>>On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
> >>>>>>This patch mainly added icm support for RoCE. It initializes icm
> >>>>>>which managers the relative memory blocks for RoCE. The data
> >>>>>>structures of RoCE will be located in it. For example, CQ table,
> >>>>>>QP table and MTPT table so on.
> >>>>>>
> >>>>>>Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
> >>>>>>Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
> >>>>>>Signed-off-by: Lijun Ou <oulijun@huawei.com>
> >>>>>>---
> >>>>><...>
> >>>>>
> >>>>>>+
> >>>Another question which you didn't answer [1].
> >>>
> >>>"I wonder if you have the same needs for ICM as it is in mlx4 device.
> >>>Do you have firmware?"
> >>>
> >>>[1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
> >>Hi, Leon
> >>     Now we haven't firmware.
> >>     But hardware still need memory for QPC\CQC\MTPT\mtt etc.
> >ICM stands for InfiniHost (Interconnect) Context Memory is a specific
> >memory place to share between host <-> FW and host <-> HW if HW is
> >aware of specific structures.
> >
> >I assume that in your case, it is enough to allocate memory region and
> >supply it to HW. Am I right?
> For Our hardware,
> 1. ICM has a memory management method, It's very good for QPC\CQC\MTPT\mtt
> etc. we need it.

You need special HW to leverage it. AFAIK it is Mellanox specific.

> 2. The meomry for QPC\CQC\MTPT\mtt only used for RoCE hardware and driver,
> we don't want use MR.

I didn't mean an InfiniBand MR, but a memory region returned from
standard allocation functions (kmalloc, ...).

> 3. Now we haven't firmware, maybe we need it next version.

You are always welcome to add support once it is needed; there is no
need to add it in advance.

Thanks
Wei Hu(Xavier) June 21, 2016, 4:37 a.m. UTC | #7
On 2016/6/20 21:04, Leon Romanovsky wrote:
> On Mon, Jun 20, 2016 at 05:48:15PM +0800, Wei Hu (Xavier) wrote:
>>
>> On 2016/6/20 17:27, Leon Romanovsky wrote:
>>> On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:
>>>> On 2016/6/20 14:06, Leon Romanovsky wrote:
>>>>> On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
>>>>>> On 2016/6/17 17:58, Leon Romanovsky wrote:
>>>>>>> On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
>>>>>>>> This patch mainly added icm support for RoCE. It initializes icm
>>>>>>>> which managers the relative memory blocks for RoCE. The data
>>>>>>>> structures of RoCE will be located in it. For example, CQ table,
>>>>>>>> QP table and MTPT table so on.
>>>>>>>>
>>>>>>>> Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
>>>>>>>> Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
>>>>>>>> Signed-off-by: Lijun Ou <oulijun@huawei.com>
>>>>>>>> ---
>>>>>>> <...>
>>>>>>>
>>>>>>>> +
>>>>> Another question which you didn't answer [1].
>>>>>
>>>>> "I wonder if you have the same needs for ICM as it is in mlx4 device.
>>>>> Do you have firmware?"
>>>>>
>>>>> [1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
>>>> Hi, Leon
>>>>      Now we haven't firmware.
>>>>      But hardware still need memory for QPC\CQC\MTPT\mtt etc.
>>> ICM stands for InfiniHost (Interconnect) Context Memory is a specific
>>> memory place to share between host <-> FW and host <-> HW if HW is
>>> aware of specific structures.
>>>
>>> I assume that in your case, it is enough to allocate memory region and
>>> supply it to HW. Am I right?
>> For Our hardware,
>> 1. ICM has a memory management method, It's very good for QPC\CQC\MTPT\mtt
>> etc. we need it.
> You need special HW to leverage its. AFAIK it is Mellanox specific.
For our hardware, we use ICM for memory management; the memory is
shared between the host and the HW.
QPC\CQC\MTPT\mtt have specific memory requirements.
QPC\CQC\MTPT need contiguous memory, and we use ICM to manage the
blocks of memory. It works very well!
>> 2. The meomry for QPC\CQC\MTPT\mtt only used for RoCE hardware and driver,
>> we don't want use MR.
> I didn't mean Infiniband MR, but memory region returned from standard
> allocation functions (kmalloc, ...).
>
>> 3. Now we haven't firmware, maybe we need it next version.
> You are always invited to add support once it will be needed, no need to
> add it in advance.
>
> Thanks
Leon Romanovsky June 21, 2016, 11:55 a.m. UTC | #8
On Tue, Jun 21, 2016 at 12:37:39PM +0800, Wei Hu (Xavier) wrote:
> 
> 
> On 2016/6/20 21:04, Leon Romanovsky wrote:
> >On Mon, Jun 20, 2016 at 05:48:15PM +0800, Wei Hu (Xavier) wrote:
> >>
> >>On 2016/6/20 17:27, Leon Romanovsky wrote:
> >>>On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:
> >>>>On 2016/6/20 14:06, Leon Romanovsky wrote:
> >>>>>On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
> >>>>>>On 2016/6/17 17:58, Leon Romanovsky wrote:
> >>>>>>>On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
> >>>>>>>>This patch mainly added icm support for RoCE. It initializes icm
> >>>>>>>>which managers the relative memory blocks for RoCE. The data
> >>>>>>>>structures of RoCE will be located in it. For example, CQ table,
> >>>>>>>>QP table and MTPT table so on.
> >>>>>>>>
> >>>>>>>>Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
> >>>>>>>>Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
> >>>>>>>>Signed-off-by: Lijun Ou <oulijun@huawei.com>
> >>>>>>>>---
> >>>>>>><...>
> >>>>>>>
> >>>>>>>>+
> >>>>>Another question which you didn't answer [1].
> >>>>>
> >>>>>"I wonder if you have the same needs for ICM as it is in mlx4 device.
> >>>>>Do you have firmware?"
> >>>>>
> >>>>>[1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
> >>>>Hi, Leon
> >>>>     Now we haven't firmware.
> >>>>     But hardware still need memory for QPC\CQC\MTPT\mtt etc.
> >>>ICM stands for InfiniHost (Interconnect) Context Memory is a specific
> >>>memory place to share between host <-> FW and host <-> HW if HW is
> >>>aware of specific structures.
> >>>
> >>>I assume that in your case, it is enough to allocate memory region and
> >>>supply it to HW. Am I right?
> >>For Our hardware,
> >>1. ICM has a memory management method, It's very good for QPC\CQC\MTPT\mtt
> >>etc. we need it.
> >You need special HW to leverage its. AFAIK it is Mellanox specific.
> For our hardware, we use ICM to memory management, the memory shared with
> host and HW.
> QPC\CQC\MTPT\mtt has specific memory requirement.
> QPC\CQC\MTPT need continuous memory. we use ICM to management the block of
> memory. It's very good!

I am still not convinced that you need to copy the whole ICM logic,
which is specific to Mellanox. Your requirements can be implemented
with the standard CMA and/or DMA APIs.
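
A minimal sketch of that alternative, assuming one physically
contiguous 128 KB context block per table (the block size matches
HNS_ROCE_TABLE_CHUNK_SIZE in the patch; the helper name here is
hypothetical):

#include <linux/dma-mapping.h>

#define CTX_BLOCK_SIZE	(1 << 17)	/* 128 KB, as in the patch */

/* Hypothetical helper: allocate one contiguous block with the
 * standard DMA API (backed by CMA where enabled); the bus address
 * to program into the hardware is returned via *dma_handle. */
static void *alloc_ctx_block(struct device *dev, dma_addr_t *dma_handle)
{
	return dma_alloc_coherent(dev, CTX_BLOCK_SIZE, dma_handle,
				  GFP_KERNEL);
}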

> >>2. The meomry for QPC\CQC\MTPT\mtt only used for RoCE hardware and driver,
> >>we don't want use MR.
> >I didn't mean Infiniband MR, but memory region returned from standard
> >allocation functions (kmalloc, ...).
> >
> >>3. Now we haven't firmware, maybe we need it next version.
> >You are always invited to add support once it will be needed, no need to
> >add it in advance.
> >
> >Thanks
> 
>
Wei Hu(Xavier) June 22, 2016, 3:53 a.m. UTC | #9
On 2016/6/21 19:55, Leon Romanovsky wrote:
> On Tue, Jun 21, 2016 at 12:37:39PM +0800, Wei Hu (Xavier) wrote:
>>
>> On 2016/6/20 21:04, Leon Romanovsky wrote:
>>> On Mon, Jun 20, 2016 at 05:48:15PM +0800, Wei Hu (Xavier) wrote:
>>>> On 2016/6/20 17:27, Leon Romanovsky wrote:
>>>>> On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:
>>>>>> On 2016/6/20 14:06, Leon Romanovsky wrote:
>>>>>>> On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:
>>>>>>>> On 2016/6/17 17:58, Leon Romanovsky wrote:
>>>>>>>>> On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:
>>>>>>>>>> This patch mainly added icm support for RoCE. It initializes icm
>>>>>>>>>> which managers the relative memory blocks for RoCE. The data
>>>>>>>>>> structures of RoCE will be located in it. For example, CQ table,
>>>>>>>>>> QP table and MTPT table so on.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Wei Hu <xavier.huwei@huawei.com>
>>>>>>>>>> Signed-off-by: Nenglong Zhao <zhaonenglong@hisilicon.com>
>>>>>>>>>> Signed-off-by: Lijun Ou <oulijun@huawei.com>
>>>>>>>>>> ---
>>>>>>>>> <...>
>>>>>>>>>
>>>>>>>>>> +
>>>>>>> Another question which you didn't answer [1].
>>>>>>>
>>>>>>> "I wonder if you have the same needs for ICM as it is in mlx4 device.
>>>>>>> Do you have firmware?"
>>>>>>>
>>>>>>> [1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2
>>>>>> Hi, Leon
>>>>>>      Now we haven't firmware.
>>>>>>      But hardware still need memory for QPC\CQC\MTPT\mtt etc.
>>>>> ICM stands for InfiniHost (Interconnect) Context Memory is a specific
>>>>> memory place to share between host <-> FW and host <-> HW if HW is
>>>>> aware of specific structures.
>>>>>
>>>>> I assume that in your case, it is enough to allocate memory region and
>>>>> supply it to HW. Am I right?
>>>> For Our hardware,
>>>> 1. ICM has a memory management method, It's very good for QPC\CQC\MTPT\mtt
>>>> etc. we need it.
>>> You need special HW to leverage its. AFAIK it is Mellanox specific.
>> For our hardware, we use ICM to memory management, the memory shared with
>> host and HW.
>> QPC\CQC\MTPT\mtt has specific memory requirement.
>> QPC\CQC\MTPT need continuous memory. we use ICM to management the block of
>> memory. It's very good!
> I wasn't convinced why do you need to copy whole ICM logic which is
> specific to Mellanox. Your requirements can be implemented by standard CMA
> and/or DMA.
Hi, Leon

In the hip06 SoC,
the hardware needs multiple memory blocks for QPC\CQC\MTPT; every
block is a contiguous region of xx Kbytes (for example 128 Kbytes).
We need to configure the first address of each 128 Kbyte block into
the hardware.

For example:
//------------------------------------------------------------------------
Example 1:
In create QP:
1. If the xx Kbyte block that contains the QPC for this QPN has not
been allocated yet, do step 2; else do step 3.
2. DMA-allocate an xx Kbyte block for QPC, and configure its first
address into the hardware.
3. Find the QPC entry within the xx Kbyte block and get its dma_addr.
4. Send a mailbox command to the hardware to create the QP.

In step 2, we call the xx_table_get function below to perform this
logic:
int hns_roce_table_get(struct hns_roce_dev *hr_dev,
                struct hns_roce_icm_table *table, unsigned long obj)
{
     <snip>
     /* dma_alloc_coherent a 128 Kbyte block */
     hns_roce_alloc_icm(hr_dev,
                   HNS_ROCE_TABLE_CHUNK_SIZE >> PAGE_SHIFT, xxxx);
     <snip>
     /* configure the first address of the xx Kbyte block into the hardware */
     hns_roce_map_icm(hr_dev, table, obj);
     <snip>
}

In step 3, we call the xx_table_find function to perform the lookup:
void *hns_roce_table_find(struct hns_roce_icm_table *table,
                          unsigned long obj, dma_addr_t *dma_handle);


Example 2:
In modify QP:
1. Find the QPC memory and get its virtual address.
2. Modify the fields of the QPC.
3. Send a mailbox command to the hardware to modify the QP.

In step 1, we call the xx_table_find function to perform the lookup
(see the sketch after this example).
//--------------------------------------------------------------------------
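
For illustration, a sketch of what such a lookup can look like, built
on the hns_roce_icm_table and hns_roce_icm_chunk structures defined in
this patch; this is an assumption modeled on the chunk layout, not the
hip06 driver's actual hns_roce_table_find(), which also handles
locking and the coherent case:

/* Walk the chunks of the 128 KB ICM block that backs 'obj' and return
 * the kernel virtual address of its entry, filling in the matching
 * DMA address. */
void *table_find_sketch(struct hns_roce_icm_table *table,
			unsigned long obj, dma_addr_t *dma_handle)
{
	unsigned long idx = (obj & (table->num_obj - 1)) * table->obj_size;
	struct hns_roce_icm *icm = table->icm[idx / HNS_ROCE_TABLE_CHUNK_SIZE];
	unsigned long offset = idx % HNS_ROCE_TABLE_CHUNK_SIZE;
	struct hns_roce_icm_chunk *chunk;
	int i;

	if (!icm)
		return NULL;

	list_for_each_entry(chunk, &icm->chunk_list, list) {
		for (i = 0; i < chunk->npages; ++i) {
			/* The entry lives in this scatterlist element. */
			if (chunk->mem[i].length > offset) {
				*dma_handle = sg_dma_address(&chunk->mem[i]) +
					      offset;
				return lowmem_page_address(
					sg_page(&chunk->mem[i])) + offset;
			}
			offset -= chunk->mem[i].length;
		}
	}

	return NULL;
}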


So, although we don't have firmware now, the ICM algorithm still suits
the hip06 SoC perfectly.

Regards
Wei Hu
>>>> 2. The meomry for QPC\CQC\MTPT\mtt only used for RoCE hardware and driver,
>>>> we don't want use MR.
>>> I didn't mean Infiniband MR, but memory region returned from standard
>>> allocation functions (kmalloc, ...).
>>>
>>>> 3. Now we haven't firmware, maybe we need it next version.
>>> You are always invited to add support once it will be needed, no need to
>>> add it in advance.
>>>
>>> Thanks
>>

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 4805852..f15bf1b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -53,6 +53,22 @@ 
 #define roce_set_bit(origin, shift, val) \
 	roce_set_field((origin), (1ul << (shift)), (shift), (val))
 
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S 0
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M   \
+	(((1UL << 19) - 1) << ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S)
+
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_S 19
+
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S 20
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M   \
+	(((1UL << 2) - 1) << ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S)
+
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S 22
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M   \
+	(((1UL << 5) - 1) << ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S)
+
+#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S 31
+
 #define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S 0
 #define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M   \
 	(((1UL << 2) - 1) << ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S)
@@ -93,6 +109,8 @@ 
 #define ROCEE_SYS_IMAGE_GUID_L_REG		0xC
 #define ROCEE_SYS_IMAGE_GUID_H_REG		0x10
 
+#define ROCEE_BT_CMD_H_REG			0x204
+
 #define ROCEE_CAEP_AEQE_CONS_IDX_REG		0x3AC
 #define ROCEE_CAEP_CEQC_CONS_IDX_0_REG		0x3BC
 
@@ -105,6 +123,7 @@ 
 
 #define ROCEE_CAEP_CE_INTERVAL_CFG_REG		0x190
 #define ROCEE_CAEP_CE_BURST_NUM_CFG_REG		0x194
+#define ROCEE_BT_CMD_L_REG			0x200
 
 #define ROCEE_MB1_REG				0x210
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 57184ab..ab9ba61 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -111,6 +111,26 @@  enum {
 	HNS_ROCE_CMD_SUCCESS			= 1,
 };
 
+struct hns_roce_icm_table {
+	/* ICM type: 0 = qpc 1 = mtt 2 = cqc 3 = srq 4 = other */
+	u32		type;
+	/* ICM array element num */
+	unsigned long	num_icm;
+	/* ICM entry record obj total num */
+	unsigned long	num_obj;
+	/* Single obj size */
+	unsigned long	obj_size;
+	int		lowmem;
+	int		coherent;
+	struct mutex	mutex;
+	struct hns_roce_icm **icm;
+};
+
+struct hns_roce_mr_table {
+	struct hns_roce_icm_table	mtt_table;
+	struct hns_roce_icm_table	mtpt_table;
+};
+
 struct hns_roce_buf_list {
 	void		*buf;
 	dma_addr_t	map;
@@ -126,11 +146,14 @@  struct hns_roce_cq {
 
 struct hns_roce_qp_table {
 	spinlock_t			lock;
+	struct hns_roce_icm_table	qp_table;
+	struct hns_roce_icm_table	irrl_table;
 };
 
 struct hns_roce_cq_table {
 	spinlock_t			lock;
 	struct radix_tree_root		tree;
+	struct hns_roce_icm_table	table;
 };
 
 struct hns_roce_cmd_context {
@@ -259,6 +282,7 @@  struct hns_roce_dev {
 	struct ib_device	ib_dev;
 	struct platform_device  *pdev;
 	const char		*irq_names;
+	spinlock_t		bt_cmd_lock;
 	struct hns_roce_ib_iboe iboe;
 
 	int			irq[HNS_ROCE_MAX_IRQ_NUM];
@@ -273,6 +297,7 @@  struct hns_roce_dev {
 	u32                     hw_rev;
 
 	struct hns_roce_cmdq	cmd;
+	struct hns_roce_mr_table  mr_table;
 	struct hns_roce_cq_table  cq_table;
 	struct hns_roce_qp_table  qp_table;
 	struct hns_roce_eq_table  eq_table;
@@ -282,6 +307,11 @@  struct hns_roce_dev {
 	struct hns_roce_hw	*hw;
 };
 
+static inline void hns_roce_write64_k(__be32 val[2], void __iomem *dest)
+{
+	__raw_writeq(*(u64 *) val, dest);
+}
+
 static inline struct hns_roce_qp
 	*__hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, u32 qpn)
 {
diff --git a/drivers/infiniband/hw/hns/hns_roce_icm.c b/drivers/infiniband/hw/hns/hns_roce_icm.c
new file mode 100644
index 0000000..86be920
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_icm.c
@@ -0,0 +1,460 @@ 
+/*
+ * Copyright (c) 2016 Hisilicon Limited.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include "hns_roce_device.h"
+#include "hns_roce_icm.h"
+#include "hns_roce_common.h"
+
+#define HW_SYNC_TIMEOUT_MSECS		500
+
+#define HNS_ROCE_ICM_ALLOC_SIZE		(1 << 17)
+#define HNS_ROCE_TABLE_CHUNK_SIZE	(1 << 17)
+
+#define DMA_ADDR_T_SHIFT		12
+#define BT_CMD_SYNC_SHIFT		31
+#define BT_BA_SHIFT			32
+
+static int hns_roce_alloc_icm_pages(struct scatterlist *mem, int order,
+				    gfp_t gfp_mask)
+{
+	struct page *page;
+
+	page = alloc_pages(gfp_mask, order);
+	if (!page)
+		return -ENOMEM;
+
+	sg_set_page(mem, page, PAGE_SIZE << order, 0);
+
+	return 0;
+}
+
+static int hns_roce_alloc_icm_coherent(struct device *dev,
+				       struct scatterlist *mem, int order,
+				       gfp_t gfp_mask)
+{
+	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				       &sg_dma_address(mem), gfp_mask);
+	if (!buf)
+		return -ENOMEM;
+
+	sg_set_buf(mem, buf, PAGE_SIZE << order);
+	WARN_ON(mem->offset);
+	sg_dma_len(mem) = PAGE_SIZE << order;
+	return 0;
+}
+
+struct hns_roce_icm *hns_roce_alloc_icm(struct hns_roce_dev *hr_dev, int npages,
+					gfp_t gfp_mask, int coherent)
+{
+	struct hns_roce_icm_chunk *chunk = NULL;
+	struct hns_roce_icm *icm;
+	int cur_order;
+	int ret;
+
+	WARN_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+	icm = kmalloc(sizeof(*icm),
+		      gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+	if (!icm)
+		return NULL;
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(HNS_ROCE_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		if (!chunk) {
+			chunk = kmalloc(sizeof(*chunk),
+				gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+			if (!chunk)
+				goto fail;
+
+			sg_init_table(chunk->mem, HNS_ROCE_ICM_CHUNK_LEN);
+			chunk->npages = 0;
+			chunk->nsg = 0;
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		if (coherent)
+			ret = hns_roce_alloc_icm_coherent(&hr_dev->pdev->dev,
+						&chunk->mem[chunk->npages],
+						cur_order, gfp_mask);
+		else
+			ret = hns_roce_alloc_icm_pages(
+						&chunk->mem[chunk->npages],
+						cur_order, gfp_mask);
+		if (!ret) {
+			++chunk->npages;
+			if (coherent) {
+				++chunk->nsg;
+			} else if (chunk->npages == HNS_ROCE_ICM_CHUNK_LEN) {
+				chunk->nsg = dma_map_sg(&hr_dev->pdev->dev,
+						      chunk->mem, chunk->npages,
+						      DMA_BIDIRECTIONAL);
+				if (chunk->nsg <= 0)
+					goto fail;
+
+				chunk = NULL;
+			}
+			npages -= 1 << cur_order;
+		} else {
+			/*
+			 * If allocating the 128 KB block fails, do not
+			 * fall back to smaller allocations; return
+			 * failure directly.
+			 */
+			goto fail;
+		}
+	}
+
+	if (!coherent && chunk) {
+		chunk->nsg = dma_map_sg(&hr_dev->pdev->dev, chunk->mem,
+					chunk->npages, DMA_BIDIRECTIONAL);
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	hns_roce_free_icm(hr_dev, icm, coherent);
+	return NULL;
+}
+
+static void hns_roce_free_icm_pages(struct hns_roce_dev *hr_dev,
+				    struct hns_roce_icm_chunk *chunk)
+{
+	int i;
+
+	if (chunk->nsg > 0)
+		dma_unmap_sg(&hr_dev->pdev->dev, chunk->mem, chunk->npages,
+			     DMA_BIDIRECTIONAL);
+
+	for (i = 0; i < chunk->npages; ++i)
+		__free_pages(sg_page(&chunk->mem[i]),
+			     get_order(chunk->mem[i].length));
+}
+
+static void hns_roce_free_icm_coherent(struct hns_roce_dev *hr_dev,
+				       struct hns_roce_icm_chunk *chunk)
+{
+	int i;
+
+	for (i = 0; i < chunk->npages; ++i)
+		dma_free_coherent(&hr_dev->pdev->dev, chunk->mem[i].length,
+				  lowmem_page_address(sg_page(&chunk->mem[i])),
+				  sg_dma_address(&chunk->mem[i]));
+}
+
+void hns_roce_free_icm(struct hns_roce_dev *hr_dev, struct hns_roce_icm *icm,
+		       int coherent)
+{
+	struct hns_roce_icm_chunk *chunk, *tmp;
+
+	if (!icm)
+		return;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (coherent)
+			hns_roce_free_icm_coherent(hr_dev, chunk);
+		else
+			hns_roce_free_icm_pages(hr_dev, chunk);
+
+		kfree(chunk);
+	}
+
+	kfree(icm);
+}
+
+static int hns_roce_map_icm(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_icm_table *table, unsigned long obj)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	spinlock_t *lock = &hr_dev->bt_cmd_lock;
+	unsigned long end = 0;
+	unsigned long flags;
+	struct hns_roce_icm_iter iter;
+	void __iomem *bt_cmd;
+	u32 bt_cmd_h_val = 0;
+	u32 bt_cmd_val[2];
+	u32 bt_cmd_l = 0;
+	u64 bt_ba = 0;
+	int ret = 0;
+
+	/* Find the icm entry */
+	unsigned long i = (obj & (table->num_obj - 1)) /
+			  (HNS_ROCE_TABLE_CHUNK_SIZE / table->obj_size);
+
+	switch (table->type) {
+	case ICM_TYPE_QPC:
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, ICM_TYPE_QPC);
+		break;
+	case ICM_TYPE_MTPT:
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S,
+			       ICM_TYPE_MTPT);
+		break;
+	case ICM_TYPE_CQC:
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, ICM_TYPE_CQC);
+		break;
+	case ICM_TYPE_SRQC:
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S,
+			       ICM_TYPE_SRQC);
+		break;
+	default:
+		return ret;
+	}
+	roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M,
+		       ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S, obj);
+	roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 0);
+	roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S, 1);
+
+	/* Currently iterate over only one chunk */
+	for (hns_roce_icm_first(table->icm[i], &iter);
+	     !hns_roce_icm_last(&iter); hns_roce_icm_next(&iter)) {
+		bt_ba = hns_roce_icm_addr(&iter) >> DMA_ADDR_T_SHIFT;
+
+		spin_lock_irqsave(lock, flags);
+
+		bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG;
+
+		end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies;
+		while (1) {
+			if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) {
+				if (!(time_before(jiffies, end))) {
+					dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
+					spin_unlock_irqrestore(lock, flags);
+					ret = -EBUSY;
+					return ret;
+				}
+			} else {
+				break;
+			}
+			msleep(20);
+		}
+
+		bt_cmd_l = (u32)bt_ba;
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S,
+			       bt_ba >> BT_BA_SHIFT);
+
+		bt_cmd_val[0] = bt_cmd_l;
+		bt_cmd_val[1] = bt_cmd_h_val;
+		hns_roce_write64_k(bt_cmd_val,
+				   hr_dev->reg_base + ROCEE_BT_CMD_L_REG);
+		spin_unlock_irqrestore(lock, flags);
+	}
+
+	return ret;
+}
+
+static int hns_roce_unmap_icm(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_icm_table *table,
+			      unsigned long obj)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	unsigned long end = 0;
+	unsigned long flags;
+	void __iomem *bt_cmd;
+	uint32_t bt_cmd_val[2];
+	u32 bt_cmd_h_val = 0;
+	int ret = 0;
+
+	switch (table->type) {
+	case ICM_TYPE_QPC:
+		dev_dbg(dev, "UNMAP QPC BT  :\n");
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, ICM_TYPE_QPC);
+		break;
+	case ICM_TYPE_MTPT:
+		dev_dbg(dev, "UNMAP MTPT BT :\n");
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S,
+			       ICM_TYPE_MTPT);
+		break;
+	case ICM_TYPE_CQC:
+		dev_dbg(dev, "UNMAP CQC BT  :\n");
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, ICM_TYPE_CQC);
+		break;
+	case ICM_TYPE_SRQC:
+		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S,
+			       ICM_TYPE_SRQC);
+		break;
+	default:
+		return ret;
+	}
+	roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M,
+		       ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S, obj);
+	roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 1);
+	roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S, 1);
+	roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M,
+		       ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S, 0);
+
+	spin_lock_irqsave(&hr_dev->bt_cmd_lock, flags);
+
+	bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG;
+
+	end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies;
+	while (1) {
+		if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) {
+			if (!(time_before(jiffies, end))) {
+				dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
+				spin_unlock_irqrestore(&hr_dev->bt_cmd_lock,
+						       flags);
+				return -EBUSY;
+			}
+		} else {
+			break;
+		}
+		msleep(20);
+	}
+
+	bt_cmd_val[0] = 0;
+	bt_cmd_val[1] = bt_cmd_h_val;
+	hns_roce_write64_k(bt_cmd_val, hr_dev->reg_base + ROCEE_BT_CMD_L_REG);
+	spin_unlock_irqrestore(&hr_dev->bt_cmd_lock, flags);
+
+	return ret;
+}
+
+int hns_roce_init_icm_table(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_icm_table *table, u32 type,
+			    unsigned long obj_size, unsigned long nobj,
+			    int reserved, int use_lowmem, int use_coherent)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	u32 chunk_size;
+	unsigned long obj_per_chunk;
+	unsigned long num_icm;
+	unsigned long i;
+
+	obj_per_chunk = HNS_ROCE_TABLE_CHUNK_SIZE / obj_size;
+	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+	table->icm = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
+	if (!table->icm)
+		return -ENOMEM;
+
+	table->type = type;
+	table->num_icm = num_icm;
+	table->num_obj = nobj;
+	table->obj_size = obj_size;
+	table->lowmem = use_lowmem;
+	table->coherent = use_coherent;
+	mutex_init(&table->mutex);
+
+	for (i = 0; i * HNS_ROCE_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		chunk_size = HNS_ROCE_TABLE_CHUNK_SIZE;
+		if ((i + 1) * HNS_ROCE_TABLE_CHUNK_SIZE > nobj * obj_size)
+			chunk_size = PAGE_ALIGN(nobj * obj_size -
+						i * HNS_ROCE_TABLE_CHUNK_SIZE);
+
+		table->icm[i] = hns_roce_alloc_icm(hr_dev,
+				chunk_size >> PAGE_SHIFT, (use_lowmem ?
+				GFP_KERNEL : GFP_HIGHUSER) | __GFP_NOWARN,
+				use_coherent);
+		if (!table->icm[i])
+			goto error_failed_alloc_icm;
+
+		if (hns_roce_map_icm(hr_dev, table,
+			i * HNS_ROCE_TABLE_CHUNK_SIZE / obj_size)) {
+			dev_err(dev, "map icm table failed.\n");
+			goto error_failed_alloc_icm;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return 0;
+
+error_failed_alloc_icm:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			if (hns_roce_unmap_icm(hr_dev, table,
+				i * HNS_ROCE_TABLE_CHUNK_SIZE / obj_size))
+				dev_err(dev, "unmap icm table failed.\n");
+
+			hns_roce_free_icm(hr_dev, table->icm[i], use_coherent);
+		}
+
+	kfree(table->icm);
+
+	return -ENOMEM;
+}
+
+void hns_roce_cleanup_icm_table(struct hns_roce_dev *hr_dev,
+				struct hns_roce_icm_table *table)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	unsigned long i;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			if (hns_roce_unmap_icm(hr_dev, table,
+			    i * HNS_ROCE_TABLE_CHUNK_SIZE / table->obj_size))
+				dev_err(dev, "unmap icm table failed.\n");
+
+			hns_roce_free_icm(hr_dev, table->icm[i],
+					  table->coherent);
+		}
+
+	kfree(table->icm);
+}
+
+void hns_roce_cleanup_icm(struct hns_roce_dev *hr_dev)
+{
+	hns_roce_cleanup_icm_table(hr_dev, &hr_dev->cq_table.table);
+	hns_roce_cleanup_icm_table(hr_dev, &hr_dev->qp_table.irrl_table);
+	hns_roce_cleanup_icm_table(hr_dev, &hr_dev->qp_table.qp_table);
+	hns_roce_cleanup_icm_table(hr_dev, &hr_dev->mr_table.mtpt_table);
+	hns_roce_cleanup_icm_table(hr_dev, &hr_dev->mr_table.mtt_table);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_icm.h b/drivers/infiniband/hw/hns/hns_roce_icm.h
new file mode 100644
index 0000000..719b64e
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_icm.h
@@ -0,0 +1,119 @@ 
+/*
+ * Copyright (c) 2016 Hisilicon Limited.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _HNS_ROCE_ICM_H
+#define _HNS_ROCE_ICM_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+
+enum {
+	/* MAP ICM */
+	ICM_TYPE_QPC = 0,
+	ICM_TYPE_MTPT,
+	ICM_TYPE_CQC,
+	ICM_TYPE_SRQC,
+
+	 /* UNMAP ICM */
+	ICM_TYPE_MTT,
+	ICM_TYPE_IRRL,
+};
+
+#define HNS_ROCE_ICM_CHUNK_LEN	\
+	 ((256 - sizeof(struct list_head) - 2 * sizeof(int)) /	 \
+	 (sizeof(struct scatterlist)))
+
+struct hns_roce_icm_chunk {
+	struct list_head	 list;
+	int			 npages;
+	int			 nsg;
+	struct scatterlist	 mem[HNS_ROCE_ICM_CHUNK_LEN];
+};
+
+struct hns_roce_icm {
+	struct list_head	 chunk_list;
+	int			 refcount;
+};
+
+struct hns_roce_icm_iter {
+	struct hns_roce_icm		 *icm;
+	struct hns_roce_icm_chunk	 *chunk;
+	int				 page_idx;
+};
+
+void hns_roce_free_icm(struct hns_roce_dev *hr_dev,
+		       struct hns_roce_icm *icm, int coherent);
+int hns_roce_init_icm_table(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_icm_table *table, u32 type,
+			    unsigned long obj_size, unsigned long nobj,
+			    int reserved, int use_lowmem, int use_coherent);
+void hns_roce_cleanup_icm_table(struct hns_roce_dev *hr_dev,
+				struct hns_roce_icm_table *table);
+void hns_roce_cleanup_icm(struct hns_roce_dev *hr_dev);
+
+static inline void hns_roce_icm_first(struct hns_roce_icm *icm,
+				      struct hns_roce_icm_iter *iter)
+{
+	iter->icm = icm;
+	iter->chunk = list_empty(&icm->chunk_list) ? NULL :
+				 list_entry(icm->chunk_list.next,
+					    struct hns_roce_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+static inline int hns_roce_icm_last(struct hns_roce_icm_iter *iter)
+{
+	return !iter->chunk;
+}
+
+static inline void hns_roce_icm_next(struct hns_roce_icm_iter *iter)
+{
+	if (++iter->page_idx >= iter->chunk->nsg) {
+		if (iter->chunk->list.next == &iter->icm->chunk_list) {
+			iter->chunk = NULL;
+			return;
+		}
+
+		iter->chunk = list_entry(iter->chunk->list.next,
+					 struct hns_roce_icm_chunk, list);
+		iter->page_idx = 0;
+	}
+}
+
+static inline dma_addr_t hns_roce_icm_addr(struct hns_roce_icm_iter *iter)
+{
+	return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+#endif /*_HNS_ROCE_ICM_H*/
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 0b9cee7..3928ebb 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -59,6 +59,7 @@ 
 #include <rdma/ib_verbs.h>
 #include "hns_roce_common.h"
 #include "hns_roce_device.h"
+#include "hns_roce_icm.h"
 
 static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev)
 {
@@ -131,6 +132,77 @@  static void hns_roce_profile_init(struct hns_roce_dev *hr_dev)
 	hr_dev->hw->hw_profile(hr_dev);
 }
 
+static int hns_roce_init_icm(struct hns_roce_dev *hr_dev)
+{
+	int ret;
+	struct device *dev = &hr_dev->pdev->dev;
+
+	ret = hns_roce_init_icm_table(hr_dev,
+				      (void *)&hr_dev->mr_table.mtt_table,
+				      ICM_TYPE_MTT, hr_dev->caps.mtt_entry_sz,
+				      hr_dev->caps.num_mtt_segs, 0, 1, 0);
+	if (ret) {
+		dev_err(dev, "Failed to map MTT context memory, aborting.\n");
+		return ret;
+	}
+
+	ret = hns_roce_init_icm_table(hr_dev,
+				      (void *)&hr_dev->mr_table.mtpt_table,
+				      ICM_TYPE_MTPT, hr_dev->caps.mtpt_entry_sz,
+				      hr_dev->caps.num_mtpts, 0, 1, 1);
+	if (ret) {
+		dev_err(dev, "Failed to map dMPT context memory, aborting.\n");
+		goto err_unmap_mtt;
+	}
+
+	ret = hns_roce_init_icm_table(hr_dev,
+				      (void *)&hr_dev->qp_table.qp_table,
+				      ICM_TYPE_QPC, hr_dev->caps.qpc_entry_sz,
+				      hr_dev->caps.num_qps, 0, 1, 0);
+	if (ret) {
+		dev_err(dev, "Failed to map QP context memory, aborting.\n");
+		goto err_unmap_dmpt;
+	}
+
+	ret = hns_roce_init_icm_table(hr_dev,
+				      (void *)&hr_dev->qp_table.irrl_table,
+				      ICM_TYPE_IRRL,
+				      hr_dev->caps.irrl_entry_sz *
+				      hr_dev->caps.max_qp_init_rdma,
+				      hr_dev->caps.num_qps, 0, 1, 0);
+	if (ret) {
+		dev_err(dev, "Failed to map irrl_table memory, aborting.\n");
+		goto err_unmap_qp;
+	}
+
+	ret = hns_roce_init_icm_table(hr_dev,
+				      (void *)&hr_dev->cq_table.table,
+				      ICM_TYPE_CQC, hr_dev->caps.cqc_entry_sz,
+				      hr_dev->caps.num_cqs, 0, 1, 0);
+	if (ret) {
+		dev_err(dev, "Failed to map CQ context memory, aborting.\n");
+		goto err_unmap_irrl;
+	}
+
+	return 0;
+
+err_unmap_irrl:
+	hns_roce_cleanup_icm_table(hr_dev,
+				   (void *)&hr_dev->qp_table.irrl_table);
+
+err_unmap_qp:
+	hns_roce_cleanup_icm_table(hr_dev, (void *)&hr_dev->qp_table.qp_table);
+
+err_unmap_dmpt:
+	hns_roce_cleanup_icm_table(hr_dev,
+				   (void *)&hr_dev->mr_table.mtpt_table);
+
+err_unmap_mtt:
+	hns_roce_cleanup_icm_table(hr_dev, (void *)&hr_dev->mr_table.mtt_table);
+
+	return ret;
+}
+
 /**
 * hns_roce_probe - RoCE driver entrance
 * @pdev: pointer to platform device
@@ -197,6 +269,16 @@  static int hns_roce_probe(struct platform_device *pdev)
 		}
 	}
 
+	ret = hns_roce_init_icm(hr_dev);
+	if (ret) {
+		dev_err(dev, "init icm fail!\n");
+		goto error_failed_init_icm;
+	}
+
+error_failed_init_icm:
+	if (hr_dev->cmd_mod)
+		hns_roce_cmd_use_polling(hr_dev);
+
 error_failed_use_event:
 	hns_roce_cleanup_eq_table(hr_dev);
 
@@ -222,6 +304,8 @@  static int hns_roce_remove(struct platform_device *pdev)
 {
 	struct hns_roce_dev *hr_dev = platform_get_drvdata(pdev);
 
+	hns_roce_cleanup_icm(hr_dev);
+
 	if (hr_dev->cmd_mod)
 		hns_roce_cmd_use_polling(hr_dev);