
[RFC,3/8] nvmet: Use p2pmem in nvme target

Message ID 1490911959-5146-4-git-send-email-logang@deltatee.com
State Not Applicable

Commit Message

Logan Gunthorpe March 30, 2017, 10:12 p.m. UTC
We create a configfs attribute in each nvme-fabrics target port to
enable p2p memory use. When enabled, the port will use p2p memory
only if a p2p memory device can be found that is behind the same
switch as both the RDMA port and all the block devices in use. If
the user enables it and no such device is found, the system will
silently fall back on using regular memory.

If appropriate, that port will allocate memory for the RDMA buffers
for queues from the p2pmem device, falling back to system memory should
anything fail.

Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would
save an extra PCI transfer as the NVME card could just take the data
out of its own memory. However, at this time, cards with CMB buffers
don't seem to be available.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/nvme/target/configfs.c | 31 +++++++++++++++
 drivers/nvme/target/nvmet.h    |  1 +
 drivers/nvme/target/rdma.c     | 90 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 114 insertions(+), 8 deletions(-)

Comments

Sagi Grimberg April 4, 2017, 10:40 a.m. UTC | #1
Hey Logan,

> We create a configfs attribute in each nvme-fabrics target port to
> enable p2p memory use. When enabled, the port will use p2p memory
> only if a p2p memory device can be found that is behind the same
> switch as both the RDMA port and all the block devices in use. If
> the user enables it and no such device is found, the system will
> silently fall back on using regular memory.

What should we do if we have more than a single device that satisfies
this? I'd say that it would be better to have the user ask for a
specific device and fail it if it doesn't meet the above conditions...

> If appropriate, that port will allocate memory for the RDMA buffers
> for queues from the p2pmem device, falling back to system memory should
> anything fail.

That's good :)

> Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would
> save an extra PCI transfer as the NVME card could just take the data
> out of its own memory. However, at this time, cards with CMB buffers
> don't seem to be available.

Even if it was available, it would be hard to make real use of this
given that we wouldn't know how to pre-post recv buffers (for in-capsule
data). But let's leave this out of the scope entirely...

> diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
> index ecc4fe8..7fd4840 100644
> --- a/drivers/nvme/target/rdma.c
> +++ b/drivers/nvme/target/rdma.c
> @@ -23,6 +23,7 @@
>  #include <linux/string.h>
>  #include <linux/wait.h>
>  #include <linux/inet.h>
> +#include <linux/p2pmem.h>
>  #include <asm/unaligned.h>
>
>  #include <rdma/ib_verbs.h>
> @@ -64,6 +65,7 @@ struct nvmet_rdma_rsp {
>  	struct rdma_rw_ctx	rw;
>
>  	struct nvmet_req	req;
> +	struct p2pmem_dev       *p2pmem;

Why do you need this? you have a reference to the
queue itself.

> @@ -107,6 +109,8 @@ struct nvmet_rdma_queue {
>  	int			send_queue_size;
>
>  	struct list_head	queue_list;
> +
> +	struct p2pmem_dev	*p2pmem;
>  };
>
>  struct nvmet_rdma_device {
> @@ -185,7 +189,8 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
>  	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
>  }
>
> -static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
> +static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents,
> +				struct p2pmem_dev *p2pmem)
>  {
>  	struct scatterlist *sg;
>  	int count;
> @@ -193,13 +198,17 @@ static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
>  	if (!sgl || !nents)
>  		return;
>
> -	for_each_sg(sgl, sg, nents, count)
> -		__free_page(sg_page(sg));
> +	for_each_sg(sgl, sg, nents, count) {
> +		if (p2pmem)
> +			p2pmem_free_page(p2pmem, sg_page(sg));
> +		else
> +			__free_page(sg_page(sg));
> +	}
>  	kfree(sgl);
>  }
>
>  static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
> -		u32 length)
> +		u32 length, struct p2pmem_dev *p2pmem)
>  {
>  	struct scatterlist *sg;
>  	struct page *page;
> @@ -216,7 +225,11 @@ static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
>  	while (length) {
>  		u32 page_len = min_t(u32, length, PAGE_SIZE);
>
> -		page = alloc_page(GFP_KERNEL);
> +		if (p2pmem)
> +			page = p2pmem_alloc_page(p2pmem);
> +		else
> +			page = alloc_page(GFP_KERNEL);
> +
>  		if (!page)
>  			goto out_free_pages;
>
> @@ -231,7 +244,10 @@ static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
>  out_free_pages:
>  	while (i > 0) {
>  		i--;
> -		__free_page(sg_page(&sg[i]));
> +		if (p2pmem)
> +			p2pmem_free_page(p2pmem, sg_page(&sg[i]));
> +		else
> +			__free_page(sg_page(&sg[i]));
>  	}
>  	kfree(sg);
>  out:
> @@ -484,7 +500,8 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
>  	}
>
>  	if (rsp->req.sg != &rsp->cmd->inline_sg)
> -		nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);
> +		nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt,
> +				    rsp->p2pmem);
>
>  	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
>  		nvmet_rdma_process_wr_wait_list(queue);
> @@ -625,8 +642,16 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
>  	if (!len)
>  		return 0;
>
> +	rsp->p2pmem = rsp->queue->p2pmem;
>  	status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
> -			len);
> +			len, rsp->p2pmem);
> +
> +	if (status && rsp->p2pmem) {
> +		rsp->p2pmem = NULL;
> +		status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
> +					      len, rsp->p2pmem);
> +	}
> +

Not sure it's good practice to rely on rsp->p2pmem not being NULL...
Would be nice if the allocation routines could hide it from us...
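
E.g. something along these lines (just a sketch against the helpers in
this patch; the function name is invented here):

static struct page *nvmet_rdma_alloc_data_page(struct nvmet_rdma_queue *queue,
					       struct p2pmem_dev **used)
{
	struct page *page;

	if (queue->p2pmem) {
		page = p2pmem_alloc_page(queue->p2pmem);
		if (page) {
			*used = queue->p2pmem;
			return page;
		}
	}

	/* silently fall back to system memory */
	*used = NULL;
	return alloc_page(GFP_KERNEL);
}

That way the retry dance disappears from the call site and the caller
only learns which pool was actually used.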

>  	if (status)
>  		return status;
>
> @@ -984,6 +1009,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
>  				!queue->host_qid);
>  	}
>  	nvmet_rdma_free_rsps(queue);
> +	p2pmem_put(queue->p2pmem);

What does this pair with? p2pmem_find_compat()?

>  	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
>  	kfree(queue);
>  }
> @@ -1179,6 +1205,52 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
>  	return ret;
>  }
>
> +/*
> + * If allow_p2pmem is set, we will try to use P2P memory for our
> + * sgl lists. This requires the p2pmem device to be compatible with
> + * the backing device for every namespace this device will support.
> + * If not, we fall back on using system memory.
> + */
> +static void nvmet_rdma_queue_setup_p2pmem(struct nvmet_rdma_queue *queue)
> +{
> +	struct device **dma_devs;
> +	struct nvmet_ns *ns;
> +	int ndevs = 1;
> +	int i = 0;
> +	struct nvmet_subsys_link *s;
> +
> +	if (!queue->port->allow_p2pmem)
> +		return;
> +
> +	list_for_each_entry(s, &queue->port->subsystems, entry) {
> +		list_for_each_entry_rcu(ns, &s->subsys->namespaces, dev_link) {
> +			ndevs++;
> +		}
> +	}

This code has no business in nvmet-rdma. Why not keep nr_ns in
nvmet_subsys in the first place?
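
Something like this (hypothetical field, invented here):

	/* hypothetical addition to struct nvmet_subsys in nvmet.h */
	unsigned int		nr_namespaces;	/* updated in nvmet_ns_enable()/
						 * nvmet_ns_disable() under
						 * subsys->lock */

Then a transport could size its array without walking the namespace
lists itself.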

> +
> +	dma_devs = kmalloc((ndevs + 1) * sizeof(*dma_devs), GFP_KERNEL);
> +	if (!dma_devs)
> +		return;
> +
> +	dma_devs[i++] = &queue->dev->device->dev;
> +
> +	list_for_each_entry(s, &queue->port->subsystems, entry) {
> +		list_for_each_entry_rcu(ns, &s->subsys->namespaces, dev_link) {
> +			dma_devs[i++] = disk_to_dev(ns->bdev->bd_disk);
> +		}
> +	}
> +
> +	dma_devs[i++] = NULL;
> +
> +	queue->p2pmem = p2pmem_find_compat(dma_devs);

This is a problem. Namespaces can be added at any point in time. No one
guarantees that dma_devs are all the namespaces we'll ever see.

> +
> +	if (queue->p2pmem)
> +		pr_debug("using %s for rdma nvme target queue",
> +			 dev_name(&queue->p2pmem->dev));
> +
> +	kfree(dma_devs);
> +}
> +
>  static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
>  		struct rdma_cm_event *event)
>  {
> @@ -1199,6 +1271,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
>  	}
>  	queue->port = cm_id->context;
>
> +	nvmet_rdma_queue_setup_p2pmem(queue);
> +

Why is all this done for each queue? looks completely redundant to me.

>  	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
>  	if (ret)
>  		goto release_queue;

You seemed to skip the in-capsule buffers for p2pmem (inline_page), I'm
curious why?
Logan Gunthorpe April 4, 2017, 4:16 p.m. UTC | #2
On 04/04/17 04:40 AM, Sagi Grimberg wrote:
> Hey Logan,
> 
>> We create a configfs attribute in each nvme-fabrics target port to
>> enable p2p memory use. When enabled, the port will use p2p memory
>> only if a p2p memory device can be found that is behind the same
>> switch as both the RDMA port and all the block devices in use. If
>> the user enables it and no such device is found, the system will
>> silently fall back on using regular memory.
> 
> What should we do if we have more than a single device that satisfies
> this? I'd say that it would be better to have the user ask for a
> specific device and fail it if it doesn't meet the above conditions...

I hadn't done this yet but I think a simple closest device in the tree
would solve the issue sufficiently. However, I originally had it so the
user has to pick the device and I prefer that approach. But if the user
picks the device, then why bother restricting what he picks? Per the
thread with Sinan, I'd prefer to use what the user picks. You were one
of the biggest opponents to that so I'd like to hear your opinion on
removing the restrictions.
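
For what it's worth, "closest" could be something as simple as counting
upstream bridges between each client and the candidate p2pmem device and
picking the smallest total. Purely illustrative, not part of this series:

/*
 * Rough notion of distance: hops from @client up to the first PCI
 * bridge that is also an ancestor of @provider. Smaller is closer;
 * -1 means the two devices don't share an upstream path at all.
 */
static int p2p_upstream_distance(struct pci_dev *provider,
				 struct pci_dev *client)
{
	struct pci_dev *c, *p;
	int dist = 0;

	for (c = client; c; c = pci_upstream_bridge(c), dist++)
		for (p = pci_upstream_bridge(provider); p;
		     p = pci_upstream_bridge(p))
			if (p == c)
				return dist;

	return -1;
}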

>> Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would
>> save an extra PCI transfer as the NVME card could just take the data
>> out of its own memory. However, at this time, cards with CMB buffers
>> don't seem to be available.
> 
> Even if it was available, it would be hard to make real use of this
> given that we wouldn't know how to pre-post recv buffers (for in-capsule
> data). But let's leave this out of the scope entirely...

I don't understand what you're referring to. We'd simply use the CMB
buffer as a p2pmem device, why does that change anything?

> Why do you need this? you have a reference to the
> queue itself.

This keeps track of whether the response was actually allocated with
p2pmem or not. It's needed when we free the SGL: the queue may have a
p2pmem device assigned to it, but if the alloc failed and we fell back
on system memory, we need to know how to free the pages. I'm currently
looking at having SGLs carry an iomem flag, in which case this would no
longer be needed as the flag in the SGL could be used.


>> +    rsp->p2pmem = rsp->queue->p2pmem;
>>      status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
>> -            len);
>> +            len, rsp->p2pmem);
>> +
>> +    if (status && rsp->p2pmem) {
>> +        rsp->p2pmem = NULL;
>> +        status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
>> +                          len, rsp->p2pmem);
>> +    }
>> +
> 
> Not sure it's good practice to rely on rsp->p2pmem not being NULL...
> Would be nice if the allocation routines could hide it from us...

I'm not sure what the reasoning is behind your NULL comment.

Yes, I'm currently considering pushing an alloc/free sgl into the p2pmem
code.
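
Roughly along these lines (hypothetical API, names invented here):

int p2pmem_alloc_sgl(struct p2pmem_dev *p, struct scatterlist **sgl,
		     unsigned int *nents, u32 length)
{
	unsigned int n = DIV_ROUND_UP(length, PAGE_SIZE);
	struct scatterlist *sg;
	int i;

	sg = kmalloc_array(n, sizeof(*sg), GFP_KERNEL);
	if (!sg)
		return -ENOMEM;
	sg_init_table(sg, n);

	for (i = 0; i < n; i++) {
		u32 len = min_t(u32, length, PAGE_SIZE);
		struct page *page;

		/* use system memory when no p2pmem device was given */
		if (p)
			page = p2pmem_alloc_page(p);
		else
			page = alloc_page(GFP_KERNEL);
		if (!page)
			goto out_free;

		sg_set_page(&sg[i], page, len, 0);
		length -= len;
	}

	*sgl = sg;
	*nents = n;
	return 0;

out_free:
	while (--i >= 0) {
		if (p)
			p2pmem_free_page(p, sg_page(&sg[i]));
		else
			__free_page(sg_page(&sg[i]));
	}
	kfree(sg);
	return -ENOMEM;
}

with a matching p2pmem_free_sgl(). Then all the if (p2pmem) special
casing would live in one place instead of in nvmet-rdma.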

>>      if (status)
>>          return status;
>>
>> @@ -984,6 +1009,7 @@ static void nvmet_rdma_free_queue(struct
>> nvmet_rdma_queue *queue)
>>                  !queue->host_qid);
>>      }
>>      nvmet_rdma_free_rsps(queue);
>> +    p2pmem_put(queue->p2pmem);
> 
> What does this pair with? p2pmem_find_compat()?

Yes, that's correct.


>> +static void nvmet_rdma_queue_setup_p2pmem(struct nvmet_rdma_queue
>> *queue)
>> +{
>> +    struct device **dma_devs;
>> +    struct nvmet_ns *ns;
>> +    int ndevs = 1;
>> +    int i = 0;
>> +    struct nvmet_subsys_link *s;
>> +
>> +    if (!queue->port->allow_p2pmem)
>> +        return;
>> +
>> +    list_for_each_entry(s, &queue->port->subsystems, entry) {
>> +        list_for_each_entry_rcu(ns, &s->subsys->namespaces, dev_link) {
>> +            ndevs++;
>> +        }
>> +    }
> 
> This code has no business in nvmet-rdma. Why not keep nr_ns in
> nvmet_subsys in the first place?

That makes sense.

>> +
>> +    dma_devs = kmalloc((ndevs + 1) * sizeof(*dma_devs), GFP_KERNEL);
>> +    if (!dma_devs)
>> +        return;
>> +
>> +    dma_devs[i++] = &queue->dev->device->dev;
>> +
>> +    list_for_each_entry(s, &queue->port->subsystems, entry) {
>> +        list_for_each_entry_rcu(ns, &s->subsys->namespaces, dev_link) {
>> +            dma_devs[i++] = disk_to_dev(ns->bdev->bd_disk);
>> +        }
>> +    }
>> +
>> +    dma_devs[i++] = NULL;
>> +
>> +    queue->p2pmem = p2pmem_find_compat(dma_devs);
> 
> This is a problem. Namespaces can be added at any point in time. No one
> guarantees that dma_devs are all the namespaces we'll ever see.

Yeah, well restricting p2pmem based on all the devices in use is hard.
So we'd need a call into the transport every time an ns is added, and
we'd have to drop the p2pmem if they add one that isn't supported. This
complexity is just one of the reasons I prefer just letting the user choose.
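
(If we did go that way, I imagine it'd be something like a new pair of
ops in nvmet_fabrics_ops -- purely hypothetical names:

	/* called whenever a namespace is enabled/disabled on a subsystem
	 * reachable through this port, so the transport can re-evaluate
	 * or drop its p2pmem choice */
	void (*add_namespace)(struct nvmet_port *port, struct nvmet_ns *ns);
	void (*remove_namespace)(struct nvmet_port *port, struct nvmet_ns *ns);

which is doable, but it's exactly the kind of machinery I was hoping to
avoid.)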

>> +
>> +    if (queue->p2pmem)
>> +        pr_debug("using %s for rdma nvme target queue",
>> +             dev_name(&queue->p2pmem->dev));
>> +
>> +    kfree(dma_devs);
>> +}
>> +
>>  static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
>>          struct rdma_cm_event *event)
>>  {
>> @@ -1199,6 +1271,8 @@ static int nvmet_rdma_queue_connect(struct
>> rdma_cm_id *cm_id,
>>      }
>>      queue->port = cm_id->context;
>>
>> +    nvmet_rdma_queue_setup_p2pmem(queue);
>> +
> 
> Why is all this done for each queue? looks completely redundant to me.

A little bit. Where would you put it?

>>      ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
>>      if (ret)
>>          goto release_queue;
> 
> You seemed to skip the in-capsule buffers for p2pmem (inline_page), I'm
> curious why?

Yes, the thinking was that these transfers were small anyway so there
would not be significant benefit to pushing them through p2pmem. There's
really no reason why we couldn't do that if it made sense to though.

Logan
Sagi Grimberg April 6, 2017, 5:47 a.m. UTC | #3
> I hadn't done this yet but I think a simple closest device in the tree
> would solve the issue sufficiently. However, I originally had it so the
> user has to pick the device and I prefer that approach. But if the user
> picks the device, then why bother restricting what he picks?

Because the user can get it wrong, and it's our job to do what we can in
order to prevent the user from screwing themselves.

> Per the
> thread with Sinan, I'd prefer to use what the user picks. You were one
> of the biggest opponents to that so I'd like to hear your opinion on
> removing the restrictions.

I wasn't against it that much, I'm all for making things "just work"
with minimal configuration steps, but I'm not sure we can get it
right without it.

>>> Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would
>>> save an extra PCI transfer as the NVME card could just take the data
>>> out of its own memory. However, at this time, cards with CMB buffers
>>> don't seem to be available.
>>
>> Even if it was available, it would be hard to make real use of this
>> given that we wouldn't know how to pre-post recv buffers (for in-capsule
>> data). But let's leave this out of the scope entirely...
>
> I don't understand what you're referring to. We'd simply use the CMB
> buffer as a p2pmem device, why does that change anything?

I'm referring to the in-capsule data buffer pre-posts that we do.
Because we prepare a buffer that would contain in-capsule data, we have
no knowledge of which device the incoming I/O is directed to, which
means we can (and will) have I/O where the data lies in the CMB of
device A but is really targeted at device B - which sorta defeats the
purpose of what we're trying to optimize here...

>> Why do you need this? you have a reference to the
>> queue itself.
>
> This keeps track of whether the response was actually allocated with
> p2pmem or not. It's needed when we free the SGL: the queue may have a
> p2pmem device assigned to it, but if the alloc failed and we fell back
> on system memory, we need to know how to free the pages. I'm currently
> looking at having SGLs carry an iomem flag, in which case this would no
> longer be needed as the flag in the SGL could be used.

That would be better, maybe...

[...]

>> This is a problem. Namespaces can be added at any point in time. No one
>> guarantees that dma_devs are all the namespaces we'll ever see.
>
> Yeah, well restricting p2pmem based on all the devices in use is hard.
> So we'd need a call into the transport every time an ns is added, and
> we'd have to drop the p2pmem if they add one that isn't supported. This
> complexity is just one of the reasons I prefer just letting the user choose.

Still, the user can get it wrong. Not sure we can get away without
keeping track of this as new devices join the subsystem.

>>> +
>>> +    if (queue->p2pmem)
>>> +        pr_debug("using %s for rdma nvme target queue",
>>> +             dev_name(&queue->p2pmem->dev));
>>> +
>>> +    kfree(dma_devs);
>>> +}
>>> +
>>>  static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
>>>          struct rdma_cm_event *event)
>>>  {
>>> @@ -1199,6 +1271,8 @@ static int nvmet_rdma_queue_connect(struct
>>> rdma_cm_id *cm_id,
>>>      }
>>>      queue->port = cm_id->context;
>>>
>>> +    nvmet_rdma_queue_setup_p2pmem(queue);
>>> +
>>
>> Why is all this done for each queue? looks completely redundant to me.
>
> A little bit. Where would you put it?

I think we'll need a representation of a controller in nvmet-rdma for
that. We sort of got away without it so far, but I don't think we can
anymore with this.
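
Something like this, maybe (very rough, struct invented here):

/* would own the p2pmem reference once per controller instead of
 * re-running p2pmem_find_compat() on every queue connect */
struct nvmet_rdma_ctrl {
	struct nvmet_rdma_device	*dev;
	struct nvmet_port		*port;
	struct p2pmem_dev		*p2pmem;
	struct list_head		queue_list;
	struct kref			ref;
};

and the queues would just take a reference on their controller and use
ctrl->p2pmem.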

>>>      ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
>>>      if (ret)
>>>          goto release_queue;
>>
>> You seemed to skip the in-capsule buffers for p2pmem (inline_page), I'm
>> curious why?
>
> Yes, the thinking was that these transfers were small anyway so there
> would not be significant benefit to pushing them through p2pmem. There's
> really no reason why we couldn't do that if it made sense to though.

I don't see an urgent reason for it too. I was just curious...
Logan Gunthorpe April 6, 2017, 3:52 p.m. UTC | #4
Hey Sagi,

On 05/04/17 11:47 PM, Sagi Grimberg wrote:
> Because the user can get it wrong, and it's our job to do what we can in
> order to prevent the user from screwing themselves.

Well, "screwing" themselves seems a bit strong. It wouldn't be much
different from a lot of other tunables in the system. For example, it
would be similar to the user choosing the wrong io scheduler for their
disk or workload. If you change this setting without measuring
performance you probably don't care too much about the result anyway.

> I wasn't against it that much, I'm all for making things "just work"
> with minimal configuration steps, but I'm not sure we can get it
> right without it.

Ok, well in that case I may reconsider this in the next series.

>>>> Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would
>>>> save an extra PCI transfer as the NVME card could just take the data
>>>> out of its own memory. However, at this time, cards with CMB buffers
>>>> don't seem to be available.
>>>
>>> Even if it was available, it would be hard to make real use of this
>>> given that we wouldn't know how to pre-post recv buffers (for in-capsule
>>> data). But let's leave this out of the scope entirely...
>>
>> I don't understand what you're referring to. We'd simply use the CMB
>> buffer as a p2pmem device, why does that change anything?
> 
> I'm referring to the in-capsule data buffer pre-posts that we do.
> Because we prepare a buffer that would contain in-capsule data, we have
> no knowledge of which device the incoming I/O is directed to, which
> means we can (and will) have I/O where the data lies in the CMB of
> device A but is really targeted at device B - which sorta defeats the
> purpose of what we're trying to optimize here...

Well, the way I've had it is that each port gets one p2pmem device, so
you'd only want to put NVMe devices that will work with that p2pmem
device behind that port. Though I can see that being a difficult
restriction, seeing as it probably means you'll need one port per
NVMe device if you want to use the CMB buffer of each device. I'll have
to think about that some. Also, it's worth noting that we aren't even
optimizing in-capsule data at this time.


> Still, the user can get it wrong. Not sure we can get away without
> keeping track of this as new devices join the subsystem.

Yeah, I understand. I'll have to think some more about all of this. I'm
starting to see some ways to improve things.

Thanks,

Logan

Patch

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index be8c800..e61a7f4 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -777,12 +777,43 @@  static void nvmet_port_release(struct config_item *item)
 	kfree(port);
 }
 
+#ifdef CONFIG_P2PMEM
+static ssize_t nvmet_allow_p2pmem_show(struct config_item *item, char *page)
+{
+	return sprintf(page, "%d\n", to_nvmet_port(item)->allow_p2pmem);
+}
+
+static ssize_t nvmet_allow_p2pmem_store(struct config_item *item,
+					const char *page, size_t count)
+{
+	struct nvmet_port *port = to_nvmet_port(item);
+	bool allow;
+	int ret;
+
+	ret = strtobool(page, &allow);
+	if (ret)
+		return ret;
+
+	down_write(&nvmet_config_sem);
+	port->allow_p2pmem = allow;
+	up_write(&nvmet_config_sem);
+
+	return count;
+}
+CONFIGFS_ATTR(nvmet_, allow_p2pmem);
+#endif
+
 static struct configfs_attribute *nvmet_port_attrs[] = {
 	&nvmet_attr_addr_adrfam,
 	&nvmet_attr_addr_treq,
 	&nvmet_attr_addr_traddr,
 	&nvmet_attr_addr_trsvcid,
 	&nvmet_attr_addr_trtype,
+
+	#ifdef CONFIG_P2PMEM
+	&nvmet_attr_allow_p2pmem,
+	#endif
+
 	NULL,
 };
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index f7ff15f..ab67175 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -95,6 +95,7 @@  struct nvmet_port {
 	struct list_head		referrals;
 	void				*priv;
 	bool				enabled;
+	bool				allow_p2pmem;
 };
 
 static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index ecc4fe8..7fd4840 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -23,6 +23,7 @@ 
 #include <linux/string.h>
 #include <linux/wait.h>
 #include <linux/inet.h>
+#include <linux/p2pmem.h>
 #include <asm/unaligned.h>
 
 #include <rdma/ib_verbs.h>
@@ -64,6 +65,7 @@  struct nvmet_rdma_rsp {
 	struct rdma_rw_ctx	rw;
 
 	struct nvmet_req	req;
+	struct p2pmem_dev       *p2pmem;
 
 	u8			n_rdma;
 	u32			flags;
@@ -107,6 +109,8 @@  struct nvmet_rdma_queue {
 	int			send_queue_size;
 
 	struct list_head	queue_list;
+
+	struct p2pmem_dev	*p2pmem;
 };
 
 struct nvmet_rdma_device {
@@ -185,7 +189,8 @@  nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
 	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
 }
 
-static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
+static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents,
+				struct p2pmem_dev *p2pmem)
 {
 	struct scatterlist *sg;
 	int count;
@@ -193,13 +198,17 @@  static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
 	if (!sgl || !nents)
 		return;
 
-	for_each_sg(sgl, sg, nents, count)
-		__free_page(sg_page(sg));
+	for_each_sg(sgl, sg, nents, count) {
+		if (p2pmem)
+			p2pmem_free_page(p2pmem, sg_page(sg));
+		else
+			__free_page(sg_page(sg));
+	}
 	kfree(sgl);
 }
 
 static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
-		u32 length)
+		u32 length, struct p2pmem_dev *p2pmem)
 {
 	struct scatterlist *sg;
 	struct page *page;
@@ -216,7 +225,11 @@  static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
 	while (length) {
 		u32 page_len = min_t(u32, length, PAGE_SIZE);
 
-		page = alloc_page(GFP_KERNEL);
+		if (p2pmem)
+			page = p2pmem_alloc_page(p2pmem);
+		else
+			page = alloc_page(GFP_KERNEL);
+
 		if (!page)
 			goto out_free_pages;
 
@@ -231,7 +244,10 @@  static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
 out_free_pages:
 	while (i > 0) {
 		i--;
-		__free_page(sg_page(&sg[i]));
+		if (p2pmem)
+			p2pmem_free_page(p2pmem, sg_page(&sg[i]));
+		else
+			__free_page(sg_page(&sg[i]));
 	}
 	kfree(sg);
 out:
@@ -484,7 +500,8 @@  static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
 	}
 
 	if (rsp->req.sg != &rsp->cmd->inline_sg)
-		nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);
+		nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt,
+				    rsp->p2pmem);
 
 	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
 		nvmet_rdma_process_wr_wait_list(queue);
@@ -625,8 +642,16 @@  static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
 	if (!len)
 		return 0;
 
+	rsp->p2pmem = rsp->queue->p2pmem;
 	status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
-			len);
+			len, rsp->p2pmem);
+
+	if (status && rsp->p2pmem) {
+		rsp->p2pmem = NULL;
+		status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
+					      len, rsp->p2pmem);
+	}
+
 	if (status)
 		return status;
 
@@ -984,6 +1009,7 @@  static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
 				!queue->host_qid);
 	}
 	nvmet_rdma_free_rsps(queue);
+	p2pmem_put(queue->p2pmem);
 	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
 	kfree(queue);
 }
@@ -1179,6 +1205,52 @@  static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
 	return ret;
 }
 
+/*
+ * If allow_p2pmem is set, we will try to use P2P memory for our
+ * sgl lists. This requires the p2pmem device to be compatible with
+ * the backing device for every namespace this device will support.
+ * If not, we fall back on using system memory.
+ */
+static void nvmet_rdma_queue_setup_p2pmem(struct nvmet_rdma_queue *queue)
+{
+	struct device **dma_devs;
+	struct nvmet_ns *ns;
+	int ndevs = 1;
+	int i = 0;
+	struct nvmet_subsys_link *s;
+
+	if (!queue->port->allow_p2pmem)
+		return;
+
+	list_for_each_entry(s, &queue->port->subsystems, entry) {
+		list_for_each_entry_rcu(ns, &s->subsys->namespaces, dev_link) {
+			ndevs++;
+		}
+	}
+
+	dma_devs = kmalloc((ndevs + 1) * sizeof(*dma_devs), GFP_KERNEL);
+	if (!dma_devs)
+		return;
+
+	dma_devs[i++] = &queue->dev->device->dev;
+
+	list_for_each_entry(s, &queue->port->subsystems, entry) {
+		list_for_each_entry_rcu(ns, &s->subsys->namespaces, dev_link) {
+			dma_devs[i++] = disk_to_dev(ns->bdev->bd_disk);
+		}
+	}
+
+	dma_devs[i++] = NULL;
+
+	queue->p2pmem = p2pmem_find_compat(dma_devs);
+
+	if (queue->p2pmem)
+		pr_debug("using %s for rdma nvme target queue",
+			 dev_name(&queue->p2pmem->dev));
+
+	kfree(dma_devs);
+}
+
 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 		struct rdma_cm_event *event)
 {
@@ -1199,6 +1271,8 @@  static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 	}
 	queue->port = cm_id->context;
 
+	nvmet_rdma_queue_setup_p2pmem(queue);
+
 	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
 	if (ret)
 		goto release_queue;