
[net-next,v3,2/2] enic: use netdev_dma_alloc

Message ID 1426009384-11544-3-git-send-email-_govind@gmx.com
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Govindarajulu Varadarajan March 10, 2015, 5:43 p.m. UTC
This patch uses the dma cache skb allocator for rx buffers.

netdev_dma_head is initialized per rq. All calls to netdev_dma_alloc_skb() and
netdev_dma_frag_unmap() happen in napi_poll and are serialized.

Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
---
 drivers/net/ethernet/cisco/enic/enic_main.c | 31 +++++++++--------------------
 drivers/net/ethernet/cisco/enic/vnic_rq.c   |  3 +++
 drivers/net/ethernet/cisco/enic/vnic_rq.h   |  3 +++
 3 files changed, 15 insertions(+), 22 deletions(-)

Comments

Alexander Duyck March 10, 2015, 8:14 p.m. UTC | #1
On 03/10/2015 10:43 AM, Govindarajulu Varadarajan wrote:
> This patch uses the dma cache skb allocator for rx buffers.
>
> netdev_dma_head is initialized per rq. All calls to netdev_dma_alloc_skb() and
> netdev_dma_frag_unmap() happen in napi_poll and are serialized.
>
> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>

This isn't going to work. The problem is that, the way you are using your 
fragments, you can end up with memory corruption: frame headers that were 
updated by the stack may be reverted for any frames received before the 
last frame was unmapped.  I ran into that issue when I was doing page reuse 
with build_skb on the Intel drivers, and I suspect you will see the same issue.

The way to work around it is to receive the data into the fragments, and 
then pull the headers out and store them in a separate skb via something 
similar to copy-break.  You can then track the fragments in the skb's frags list.
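
Something along these lines is what I have in mind -- purely an untested
sketch, not enic code: ENIC_RX_HDR_LEN and the truesize value are made up,
and it assumes the buffer was already dma_sync'ed for the CPU and that the
caller owns a reference on the (still mapped) page described by
(page, offset, len):

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/skbuff.h>
#include <linux/string.h>

#define ENIC_RX_HDR_LEN	128	/* illustrative header pull size */

static struct sk_buff *rx_frag_to_skb(struct net_device *netdev,
				      struct page *page, unsigned int offset,
				      unsigned int len)
{
	void *data = page_address(page) + offset;
	unsigned int hlen = min_t(unsigned int, ENIC_RX_HDR_LEN, len);
	struct sk_buff *skb;

	/* Headers go into a small private skb, so the stack can rewrite
	 * them without ever touching the still-mapped page. */
	skb = netdev_alloc_skb_ip_align(netdev, hlen);
	if (!skb)
		return NULL;	/* caller keeps its page reference */
	memcpy(skb_put(skb, hlen), data, hlen);

	if (len > hlen)
		/* Payload stays in the shared page as a paged fragment;
		 * the caller's page reference is handed over to the frag.
		 * truesize is simplified here. */
		skb_add_rx_frag(skb, 0, page, offset + hlen, len - hlen,
				len - hlen);
	else
		/* Whole frame fit in the header pull; drop the reference
		 * that would have gone to the frag. */
		put_page(page);

	return skb;
}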

> ---
>   drivers/net/ethernet/cisco/enic/enic_main.c | 31 +++++++++--------------------
>   drivers/net/ethernet/cisco/enic/vnic_rq.c   |  3 +++
>   drivers/net/ethernet/cisco/enic/vnic_rq.h   |  3 +++
>   3 files changed, 15 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
> index 204bd182..3be5bc12 100644
> --- a/drivers/net/ethernet/cisco/enic/enic_main.c
> +++ b/drivers/net/ethernet/cisco/enic/enic_main.c
> @@ -952,13 +952,9 @@ nla_put_failure:
>   
>   static void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf)
>   {
> -	struct enic *enic = vnic_dev_priv(rq->vdev);
> -
>   	if (!buf->os_buf)
>   		return;
> -
> -	pci_unmap_single(enic->pdev, buf->dma_addr,
> -		buf->len, PCI_DMA_FROMDEVICE);
> +	netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
>   	dev_kfree_skb_any(buf->os_buf);
>   	buf->os_buf = NULL;
>   }
> @@ -979,17 +975,10 @@ static int enic_rq_alloc_buf(struct vnic_rq *rq)
>   
>   		return 0;
>   	}
> -	skb = netdev_alloc_skb_ip_align(netdev, len);
> +	skb = netdev_dma_alloc_skb(&rq->nc_head, &buf->nc, &dma_addr, len);
>   	if (!skb)
>   		return -ENOMEM;
>   
> -	dma_addr = pci_map_single(enic->pdev, skb->data, len,
> -				  PCI_DMA_FROMDEVICE);
> -	if (unlikely(enic_dma_map_check(enic, dma_addr))) {
> -		dev_kfree_skb(skb);
> -		return -ENOMEM;
> -	}
> -
>   	enic_queue_rq_desc(rq, skb, os_buf_index,
>   		dma_addr, len);
>   

I'm curious why you are still using skbs as your data type for receiving 
frames before they come in.  Why not just store a pointer to your dma 
buffer and hold off on allocating the sk_buff until you have actually 
received the frame in the buffer?  It would save you something like 256B 
per frame if you just hold off on the allocation until the skb is really 
needed.
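
For instance (the names below are illustrative, not the enic or netdev_dma
structures): the ring entry would only remember the page and its DMA mapping,
and the sk_buff would be built in the completion path, e.g. via the
rx_frag_to_skb() sketched above.

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative ring entry: no sk_buff exists until the frame arrives. */
struct rx_desc_buf {
	struct page	*page;
	unsigned int	offset;
	dma_addr_t	dma_addr;
	unsigned int	len;
};

static int rx_post_buf(struct device *dev, struct rx_desc_buf *buf,
		       unsigned int len)
{
	buf->page = alloc_page(GFP_ATOMIC);
	if (!buf->page)
		return -ENOMEM;
	buf->offset = 0;
	buf->len = len;

	buf->dma_addr = dma_map_page(dev, buf->page, buf->offset, len,
				     DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, buf->dma_addr)) {
		put_page(buf->page);
		return -ENOMEM;
	}

	/* buf->dma_addr and len would be written into the rq descriptor
	 * here; the ~256B sk_buff cost is deferred until completion. */
	return 0;
}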

> @@ -1016,8 +1005,6 @@ static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb,
>   	new_skb = netdev_alloc_skb_ip_align(netdev, len);
>   	if (!new_skb)
>   		return false;
> -	pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr, len,
> -				    DMA_FROM_DEVICE);
>   	memcpy(new_skb->data, (*skb)->data, len);
>   	*skb = new_skb;
>   
> @@ -1065,8 +1052,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq,
>   				enic->rq_truncated_pkts++;
>   		}
>   
> -		pci_unmap_single(enic->pdev, buf->dma_addr, buf->len,
> -				 PCI_DMA_FROMDEVICE);
> +		netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
>   		dev_kfree_skb_any(skb);
>   		buf->os_buf = NULL;
>   
> @@ -1078,10 +1064,11 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq,
>   		/* Good receive
>   		 */
>   
> +		pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr,
> +					    bytes_written, DMA_FROM_DEVICE);
>   		if (!enic_rxcopybreak(netdev, &skb, buf, bytes_written)) {
>   			buf->os_buf = NULL;
> -			pci_unmap_single(enic->pdev, buf->dma_addr, buf->len,
> -					 PCI_DMA_FROMDEVICE);
> +			netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
>   		}
>   		prefetch(skb->data - NET_IP_ALIGN);
>   

It looks like you already have copy-break code in your codepath.  It 
might be worth taking a look at what you would gain by deferring the skb 
allocation and using the copy-break code path to take care of small 
frames and headers for larger frames.
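
Roughly, the completion-path dispatch could then look like this --
RX_COPYBREAK is a hypothetical threshold, rx_frag_to_skb() is the sketch
above, and page refcounting is glossed over:

#define RX_COPYBREAK	256	/* hypothetical copy-break threshold */

static struct sk_buff *rx_to_skb(struct net_device *netdev,
				 struct page *page, unsigned int offset,
				 unsigned int bytes_written)
{
	if (bytes_written <= RX_COPYBREAK) {
		/* Small frame: copy it whole into a private skb so the
		 * page slot can be reused immediately. */
		struct sk_buff *skb;

		skb = netdev_alloc_skb_ip_align(netdev, bytes_written);
		if (skb)
			memcpy(skb_put(skb, bytes_written),
			       page_address(page) + offset, bytes_written);
		return skb;
	}

	/* Large frame: private skb for the headers only, payload left in
	 * the shared page as a frag (see rx_frag_to_skb() above). */
	return rx_frag_to_skb(netdev, page, offset, bytes_written);
}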

> @@ -1122,9 +1109,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq,
>   
>   		/* Buffer overflow
>   		 */
> -
> -		pci_unmap_single(enic->pdev, buf->dma_addr, buf->len,
> -				 PCI_DMA_FROMDEVICE);
> +		netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
>   		dev_kfree_skb_any(skb);
>   		buf->os_buf = NULL;
>   	}
> @@ -1648,6 +1633,8 @@ static int enic_open(struct net_device *netdev)
>   	}
>   
>   	for (i = 0; i < enic->rq_count; i++) {
> +		netdev_dma_init(&enic->rq[i].nc_head, &enic->pdev->dev,
> +				GFP_ATOMIC);
>   		vnic_rq_fill(&enic->rq[i], enic_rq_alloc_buf);
>   		/* Need at least one buffer on ring to get going */
>   		if (vnic_rq_desc_used(&enic->rq[i]) == 0) {
> diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c
> index 36a2ed6..afa1d71 100644
> --- a/drivers/net/ethernet/cisco/enic/vnic_rq.c
> +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c
> @@ -23,6 +23,7 @@
>   #include <linux/pci.h>
>   #include <linux/delay.h>
>   #include <linux/slab.h>
> +#include <linux/skbuff.h>
>   
>   #include "vnic_dev.h"
>   #include "vnic_rq.h"
> @@ -199,6 +200,8 @@ void vnic_rq_clean(struct vnic_rq *rq,
>   		rq->ring.desc_avail++;
>   	}
>   
> +	netdev_dma_destroy(&rq->nc_head);
> +
>   	/* Use current fetch_index as the ring starting point */
>   	fetch_index = ioread32(&rq->ctrl->fetch_index);
>   
> diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.h b/drivers/net/ethernet/cisco/enic/vnic_rq.h
> index 8111d52..d4ee963 100644
> --- a/drivers/net/ethernet/cisco/enic/vnic_rq.h
> +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.h
> @@ -21,6 +21,7 @@
>   #define _VNIC_RQ_H_
>   
>   #include <linux/pci.h>
> +#include <linux/skbuff.h>
>   
>   #include "vnic_dev.h"
>   #include "vnic_cq.h"
> @@ -73,6 +74,7 @@ struct vnic_rq_buf {
>   	unsigned int index;
>   	void *desc;
>   	uint64_t wr_id;
> +	struct netdev_dma_node *nc;
>   };
>   
>   struct vnic_rq {
> @@ -100,6 +102,7 @@ struct vnic_rq {
>   	unsigned int bpoll_state;
>   	spinlock_t bpoll_lock;
>   #endif /* CONFIG_NET_RX_BUSY_POLL */
> +	struct netdev_dma_head nc_head;
>   };
>   
>   static inline unsigned int vnic_rq_desc_avail(struct vnic_rq *rq)

Govindarajulu Varadarajan March 11, 2015, 9:27 a.m. UTC | #2
On Tue, 10 Mar 2015, Alexander Duyck wrote:

>
> On 03/10/2015 10:43 AM, Govindarajulu Varadarajan wrote:
>> This patch uses the dma cache skb allocator for rx buffers.
>> 
>> netdev_dma_head is initialized per rq. All calls to netdev_dma_alloc_skb() 
>> and
>> netdev_dma_frag_unmap() happens in napi_poll and they are serialized.
>> 
>> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
>
> This isn't going to work. The problem is the way you are using your fragments 
> you can end up with a memory corruption as the frame headers that were 
> updated by the stack may be reverted for any frames received before the last 
> frame was unmapped.  I ran into that issue when I was doing page reuse with 
> build_skb on the Intel drivers and I suspect you will see the same issue.
>

Is this behaviour platform-dependent? I tested this patch for more than a month
and did not face any issue. I ran normal traffic like ssh, nfs and iperf/netperf.
Is there a special scenario in which this could occur?

Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this?
Each desc should have a different DMA address to write to. Can you explain how
this can happen?

> The way to work around it is to receive the data in to the fragments, and 
> then pull the headers out and store them in a separate skb via something 
> similar to copy-break.  You can then track the fragments in frags.
>

If I split the pkt header into another frame, is it guaranteed that the stack
will not modify the pkt data?

Thanks a lot for reviewing this patch.
Alexander H Duyck March 11, 2015, 2 p.m. UTC | #3
On 03/11/2015 02:27 AM, Govindarajulu Varadarajan wrote:
>
> On Tue, 10 Mar 2015, Alexander Duyck wrote:
>
>>
>> On 03/10/2015 10:43 AM, Govindarajulu Varadarajan wrote:
>>> This patch uses the dma cache skb allocator for rx buffers.
>>>
>>> netdev_dma_head is initialized per rq. All calls to
>>> netdev_dma_alloc_skb() and
>>> netdev_dma_frag_unmap() happens in napi_poll and they are serialized.
>>>
>>> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
>>
>> This isn't going to work. The problem is the way you are using your
>> fragments you can end up with a memory corruption as the frame
>> headers that were updated by the stack may be reverted for any frames
>> received before the last frame was unmapped.  I ran into that issue
>> when I was doing page reuse with build_skb on the Intel drivers and I
>> suspect you will see the same issue.
>>
>
> Is this behaviour platform dependent? I tested this patch for more
> than a month
> and I did not face any issue. I ran normal traffic like ssh, nfs and
> iperf/netperf.
> Is there a special scenario when this could occur?

Yes, it depends on the platform and IOMMU used.  For an example, take a
look at the SWIOTLB implementation.  I always assumed that if I could work with
that when it is doing bounce buffers, I could work with any IOMMU or platform.

>
> Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this?
> Each desc should have different dma address to write to. Can you
> explain me how
> this can happen?

No, that won't help.  The issue is that while the page is mapped you
should not be updating any fields in the page until it is unmapped.
Since you have multiple buffers mapped within a single page, you should be
waiting until the entire page is unmapped.

>
>> The way to work around it is to receive the data in to the fragments,
>> and then pull the headers out and store them in a separate skb via
>> something similar to copy-break.  You can then track the fragments in
>> frags.
>>
>
> If I split the pkt header into another frame, is it guaranteed that
> stack will
> not modify the pkt data?

Paged fragments in the frags list use the page reference count to determine
whether the page can be updated.  The problem is that you cannot use a shared
page as skb->head if you plan to do any DMA mapping with it, as that can cause
issues if you change any of the fields before it is unmapped.
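
To illustrate the safe side of that distinction, a hypothetical helper (not
enic code; truesize is simplified) that shares a still-mapped page only as a
read-only paged fragment:

#include <linux/skbuff.h>

/* Safe sharing: the stack only reads paged frags and releases them via
 * the page refcount.  Using the same memory as skb->head (e.g. through
 * build_skb()) is not safe while the page is DMA-mapped, because the
 * stack will rewrite headers and skb_shared_info there. */
static void attach_shared_page(struct sk_buff *skb, struct page *page,
			       unsigned int offset, unsigned int len)
{
	get_page(page);		/* the frag takes its own reference */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
			len /* truesize: simplified */);
}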

>
> Thanks a lot for reviewing this patch.

No problem.  Just glad I saw this before you had to go through reverting
your stuff like I did.

- Alex
David Laight March 11, 2015, 5:34 p.m. UTC | #4
From: Alexander Duyck
...
> > Is this behaviour platform dependent? I tested this patch for more
> > than a month
> > and I did not face any issue. I ran normal traffic like ssh, nfs and
> > iperf/netperf.
> > Is there a special scenario when this could occur?
> 
> Yes it depends on the platform and IOMMU used.  For an example take a
> look at the SWIOTLB implementation.  I always assumed if I can work with
> that when it is doing bounce buffers I can work with any IOMMU or platform.
> 
> >
> > Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this?
> > Each desc should have different dma address to write to. Can you
> > explain me how
> > this can happen?
> 
> No that won't help.  The issue is that when the page is mapped you
> should not be updating any fields in the page until it is unmapped.
> Since you have multiple buffers mapped to a single page you should be
> waiting until the entire page is unmapped.

Isn't the 'unit of memory for dma sync' a cache line, not a page?

You certainly need to test on systems without cache-coherent I/O.

	David

Alexander H Duyck March 11, 2015, 5:51 p.m. UTC | #5
On 03/11/2015 10:34 AM, David Laight wrote:
> From: Alexander Duyck
> ...
>>> Is this behaviour platform dependent? I tested this patch for more
>>> than a month
>>> and I did not face any issue. I ran normal traffic like ssh, nfs and
>>> iperf/netperf.
>>> Is there a special scenario when this could occur?
>> Yes it depends on the platform and IOMMU used.  For an example take a
>> look at the SWIOTLB implementation.  I always assumed if I can work with
>> that when it is doing bounce buffers I can work with any IOMMU or platform.
>>
>>> Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this?
>>> Each desc should have different dma address to write to. Can you
>>> explain me how
>>> this can happen?
>> No that won't help.  The issue is that when the page is mapped you
>> should not be updating any fields in the page until it is unmapped.
>> Since you have multiple buffers mapped to a single page you should be
>> waiting until the entire page is unmapped.
> Isn't the 'unit of memory for dma sync' a cache line, not a page?

Yes, but the problem is that the entire page is mapped and unmapped, and that
triggers a synchronization over the entire page, not just the most recent
buffer within the page that was used.

The problem is that the API maps an order-3 page and then uses chunks of
it for receive buffers, but the last buffer unmaps the entire page, which
could invalidate any CPU-side accesses to the page while it was still
mapped.

In order to make it workable, it would have to be mapped bidirectional, and
on the last unmap everything that isn't the last buffer would have to be
synced for device before the page is unmapped.  That would likely be more
expensive than just avoiding all of this by identifying the page as being
shared and cloning the header out of the page frag.
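
To spell out that "workable but expensive" variant, here is a rough,
untested sketch.  None of this is enic code or the netdev_dma API from the
patch -- struct dma_block and BLOCK_ORDER are made up -- and it assumes
buffers are handed out of the block sequentially:

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define BLOCK_ORDER	3
#define BLOCK_SIZE	(PAGE_SIZE << BLOCK_ORDER)

struct dma_block {
	struct page	*page;
	dma_addr_t	dma;
	unsigned int	used;	/* bytes handed out so far */
};

static int dma_block_map(struct device *dev, struct dma_block *b)
{
	b->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, BLOCK_ORDER);
	if (!b->page)
		return -ENOMEM;

	/* Bidirectional, because the CPU will rewrite headers in parts of
	 * the block while other parts are still owned by the device. */
	b->dma = dma_map_page(dev, b->page, 0, BLOCK_SIZE,
			      DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, b->dma)) {
		__free_pages(b->page, BLOCK_ORDER);
		return -ENOMEM;
	}
	b->used = 0;
	return 0;
}

static void dma_block_unmap(struct device *dev, struct dma_block *b,
			    unsigned int last_buf_off)
{
	/* Everything before the last buffer may have been modified by the
	 * CPU; sync it back to the device copy so a bounce-buffering
	 * implementation (e.g. swiotlb) does not overwrite those updates
	 * when the whole page is unmapped below. */
	if (last_buf_off)
		dma_sync_single_for_device(dev, b->dma, last_buf_off,
					   DMA_BIDIRECTIONAL);

	dma_unmap_page(dev, b->dma, BLOCK_SIZE, DMA_BIDIRECTIONAL);
	put_page(b->page);
}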

> You certainly need to test on systems without cache coherent io.
>
> 	David

I agree.

- Alex

Patch

diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 204bd182..3be5bc12 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -952,13 +952,9 @@  nla_put_failure:
 
 static void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf)
 {
-	struct enic *enic = vnic_dev_priv(rq->vdev);
-
 	if (!buf->os_buf)
 		return;
-
-	pci_unmap_single(enic->pdev, buf->dma_addr,
-		buf->len, PCI_DMA_FROMDEVICE);
+	netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
 	dev_kfree_skb_any(buf->os_buf);
 	buf->os_buf = NULL;
 }
@@ -979,17 +975,10 @@  static int enic_rq_alloc_buf(struct vnic_rq *rq)
 
 		return 0;
 	}
-	skb = netdev_alloc_skb_ip_align(netdev, len);
+	skb = netdev_dma_alloc_skb(&rq->nc_head, &buf->nc, &dma_addr, len);
 	if (!skb)
 		return -ENOMEM;
 
-	dma_addr = pci_map_single(enic->pdev, skb->data, len,
-				  PCI_DMA_FROMDEVICE);
-	if (unlikely(enic_dma_map_check(enic, dma_addr))) {
-		dev_kfree_skb(skb);
-		return -ENOMEM;
-	}
-
 	enic_queue_rq_desc(rq, skb, os_buf_index,
 		dma_addr, len);
 
@@ -1016,8 +1005,6 @@  static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb,
 	new_skb = netdev_alloc_skb_ip_align(netdev, len);
 	if (!new_skb)
 		return false;
-	pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr, len,
-				    DMA_FROM_DEVICE);
 	memcpy(new_skb->data, (*skb)->data, len);
 	*skb = new_skb;
 
@@ -1065,8 +1052,7 @@  static void enic_rq_indicate_buf(struct vnic_rq *rq,
 				enic->rq_truncated_pkts++;
 		}
 
-		pci_unmap_single(enic->pdev, buf->dma_addr, buf->len,
-				 PCI_DMA_FROMDEVICE);
+		netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
 		dev_kfree_skb_any(skb);
 		buf->os_buf = NULL;
 
@@ -1078,10 +1064,11 @@  static void enic_rq_indicate_buf(struct vnic_rq *rq,
 		/* Good receive
 		 */
 
+		pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr,
+					    bytes_written, DMA_FROM_DEVICE);
 		if (!enic_rxcopybreak(netdev, &skb, buf, bytes_written)) {
 			buf->os_buf = NULL;
-			pci_unmap_single(enic->pdev, buf->dma_addr, buf->len,
-					 PCI_DMA_FROMDEVICE);
+			netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
 		}
 		prefetch(skb->data - NET_IP_ALIGN);
 
@@ -1122,9 +1109,7 @@  static void enic_rq_indicate_buf(struct vnic_rq *rq,
 
 		/* Buffer overflow
 		 */
-
-		pci_unmap_single(enic->pdev, buf->dma_addr, buf->len,
-				 PCI_DMA_FROMDEVICE);
+		netdev_dma_frag_unmap(&rq->nc_head, buf->nc);
 		dev_kfree_skb_any(skb);
 		buf->os_buf = NULL;
 	}
@@ -1648,6 +1633,8 @@  static int enic_open(struct net_device *netdev)
 	}
 
 	for (i = 0; i < enic->rq_count; i++) {
+		netdev_dma_init(&enic->rq[i].nc_head, &enic->pdev->dev,
+				GFP_ATOMIC);
 		vnic_rq_fill(&enic->rq[i], enic_rq_alloc_buf);
 		/* Need at least one buffer on ring to get going */
 		if (vnic_rq_desc_used(&enic->rq[i]) == 0) {
diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c
index 36a2ed6..afa1d71 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_rq.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c
@@ -23,6 +23,7 @@ 
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/skbuff.h>
 
 #include "vnic_dev.h"
 #include "vnic_rq.h"
@@ -199,6 +200,8 @@  void vnic_rq_clean(struct vnic_rq *rq,
 		rq->ring.desc_avail++;
 	}
 
+	netdev_dma_destroy(&rq->nc_head);
+
 	/* Use current fetch_index as the ring starting point */
 	fetch_index = ioread32(&rq->ctrl->fetch_index);
 
diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.h b/drivers/net/ethernet/cisco/enic/vnic_rq.h
index 8111d52..d4ee963 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_rq.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_rq.h
@@ -21,6 +21,7 @@ 
 #define _VNIC_RQ_H_
 
 #include <linux/pci.h>
+#include <linux/skbuff.h>
 
 #include "vnic_dev.h"
 #include "vnic_cq.h"
@@ -73,6 +74,7 @@  struct vnic_rq_buf {
 	unsigned int index;
 	void *desc;
 	uint64_t wr_id;
+	struct netdev_dma_node *nc;
 };
 
 struct vnic_rq {
@@ -100,6 +102,7 @@  struct vnic_rq {
 	unsigned int bpoll_state;
 	spinlock_t bpoll_lock;
 #endif /* CONFIG_NET_RX_BUSY_POLL */
+	struct netdev_dma_head nc_head;
 };
 
 static inline unsigned int vnic_rq_desc_avail(struct vnic_rq *rq)