Message ID | 1426009384-11544-3-git-send-email-_govind@gmx.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On 03/10/2015 10:43 AM, Govindarajulu Varadarajan wrote: > This patches uses dma cache skb allocator fot rx buffers. > > netdev_dma_head is initialized per rq. All calls to netdev_dma_alloc_skb() and > netdev_dma_frag_unmap() happens in napi_poll and they are serialized. > > Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> This isn't going to work. The problem is the way you are using your fragments you can end up with a memory corruption as the frame headers that were updated by the stack may be reverted for any frames received before the last frame was unmapped. I ran into that issue when I was doing page reuse with build_skb on the Intel drivers and I suspect you will see the same issue. The way to work around it is to receive the data in to the fragments, and then pull the headers out and store them in a separate skb via something similar to copy-break. You can then track the fragments in frags. > --- > drivers/net/ethernet/cisco/enic/enic_main.c | 31 +++++++++-------------------- > drivers/net/ethernet/cisco/enic/vnic_rq.c | 3 +++ > drivers/net/ethernet/cisco/enic/vnic_rq.h | 3 +++ > 3 files changed, 15 insertions(+), 22 deletions(-) > > diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c > index 204bd182..3be5bc12 100644 > --- a/drivers/net/ethernet/cisco/enic/enic_main.c > +++ b/drivers/net/ethernet/cisco/enic/enic_main.c > @@ -952,13 +952,9 @@ nla_put_failure: > > static void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf) > { > - struct enic *enic = vnic_dev_priv(rq->vdev); > - > if (!buf->os_buf) > return; > - > - pci_unmap_single(enic->pdev, buf->dma_addr, > - buf->len, PCI_DMA_FROMDEVICE); > + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); > dev_kfree_skb_any(buf->os_buf); > buf->os_buf = NULL; > } > @@ -979,17 +975,10 @@ static int enic_rq_alloc_buf(struct vnic_rq *rq) > > return 0; > } > - skb = netdev_alloc_skb_ip_align(netdev, len); > + skb = netdev_dma_alloc_skb(&rq->nc_head, 
&buf->nc, &dma_addr, len); > if (!skb) > return -ENOMEM; > > - dma_addr = pci_map_single(enic->pdev, skb->data, len, > - PCI_DMA_FROMDEVICE); > - if (unlikely(enic_dma_map_check(enic, dma_addr))) { > - dev_kfree_skb(skb); > - return -ENOMEM; > - } > - > enic_queue_rq_desc(rq, skb, os_buf_index, > dma_addr, len); > I'm curious why you are still using skbs as your data type for receiving frames before they come in. Why not just store a pointer to your dma buffer and hold off on allocating the sk_buff until you have actually received the frame in the buffer? It would save you something like 256B per frame if you just hold off on the allocation until the skb is really needed. > @@ -1016,8 +1005,6 @@ static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb, > new_skb = netdev_alloc_skb_ip_align(netdev, len); > if (!new_skb) > return false; > - pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr, len, > - DMA_FROM_DEVICE); > memcpy(new_skb->data, (*skb)->data, len); > *skb = new_skb; > > @@ -1065,8 +1052,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, > enic->rq_truncated_pkts++; > } > > - pci_unmap_single(enic->pdev, buf->dma_addr, buf->len, > - PCI_DMA_FROMDEVICE); > + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); > dev_kfree_skb_any(skb); > buf->os_buf = NULL; > > @@ -1078,10 +1064,11 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, > /* Good receive > */ > > + pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr, > + bytes_written, DMA_FROM_DEVICE); > if (!enic_rxcopybreak(netdev, &skb, buf, bytes_written)) { > buf->os_buf = NULL; > - pci_unmap_single(enic->pdev, buf->dma_addr, buf->len, > - PCI_DMA_FROMDEVICE); > + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); > } > prefetch(skb->data - NET_IP_ALIGN); > It looks like you already have copy-break code in your codepath. 
It might be worth taking a look at what you would gain by deferring the skb allocation and using the copy-break code path to take care of small frames and headers for larger frames. > @@ -1122,9 +1109,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, > > /* Buffer overflow > */ > - > - pci_unmap_single(enic->pdev, buf->dma_addr, buf->len, > - PCI_DMA_FROMDEVICE); > + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); > dev_kfree_skb_any(skb); > buf->os_buf = NULL; > } > @@ -1648,6 +1633,8 @@ static int enic_open(struct net_device *netdev) > } > > for (i = 0; i < enic->rq_count; i++) { > + netdev_dma_init(&enic->rq[i].nc_head, &enic->pdev->dev, > + GFP_ATOMIC); > vnic_rq_fill(&enic->rq[i], enic_rq_alloc_buf); > /* Need at least one buffer on ring to get going */ > if (vnic_rq_desc_used(&enic->rq[i]) == 0) { > diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c > index 36a2ed6..afa1d71 100644 > --- a/drivers/net/ethernet/cisco/enic/vnic_rq.c > +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c > @@ -23,6 +23,7 @@ > #include <linux/pci.h> > #include <linux/delay.h> > #include <linux/slab.h> > +#include <linux/skbuff.h> > > #include "vnic_dev.h" > #include "vnic_rq.h" > @@ -199,6 +200,8 @@ void vnic_rq_clean(struct vnic_rq *rq, > rq->ring.desc_avail++; > } > > + netdev_dma_destroy(&rq->nc_head); > + > /* Use current fetch_index as the ring starting point */ > fetch_index = ioread32(&rq->ctrl->fetch_index); > > diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.h b/drivers/net/ethernet/cisco/enic/vnic_rq.h > index 8111d52..d4ee963 100644 > --- a/drivers/net/ethernet/cisco/enic/vnic_rq.h > +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.h > @@ -21,6 +21,7 @@ > #define _VNIC_RQ_H_ > > #include <linux/pci.h> > +#include <linux/skbuff.h> > > #include "vnic_dev.h" > #include "vnic_cq.h" > @@ -73,6 +74,7 @@ struct vnic_rq_buf { > unsigned int index; > void *desc; > uint64_t wr_id; > + struct netdev_dma_node *nc; > }; > > struct 
vnic_rq { > @@ -100,6 +102,7 @@ struct vnic_rq { > unsigned int bpoll_state; > spinlock_t bpoll_lock; > #endif /* CONFIG_NET_RX_BUSY_POLL */ > + struct netdev_dma_head nc_head; > }; > > static inline unsigned int vnic_rq_desc_avail(struct vnic_rq *rq) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 10 Mar 2015, Alexander Duyck wrote: > > On 03/10/2015 10:43 AM, Govindarajulu Varadarajan wrote: >> This patches uses dma cache skb allocator fot rx buffers. >> >> netdev_dma_head is initialized per rq. All calls to netdev_dma_alloc_skb() >> and >> netdev_dma_frag_unmap() happens in napi_poll and they are serialized. >> >> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> > > This isn't going to work. The problem is the way you are using your fragments > you can end up with a memory corruption as the frame headers that were > updated by the stack may be reverted for any frames received before the last > frame was unmapped. I ran into that issue when I was doing page reuse with > build_skb on the Intel drivers and I suspect you will see the same issue. > Is this behaviour platform dependent? I tested this patch for more than a month and I did not face any issue. I ran normal traffic like ssh, nfs and iperf/netperf. Is there a special scenario when this could occur? Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this? Each desc should have different dma address to write to. Can you explain me how this can happen? > The way to work around it is to receive the data in to the fragments, and > then pull the headers out and store them in a separate skb via something > similar to copy-break. You can then track the fragments in frags. > If I split the pkt header into another frame, is it guaranteed that stack will not modify the pkt data? Thanks a lot for reviewing this patch. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 03/11/2015 02:27 AM, Govindarajulu Varadarajan wrote: > > On Tue, 10 Mar 2015, Alexander Duyck wrote: > >> >> On 03/10/2015 10:43 AM, Govindarajulu Varadarajan wrote: >>> This patches uses dma cache skb allocator fot rx buffers. >>> >>> netdev_dma_head is initialized per rq. All calls to >>> netdev_dma_alloc_skb() and >>> netdev_dma_frag_unmap() happens in napi_poll and they are serialized. >>> >>> Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> >> >> This isn't going to work. The problem is the way you are using your >> fragments you can end up with a memory corruption as the frame >> headers that were updated by the stack may be reverted for any frames >> received before the last frame was unmapped. I ran into that issue >> when I was doing page reuse with build_skb on the Intel drivers and I >> suspect you will see the same issue. >> > > Is this behaviour platform dependent? I tested this patch for more > than a month > and I did not face any issue. I ran normal traffic like ssh, nfs and > iperf/netperf. > Is there a special scenario when this could occur? Yes it depends on the platform and IOMMU used. For an example take a loot at the SWIOTLB implementation. I always assumed if I can work with that when it is doing bounce buffers I can work with any IOMMU or platform. > > Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this? > Each desc should have different dma address to write to. Can you > explain me how > this can happen? No that won't help. The issue is that when the page is mapped you should not be updating any fields in the page until it is unmapped. Since you have multiple buffers mapped to a single page you should be waiting until the entire page is unmapped. > >> The way to work around it is to receive the data in to the fragments, >> and then pull the headers out and store them in a separate skb via >> something similar to copy-break. You can then track the fragments in >> frags. 
>> > > If I split the pkt header into another frame, is it guaranteed that > stack will > not modify the pkt data? Paged fragments in the frags list use the page count to determine if they can update it. The problem is you cannot use a shared page as skb->head if you plan to do any DMA mapping with it as it can cause issues if you change any of the fields before it is unmapped. > > Thanks a lot for reviewing this patch. No problem. Just glad I saw this before you had to go though reverting your stuff like I did. - Alex -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Alexander Duyck ... > > Is this behaviour platform dependent? I tested this patch for more > > than a month > > and I did not face any issue. I ran normal traffic like ssh, nfs and > > iperf/netperf. > > Is there a special scenario when this could occur? > > Yes it depends on the platform and IOMMU used. For an example take a > loot at the SWIOTLB implementation. I always assumed if I can work with > that when it is doing bounce buffers I can work with any IOMMU or platform. > > > > > Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this? > > Each desc should have different dma address to write to. Can you > > explain me how > > this can happen? > > No that won't help. The issue is that when the page is mapped you > should not be updating any fields in the page until it is unmapped. > Since you have multiple buffers mapped to a single page you should be > waiting until the entire page is unmapped. Isn't the 'unit of memory for dma sync' a cache line, not a page? You certainly need to test on systems without cache coherent io. David -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 03/11/2015 10:34 AM, David Laight wrote: > From: Alexander Duyck > ... >>> Is this behaviour platform dependent? I tested this patch for more >>> than a month >>> and I did not face any issue. I ran normal traffic like ssh, nfs and >>> iperf/netperf. >>> Is there a special scenario when this could occur? >> Yes it depends on the platform and IOMMU used. For an example take a >> loot at the SWIOTLB implementation. I always assumed if I can work with >> that when it is doing bounce buffers I can work with any IOMMU or platform. >> >>> Will using DMA_BIDIRECTIONAL and sync_to_cpu & sync_to_device solve this? >>> Each desc should have different dma address to write to. Can you >>> explain me how >>> this can happen? >> No that won't help. The issue is that when the page is mapped you >> should not be updating any fields in the page until it is unmapped. >> Since you have multiple buffers mapped to a single page you should be >> waiting until the entire page is unmapped. > Isn't the 'unit of memory for dma sync' a cache line, not a page? Yes, but the problem is the entire page is mapped, and unmapped and that triggers a syncronization over the entire page, not just the most recent buffer within the page that was used. The problem is the API maps an order 3 page and then is using chunks of it for receive buffers, but then the last buffer unmaps the entire page which could invalidate any CPU side accesses to the page while it was still mapped. In order to make it workable it would have to be mapped bidirectional and on the last unmap everything that isn't the last buffer would have to be synced for device before the page is unmapped which would likely be more expensive than just avoiding all of this by identifying the page as being shared and cloning the header out of the page frag. > You certainly need to test on systems without cache coherent io. > > David I agree. 
- Alex -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 204bd182..3be5bc12 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -952,13 +952,9 @@ nla_put_failure: static void enic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf) { - struct enic *enic = vnic_dev_priv(rq->vdev); - if (!buf->os_buf) return; - - pci_unmap_single(enic->pdev, buf->dma_addr, - buf->len, PCI_DMA_FROMDEVICE); + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); dev_kfree_skb_any(buf->os_buf); buf->os_buf = NULL; } @@ -979,17 +975,10 @@ static int enic_rq_alloc_buf(struct vnic_rq *rq) return 0; } - skb = netdev_alloc_skb_ip_align(netdev, len); + skb = netdev_dma_alloc_skb(&rq->nc_head, &buf->nc, &dma_addr, len); if (!skb) return -ENOMEM; - dma_addr = pci_map_single(enic->pdev, skb->data, len, - PCI_DMA_FROMDEVICE); - if (unlikely(enic_dma_map_check(enic, dma_addr))) { - dev_kfree_skb(skb); - return -ENOMEM; - } - enic_queue_rq_desc(rq, skb, os_buf_index, dma_addr, len); @@ -1016,8 +1005,6 @@ static bool enic_rxcopybreak(struct net_device *netdev, struct sk_buff **skb, new_skb = netdev_alloc_skb_ip_align(netdev, len); if (!new_skb) return false; - pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr, len, - DMA_FROM_DEVICE); memcpy(new_skb->data, (*skb)->data, len); *skb = new_skb; @@ -1065,8 +1052,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, enic->rq_truncated_pkts++; } - pci_unmap_single(enic->pdev, buf->dma_addr, buf->len, - PCI_DMA_FROMDEVICE); + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); dev_kfree_skb_any(skb); buf->os_buf = NULL; @@ -1078,10 +1064,11 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, /* Good receive */ + pci_dma_sync_single_for_cpu(enic->pdev, buf->dma_addr, + bytes_written, DMA_FROM_DEVICE); if (!enic_rxcopybreak(netdev, &skb, buf, bytes_written)) { buf->os_buf = NULL; - pci_unmap_single(enic->pdev, buf->dma_addr, buf->len, - 
PCI_DMA_FROMDEVICE); + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); } prefetch(skb->data - NET_IP_ALIGN); @@ -1122,9 +1109,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, /* Buffer overflow */ - - pci_unmap_single(enic->pdev, buf->dma_addr, buf->len, - PCI_DMA_FROMDEVICE); + netdev_dma_frag_unmap(&rq->nc_head, buf->nc); dev_kfree_skb_any(skb); buf->os_buf = NULL; } @@ -1648,6 +1633,8 @@ static int enic_open(struct net_device *netdev) } for (i = 0; i < enic->rq_count; i++) { + netdev_dma_init(&enic->rq[i].nc_head, &enic->pdev->dev, + GFP_ATOMIC); vnic_rq_fill(&enic->rq[i], enic_rq_alloc_buf); /* Need at least one buffer on ring to get going */ if (vnic_rq_desc_used(&enic->rq[i]) == 0) { diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c index 36a2ed6..afa1d71 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_rq.c +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c @@ -23,6 +23,7 @@ #include <linux/pci.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/skbuff.h> #include "vnic_dev.h" #include "vnic_rq.h" @@ -199,6 +200,8 @@ void vnic_rq_clean(struct vnic_rq *rq, rq->ring.desc_avail++; } + netdev_dma_destroy(&rq->nc_head); + /* Use current fetch_index as the ring starting point */ fetch_index = ioread32(&rq->ctrl->fetch_index); diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.h b/drivers/net/ethernet/cisco/enic/vnic_rq.h index 8111d52..d4ee963 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_rq.h +++ b/drivers/net/ethernet/cisco/enic/vnic_rq.h @@ -21,6 +21,7 @@ #define _VNIC_RQ_H_ #include <linux/pci.h> +#include <linux/skbuff.h> #include "vnic_dev.h" #include "vnic_cq.h" @@ -73,6 +74,7 @@ struct vnic_rq_buf { unsigned int index; void *desc; uint64_t wr_id; + struct netdev_dma_node *nc; }; struct vnic_rq { @@ -100,6 +102,7 @@ struct vnic_rq { unsigned int bpoll_state; spinlock_t bpoll_lock; #endif /* CONFIG_NET_RX_BUSY_POLL */ + struct netdev_dma_head nc_head; }; static inline unsigned int 
vnic_rq_desc_avail(struct vnic_rq *rq)
This patch uses the DMA cache skb allocator for rx buffers. netdev_dma_head is initialized per rq. All calls to netdev_dma_alloc_skb() and netdev_dma_frag_unmap() happen in napi_poll and they are serialized. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> --- drivers/net/ethernet/cisco/enic/enic_main.c | 31 +++++++++-------------------- drivers/net/ethernet/cisco/enic/vnic_rq.c | 3 +++ drivers/net/ethernet/cisco/enic/vnic_rq.h | 3 +++ 3 files changed, 15 insertions(+), 22 deletions(-)