diff mbox

[U-Boot,8/9] net: rtl8169: Use non-cached memory if available

Message ID 1408348852-30894-9-git-send-email-thierry.reding@gmail.com
State Superseded
Delegated to: Tom Rini
Headers show

Commit Message

Thierry Reding Aug. 18, 2014, 8 a.m. UTC
From: Thierry Reding <treding@nvidia.com>

To work around potential issues with explicit cache maintenance of the
RX and TX descriptor rings, allocate them from a pool of uncached memory
if the architecture supports it.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/net/rtl8169.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

Comments

Stephen Warren Aug. 20, 2014, 7:33 p.m. UTC | #1
On 08/18/2014 02:00 AM, Thierry Reding wrote:
> From: Thierry Reding <treding@nvidia.com>
>
> To work around potential issues with explicit cache maintenance of the
> RX and TX descriptor rings, allocate them from a pool of uncached memory
> if the architecture supports it.

> diff --git a/drivers/net/rtl8169.c b/drivers/net/rtl8169.c

> +#ifndef CONFIG_SYS_NONCACHED_MEMORY
>   /* Define the TX Descriptor */
>   DEFINE_ALIGN_BUFFER(struct TxDesc, tx_ring, NUM_TX_DESC, RTL8169_ALIGN);
>
>   /* Define the RX Descriptor */
>   DEFINE_ALIGN_BUFFER(struct RxDesc, rx_ring, NUM_RX_DESC, RTL8169_ALIGN);
> +#endif

It feels slightly inconsistent to have one case allocate this memory in 
BSS at compile-time, and in the other to allocate it dynamically.

Would it be better to remove this global, and simply call different APIs 
(regular aligned malloc, v.s. non-cached allocate) during rtl_init()?
Thierry Reding Aug. 22, 2014, 9:29 a.m. UTC | #2
On Wed, Aug 20, 2014 at 01:33:06PM -0600, Stephen Warren wrote:
> On 08/18/2014 02:00 AM, Thierry Reding wrote:
> >From: Thierry Reding <treding@nvidia.com>
> >
> >To work around potential issues with explicit cache maintenance of the
> >RX and TX descriptor rings, allocate them from a pool of uncached memory
> >if the architecture supports it.
> 
> >diff --git a/drivers/net/rtl8169.c b/drivers/net/rtl8169.c
> 
> >+#ifndef CONFIG_SYS_NONCACHED_MEMORY
> >  /* Define the TX Descriptor */
> >  DEFINE_ALIGN_BUFFER(struct TxDesc, tx_ring, NUM_TX_DESC, RTL8169_ALIGN);
> >
> >  /* Define the RX Descriptor */
> >  DEFINE_ALIGN_BUFFER(struct RxDesc, rx_ring, NUM_RX_DESC, RTL8169_ALIGN);
> >+#endif
> 
> It feels slightly inconsistent to have one case allocate this memory in BSS
> at compile-time, and in the other to allocate it dynamically.
> 
> Would it be better to remove this global, and simply call different APIs
> (regular aligned malloc, v.s. non-cached allocate) during rtl_init()?

I've reworked this a bit to use memalign() when non-cached memory isn't
available and factored out the descriptor allocation. I liked the code
slightly better before, but I guess consistently using dynamic memory
allocations is worth it. One potential downside is that memalign()
allocates from the malloc() pool, therefore devices using this driver
will now use more memory. I suppose they could even run out of memory
depending on how large the number of receive buffers becomes. Although
it's only the descriptors that are being dynamically allocated and they
are 16 bytes each. And it's not a problem that couldn't easily be solved
by increasing the malloc() size.

Thierry
diff mbox

Patch

diff --git a/drivers/net/rtl8169.c b/drivers/net/rtl8169.c
index c53666134e06..1c04946d7691 100644
--- a/drivers/net/rtl8169.c
+++ b/drivers/net/rtl8169.c
@@ -283,11 +283,31 @@  struct RxDesc {
 #  define RTL8169_ALIGN 256
 #endif
 
+/*
+ * TX and RX descriptors are 16 bytes. This causes problems with the cache
+ * maintenance on CPUs where the cache-line size exceeds the size of these
+ * descriptors. What will happen is that when the driver receives a packet
+ * it will be immediately requeued for the hardware to reuse. The CPU will
+ * therefore need to flush the cache-line containing the descriptor, which
+ * will cause all other descriptors in the same cache-line to be flushed
+ * along with it. If one of those descriptors had been written to by the
+ * device those changes (and the associated packet) will be lost.
+ *
+ * To work around this, we make use of non-cached memory if available. If
+ * descriptors are mapped uncached there's no need to manually flush them
+ * or invalidate them.
+ *
+ * Note that this only applies to descriptors. The packet data buffers do
+ * not have the same constraints since they are 1536 bytes large, so they
+ * are unlikely to share cache-lines.
+ */
+#ifndef CONFIG_SYS_NONCACHED_MEMORY
 /* Define the TX Descriptor */
 DEFINE_ALIGN_BUFFER(struct TxDesc, tx_ring, NUM_TX_DESC, RTL8169_ALIGN);
 
 /* Define the RX Descriptor */
 DEFINE_ALIGN_BUFFER(struct RxDesc, rx_ring, NUM_RX_DESC, RTL8169_ALIGN);
+#endif
 
 /*
  * Create a static buffer of size RX_BUF_SZ for each TX Descriptor. All
@@ -412,28 +432,36 @@  match:
 
 static void rtl_inval_rx_desc(struct RxDesc *desc)
 {
+#ifndef CONFIG_SYS_NONCACHED_MEMORY
 	unsigned long start = (unsigned long)desc & ~(ARCH_DMA_MINALIGN - 1);
 	unsigned long end = ALIGN(start + sizeof(*desc), ARCH_DMA_MINALIGN);
 
 	invalidate_dcache_range(start, end);
+#endif
 }
 
 static void rtl_flush_rx_desc(struct RxDesc *desc)
 {
+#ifndef CONFIG_SYS_NONCACHED_MEMORY
 	flush_cache((unsigned long)desc, sizeof(*desc));
+#endif
 }
 
 static void rtl_inval_tx_desc(struct TxDesc *desc)
 {
+#ifndef CONFIG_SYS_NONCACHED_MEMORY
 	unsigned long start = (unsigned long)desc & ~(ARCH_DMA_MINALIGN - 1);
 	unsigned long end = ALIGN(start + sizeof(*desc), ARCH_DMA_MINALIGN);
 
 	invalidate_dcache_range(start, end);
+#endif
 }
 
 static void rtl_flush_tx_desc(struct TxDesc *desc)
 {
+#ifndef CONFIG_SYS_NONCACHED_MEMORY
 	flush_cache((unsigned long)desc, sizeof(*desc));
+#endif
 }
 
 static void rtl_inval_buffer(void *buf, size_t size)
@@ -769,6 +797,9 @@  INIT - Look for an adapter, this routine's visible to the outside
 static int rtl_init(struct eth_device *dev, bd_t *bis)
 {
 	static int board_idx = -1;
+#ifdef CONFIG_SYS_NONCACHED_MEMORY
+	size_t size;
+#endif
 	int i, rc;
 	int option = -1, Cap10_100 = 0, Cap1000 = 0;
 
@@ -899,6 +930,7 @@  static int rtl_init(struct eth_device *dev, bd_t *bis)
 #endif
 	}
 
+#ifndef CONFIG_SYS_NONCACHED_MEMORY
 	/*
 	 * Warn if the cache-line size is larger than the descriptor size. In
 	 * such cases the driver will likely fail because the CPU needs to
@@ -910,6 +942,17 @@  static int rtl_init(struct eth_device *dev, bd_t *bis)
 
 	tpc->TxDescArray = tx_ring;
 	tpc->RxDescArray = rx_ring;
+#else
+	/*
+	 * When non-cached memory is available, allocate the descriptors from
+	 * an uncached memory region to avoid any problems caused by caching.
+	 */
+	size = NUM_TX_DESC * sizeof(struct TxDesc);
+	tpc->TxDescArray = (struct TxDesc *)noncached_alloc(size, 256);
+
+	size = NUM_RX_DESC * sizeof(struct RxDesc);
+	tpc->RxDescArray = (struct RxDesc *)noncached_alloc(size, 256);
+#endif
 
 	return 1;
 }