Patchwork [U-Boot,V2] net: fec_mxc: allow use with cache enabled

login
register
mail settings
Submitter Eric Nelson
Date March 5, 2012, 6:36 p.m.
Message ID <1330972603-23782-2-git-send-email-eric.nelson@boundarydevices.com>
Download mbox | patch
Permalink /patch/144742/
State Superseded
Headers show

Comments

Eric Nelson - March 5, 2012, 6:36 p.m.
ensure that transmit and receive buffers are cache-line aligned
        invalidate cache after each packet received
        flush cache before transmitting

	Original patch by Marek:
		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html

Signed-off-by: Eric Nelson <eric.nelson@boundarydevices.com>
---
 drivers/net/fec_mxc.c |  243 +++++++++++++++++++++++++++++++++++++------------
 drivers/net/fec_mxc.h |   19 +----
 2 files changed, 184 insertions(+), 78 deletions(-)
Marek Vasut - March 5, 2012, 8:06 p.m.
Dear Eric Nelson,

> 	ensure that transmit and receive buffers are cache-line aligned
>         invalidate cache after each packet received
>         flush cache before transmitting
> 
> 	Original patch by Marek:
> 		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html

Would be cool to Cc me :-p

All right, let's review this!

> 
> Signed-off-by: Eric Nelson <eric.nelson@boundarydevices.com>
> ---
>  drivers/net/fec_mxc.c |  243
> +++++++++++++++++++++++++++++++++++++------------ drivers/net/fec_mxc.h | 
>  19 +----
>  2 files changed, 184 insertions(+), 78 deletions(-)
> 
> diff --git a/drivers/net/fec_mxc.c b/drivers/net/fec_mxc.c
> index 1fdd071..0e35377 100644
> --- a/drivers/net/fec_mxc.c
> +++ b/drivers/net/fec_mxc.c
> @@ -50,6 +50,33 @@ DECLARE_GLOBAL_DATA_PTR;
>  #define	CONFIG_FEC_MXC_SWAP_PACKET
>  #endif
> 
> +#ifndef	CONFIG_FEC_DESC_ALIGNMENT
> +#define	CONFIG_FEC_DESC_ALIGNMENT	ARCH_DMA_MINALIGN
> +#endif
> +
> +#ifndef	CONFIG_FEC_DATA_ALIGNMENT
> +#define	CONFIG_FEC_DATA_ALIGNMENT	ARCH_DMA_MINALIGN
> +#endif

The above shoul be the bigger of CONFIG_SYS_CACHELINE_SIZE and ARCH_DMA_ALIGN 
(use max() ).

Eventually, we should unify cache and DMA stuff and make only one macro (ALIGN) 
and be done with it.

btw. I can't seem to find a platform using different alignment for DATA and 
DESC, let's make it one ( ... #define CONFIG_FEC_ALIGN max(ARCH_DMA_MINALIGN, 
CONFIG_SYS_CACHELINE_SIZE), ok?

> +
> +/* Check various alignment issues at compile time */
> +#if ((CONFIG_FEC_DESC_ALIGNMENT < 16) || (CONFIG_FEC_DESC_ALIGNMENT % 16
> != 0)) +#error	"CONFIG_FEC_DESC_ALIGNMENT must be multiple of 16!"
> +#endif
> +
> +#if ((CONFIG_FEC_DATA_ALIGNMENT < 16) || (CONFIG_FEC_DATA_ALIGNMENT % 16
> != 0)) +#error	"CONFIG_FEC_DATA_ALIGNMENT must be multiple of 16!"
> +#endif
> +
> +#if ((PKTALIGN < CONFIG_FEC_DATA_ALIGNMENT) || \
> +	(PKTALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
> +#error	"PKTALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
> +#endif
> +
> +#if ((PKTSIZE_ALIGN < CONFIG_FEC_DATA_ALIGNMENT) || \
> +	(PKTSIZE_ALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
> +#error	"PKTSIZE_ALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
> +#endif
> +
>  #undef DEBUG
> 
>  struct nbuf {
> @@ -259,43 +286,52 @@ static int fec_tx_task_disable(struct fec_priv *fec)
>   * Initialize receive task's buffer descriptors
>   * @param[in] fec all we know about the device yet
>   * @param[in] count receive buffer count to be allocated
> - * @param[in] size size of each receive buffer
> + * @param[in] dsize desired size of each receive buffer
>   * @return 0 on success
>   *
>   * For this task we need additional memory for the data buffers. And each
>   * data buffer requires some alignment. Thy must be aligned to a specific
> - * boundary each (DB_DATA_ALIGNMENT).
> + * boundary each.
>   */
> -static int fec_rbd_init(struct fec_priv *fec, int count, int size)
> +static int fec_rbd_init(struct fec_priv *fec, int count, int dsize)
>  {
> -	int ix;
> -	uint32_t p = 0;
> -
> -	/* reserve data memory and consider alignment */
> -	if (fec->rdb_ptr == NULL)
> -		fec->rdb_ptr = malloc(size * count + DB_DATA_ALIGNMENT);
> -	p = (uint32_t)fec->rdb_ptr;
> -	if (!p) {
> -		puts("fec_mxc: not enough malloc memory\n");
> -		return -ENOMEM;
> -	}
> -	memset((void *)p, 0, size * count + DB_DATA_ALIGNMENT);
> -	p += DB_DATA_ALIGNMENT-1;
> -	p &= ~(DB_DATA_ALIGNMENT-1);
> -
> -	for (ix = 0; ix < count; ix++) {
> -		writel(p, &fec->rbd_base[ix].data_pointer);
> -		p += size;
> -		writew(FEC_RBD_EMPTY, &fec->rbd_base[ix].status);
> -		writew(0, &fec->rbd_base[ix].data_length);
> -	}
> +	uint32_t size;
> +	int i;
> +
>  	/*
> -	 * mark the last RBD to close the ring
> +	 * Allocate memory for the buffers. This allocation respects the
> +	 * alignment
>  	 */
> -	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY, &fec->rbd_base[ix - 1].status);
> +	size = roundup(dsize, CONFIG_FEC_DATA_ALIGNMENT);
> +	for (i = 0; i < count; i++) {
> +		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
> +		if (0 == data_ptr) {

We should fix this yoda condition while at it.

> +			uint8_t *data = memalign(CONFIG_FEC_DATA_ALIGNMENT,
> +						 size);

NOTE[1]: This is good, each RECEIVE DATA BUFFER is properly aligned at 
allocation time and therefore IS CACHE SAFE.

> +			if (!data) {
> +				printf("%s: error allocating rxbuf %d\n",
> +				       __func__, i);
> +				goto err;
> +			}
> +			writel((uint32_t)data, &fec->rbd_base[i].data_pointer);

I think this "writel()" call is bogus and should be removed in subsequent patch 
and replaced with simple assignment. It was here probably due to cache issues on 
PPC?

> +		} /* needs allocation */
> +		writew(FEC_RBD_EMPTY, &fec->rbd_base[i].status);
> +		writew(0, &fec->rbd_base[i].data_length);
> +	}
> +
> +	/* Mark the last RBD to close the ring. */
> +	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY, &fec->rbd_base[i - 1].status);
>  	fec->rbd_index = 0;
> 
>  	return 0;
> +
> +err:
> +	for (; i >= 0; i--) {
> +		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
> +		free((void *)data_ptr);
> +	}
> +
> +	return -ENOMEM;
>  }
> 
>  /**
> @@ -387,12 +423,25 @@ static int fec_open(struct eth_device *edev)
>  {
>  	struct fec_priv *fec = (struct fec_priv *)edev->priv;
>  	int speed;
> +	uint32_t addr, size;
> +	int i;
> 
>  	debug("fec_open: fec_open(dev)\n");
>  	/* full-duplex, heartbeat disabled */
>  	writel(1 << 2, &fec->eth->x_cntrl);
>  	fec->rbd_index = 0;
> 
> +	/* Invalidate all descriptors */
> +	for (i = 0; i < FEC_RBD_NUM - 1; i++)
> +		fec_rbd_clean(0, &fec->rbd_base[i]);
> +	fec_rbd_clean(1, &fec->rbd_base[i]);
> +
> +	/* Flush the descriptors into RAM */
> +	size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
> +			CONFIG_FEC_DATA_ALIGNMENT);
> +	addr = (uint32_t)fec->rbd_base;
> +	flush_dcache_range(addr, addr + size);
> +
>  #ifdef FEC_QUIRK_ENET_MAC
>  	/* Enable ENET HW endian SWAP */
>  	writel(readl(&fec->eth->ecntrl) | FEC_ECNTRL_DBSWAP,
> @@ -478,38 +527,44 @@ static int fec_open(struct eth_device *edev)
> 
>  static int fec_init(struct eth_device *dev, bd_t* bd)
>  {
> -	uint32_t base;
>  	struct fec_priv *fec = (struct fec_priv *)dev->priv;
>  	uint32_t mib_ptr = (uint32_t)&fec->eth->rmon_t_drop;
>  	uint32_t rcntrl;
> -	int i;
> +	uint32_t size;
> +	int i, ret;
> 
>  	/* Initialize MAC address */
>  	fec_set_hwaddr(dev);
> 
>  	/*
> -	 * reserve memory for both buffer descriptor chains at once
> -	 * Datasheet forces the startaddress of each chain is 16 byte
> -	 * aligned
> +	 * Allocate transmit descriptors, there are two in total. This
> +	 * allocation respects cache alignment.
>  	 */
> -	if (fec->base_ptr == NULL)
> -		fec->base_ptr = malloc((2 + FEC_RBD_NUM) *
> -				sizeof(struct fec_bd) + DB_ALIGNMENT);
> -	base = (uint32_t)fec->base_ptr;
> -	if (!base) {
> -		puts("fec_mxc: not enough malloc memory\n");
> -		return -ENOMEM;
> +	if (!fec->tbd_base) {
> +		size = roundup(2 * sizeof(struct fec_bd),
> +				CONFIG_FEC_DATA_ALIGNMENT);
> +		fec->tbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);

NOTE[2]: This allocates exactly 2 * 8 bytes == 2 * sizeof(struct fec_bd). This 
means on systems with cache size of 32 bytes, we need to be especially careful. 
This matter is handled by the up-alignment of this data, so this part IS CACHE 
SAFE.

> +		if (!fec->tbd_base) {
> +			ret = -ENOMEM;
> +			goto err1;
> +		}
> +		memset(fec->tbd_base, 0, size);

We might want to flush the descriptors to memory after they have been inited?

>  	}
> -	memset((void *)base, 0, (2 + FEC_RBD_NUM) *
> -			sizeof(struct fec_bd) + DB_ALIGNMENT);
> -	base += (DB_ALIGNMENT-1);
> -	base &= ~(DB_ALIGNMENT-1);
> -
> -	fec->rbd_base = (struct fec_bd *)base;
> 
> -	base += FEC_RBD_NUM * sizeof(struct fec_bd);
> -
> -	fec->tbd_base = (struct fec_bd *)base;
> +	/*
> +	 * Allocate receive descriptors. This allocation respects cache
> +	 * alignment.
> +	 */
> +	if (!fec->rbd_base) {
> +		size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
> +				CONFIG_FEC_DATA_ALIGNMENT);
> +		fec->rbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);

NOTE[3]: This allocates exactly 64 * 8 bytes == 64 * sizeof(struct fec_bd). This 
means on systems with cache size of 32 bytes, we need to be especially careful. 
This matter is handled by the up-alignment of this data, so this part IS CACHE 
SAFE.

> +		if (!fec->rbd_base) {
> +			ret = -ENOMEM;
> +			goto err2;
> +		}
> +		memset(fec->rbd_base, 0, size);
> +	}

We might want to flush the descriptors to memory after they have been inited?
> 
>  	/*
>  	 * Set interrupt mask register
> @@ -570,9 +625,8 @@ static int fec_init(struct eth_device *dev, bd_t* bd)
>  	 * Initialize RxBD/TxBD rings
>  	 */
>  	if (fec_rbd_init(fec, FEC_RBD_NUM, FEC_MAX_PKT_SIZE) < 0) {
> -		free(fec->base_ptr);
> -		fec->base_ptr = NULL;
> -		return -ENOMEM;
> +		ret = -ENOMEM;
> +		goto err3;
>  	}
>  	fec_tbd_init(fec);
> 
> @@ -583,6 +637,13 @@ static int fec_init(struct eth_device *dev, bd_t* bd)
>  #endif
>  	fec_open(dev);
>  	return 0;
> +
> +err3:
> +	free(fec->rbd_base);
> +err2:
> +	free(fec->tbd_base);
> +err1:
> +	return ret;
>  }
> 

#####
To sum it up, so far, we have allocated and properly aligned both the 
descriptors and the receive data buffers.

Important remark, there will be likely more descriptors in one cacheline!
#####

>  /**
> @@ -631,9 +692,11 @@ static void fec_halt(struct eth_device *dev)
>   * @param[in] length Data count in bytes
>   * @return 0 on success
>   */
> -static int fec_send(struct eth_device *dev, volatile void* packet, int
> length) +static int fec_send(struct eth_device *dev, void *packet, int
> length) {
>  	unsigned int status;
> +	uint32_t size;
> +	uint32_t addr;
> 
>  	/*
>  	 * This routine transmits one frame.  This routine only accepts
> @@ -650,15 +713,21 @@ static int fec_send(struct eth_device *dev, volatile
> void* packet, int length) }
> 
>  	/*
> -	 * Setup the transmit buffer
> -	 * Note: We are always using the first buffer for transmission,
> -	 * the second will be empty and only used to stop the DMA engine
> +	 * Setup the transmit buffer. We are always using the first buffer for
> +	 * transmission, the second will be empty and only used to stop the DMA
> +	 * engine. We also flush the packet to RAM here to avoid cache trouble.
>  	 */
>  #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
>  	swap_packet((uint32_t *)packet, length);
>  #endif
> +
> +	addr = (uint32_t)packet;
> +	size = roundup(length, CONFIG_FEC_DATA_ALIGNMENT);
> +	flush_dcache_range(addr, addr + size);
> +

ERROR[1]: The above seems wrong, it might flush more than necessary. Drop the 
roundup() call here so flush_dcache_range() can scream about cache issues and so 
we can find them.

>  	writew(length, &fec->tbd_base[fec->tbd_index].data_length);
> -	writel((uint32_t)packet, &fec->tbd_base[fec->tbd_index].data_pointer);
> +	writel(addr, &fec->tbd_base[fec->tbd_index].data_pointer);
> +
>  	/*
>  	 * update BD's status now
>  	 * This block:
> @@ -672,16 +741,30 @@ static int fec_send(struct eth_device *dev, volatile
> void* packet, int length) writew(status,
> &fec->tbd_base[fec->tbd_index].status);
> 
>  	/*
> +	 * Flush data cache. This code flushes both TX descriptors to RAM.
> +	 * After this code this code, the descritors will be safely in RAM

Please fix "this code this code".

> +	 * and we can start DMA.
> +	 */
> +	size = roundup(2 * sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
> +	addr = (uint32_t)fec->tbd_base;
> +	flush_dcache_range(addr, addr + size);

NOTE[4]: This is correct, we must flush both TX descriptors here.

> +
> +	/*
>  	 * Enable SmartDMA transmit task
>  	 */
>  	fec_tx_task_enable(fec);
> 
>  	/*
> -	 * wait until frame is sent .
> +	 * Wait until frame is sent. On each turn of the wait cycle, we must
> +	 * invalidate data cache to see what's really in RAM. Also, we need
> +	 * barrier here.
>  	 */
> +	invalidate_dcache_range(addr, addr + size);
>  	while (readw(&fec->tbd_base[fec->tbd_index].status) & FEC_TBD_READY) {
>  		udelay(1);
> +		invalidate_dcache_range(addr, addr + size);
>  	}

NOTE[5]: This is correct, we must invalidate cache every time we want to read 
fresh version of the descriptor.

> +
>  	debug("fec_send: status 0x%x index %d\n",
>  			readw(&fec->tbd_base[fec->tbd_index].status),
>  			fec->tbd_index);
> @@ -707,6 +790,8 @@ static int fec_recv(struct eth_device *dev)
>  	int frame_length, len = 0;
>  	struct nbuf *frame;
>  	uint16_t bd_status;
> +	uint32_t addr, size;
> +	int i;
>  	uchar buff[FEC_MAX_PKT_SIZE];
> 
>  	/*
> @@ -737,8 +822,23 @@ static int fec_recv(struct eth_device *dev)
>  	}
> 
>  	/*
> -	 * ensure reading the right buffer status
> +	 * Read the buffer status. Before the status can be read, the data cache
> +	 * must be invalidated, because the data in RAM might have been changed
> +	 * by DMA. The descriptors are properly aligned to cachelines so there's
> +	 * no need to worry they'd overlap.
> +	 *
> +	 * WARNING: By invalidating the descriptor here, we also invalidate
> +	 * the descriptors surrounding this one. Therefore we can NOT change the
> +	 * contents of this descriptor nor the surrounding ones. The problem is
> +	 * that in order to mark the descriptor as processed, we need to change
> +	 * the descriptor. The solution is to mark the whole cache line when all
> +	 * descriptors in the cache line are processed.
>  	 */
> +	addr = (uint32_t)rbd;
> +	addr &= ~(CONFIG_FEC_DATA_ALIGNMENT - 1);

Aka some kind of round_down()

> +	size = roundup(sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);

Aka roundup.

> +	invalidate_dcache_range(addr, addr + size);
> +

The idea here is the following (demo uses 32byte long cachelines):

[DESC1][DESC2][DESC3][DESC4][DESC5][DESC6]
`------- cacheline --------'

We want to start retrieving contents of DESC3, therefore "addr" points to DESC1, 
"size" is size of cacheline (I hope there's no hardware with 8 byte cachelines, 
but this should be ok here).

NOTE[5]: Here we can invalidate the whole cacheline, because the descriptors so 
far were modified only be hardware, not by us. We are not writing anything there 
so we won't loose any information.

NOTE[6]: This invalidation ensures that we always have a fresh copy of the 
cacheline containing all the descriptors, therefore we always have a fresh 
status of the descriptors we are about to pick. Since this is a sequential 
execution, the cache eviction should not kick in here (or might it?).


>  	bd_status = readw(&rbd->status);
>  	debug("fec_recv: status 0x%x\n", bd_status);
> 
> @@ -751,6 +851,13 @@ static int fec_recv(struct eth_device *dev)
>  			frame = (struct nbuf *)readl(&rbd->data_pointer);
>  			frame_length = readw(&rbd->data_length) - 4;

NOTE[7]: We received a frame, we know how long it is. The frame is loaded into 
the rbd->data_pointer, which IS CACHE ALIGNED.

I detect a problem here that frame_length might overflow, but that's not related 
to caches and might be just false accusation.

>  			/*
> +			 * Invalidate data cache over the buffer
> +			 */
> +			addr = (uint32_t)frame;
> +			size = roundup(frame_length, CONFIG_FEC_DATA_ALIGNMENT);
> +			invalidate_dcache_range(addr, addr + size);

NOTE[8]: The roundup here is a valid operation, we can flush up to the size of 
rbd->data_pointer, which is cache aligned and considering frame_length is less 
or equal to the memory allocated for rbd->data_pointer, the invalidation of 
cache here IS SAFE.

> +
> +			/*
>  			 *  Fill the buffer and pass it to upper layers
>  			 */
>  #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
> @@ -765,11 +872,27 @@ static int fec_recv(struct eth_device *dev)
>  						(ulong)rbd->data_pointer,
>  						bd_status);
>  		}
> +
>  		/*
> -		 * free the current buffer, restart the engine
> -		 * and move forward to the next buffer
> +		 * Free the current buffer, restart the engine and move forward
> +		 * to the next buffer. Here we check if the whole cacheline of
> +		 * descriptors was already processed and if so, we mark it free
> +		 * as whole.
>  		 */

NOTE[9]: This is the most problematic part, handling the marking of RX 
descriptors with a flag that they are ready again. Obviously, directly writing 
to the desciptor won't work, because:

1) There are multiple descriptors on a cacheline, therefore by writing one, we'd 
need to flush the whole cacheline back into DRAM immediatelly so the hardware 
knows about it. But that'd mean we can loose some information from the hardware, 
refer to demo before NOTE[5]:

We modify DESC2 and hardware is concurently changing DESC3. We flush DESC2 and 
the whole cacheline, we loose part of DESC3.

2) Cache eviction algorithm might flush data for us, therefore causing loss of 
information as well.

The solution is the following:

1) Compute how many descriptors are per-cache line
2) Make sure FEC_RBD_NUM * sizeof(struct fec_bd) is at least 2 * 
CONFIG_FEC_DATA_ALIGNMENT in size, see NOTE[11].
3) Once the last RX buffer in the cacheline is processed, mark them all clean 
and flush them all, see NOTE[10].

NOTE[10]: This is legal, because the hardware won't use RX descriptors that it 
marked dirty (which means not picked up by software yet). We clean the 
desciptors in an order the hardware would pick them up again so there's no 
problem with race condition either. The only possible issue here is if there was 
hardware with cacheline size smaller than descriptor size (we should add a check 
for this at the begining of the file).

NOTE[11]: This is because we want the FEC to overwrite descriptors below the 
other cacheline while we're marking the one containing retrieved descriptors 
clean.

> -		fec_rbd_clean(fec->rbd_index == (FEC_RBD_NUM - 1) ? 1 : 0, rbd);
> +		size = (CONFIG_FEC_DATA_ALIGNMENT / sizeof(struct fec_bd)) - 1;
> +		if ((fec->rbd_index & size) == size) {
> +			i = fec->rbd_index - size;
> +			addr = (uint32_t)&fec->rbd_base[i];
> +			for (; i <= fec->rbd_index + size; i++) {
> +				if (i == (FEC_RBD_NUM - 1))
> +					fec_rbd_clean(1, &fec->rbd_base[i]);
> +				else
> +					fec_rbd_clean(0, &fec->rbd_base[i]);
> +			}
> +			flush_dcache_range(addr,
> +				addr + CONFIG_FEC_DATA_ALIGNMENT);
> +		}
> +
>  		fec_rx_task_enable(fec);
>  		fec->rbd_index = (fec->rbd_index + 1) % FEC_RBD_NUM;
>  	}
> diff --git a/drivers/net/fec_mxc.h b/drivers/net/fec_mxc.h
> index 2eb7803..852b2e0 100644
> --- a/drivers/net/fec_mxc.h
> +++ b/drivers/net/fec_mxc.h
> @@ -234,22 +234,6 @@ struct ethernet_regs {
>  #endif
> 
>  /**
> - * @brief Descriptor buffer alignment
> - *
> - * i.MX27 requires a 16 byte alignment (but for the first element only)
> - */
> -#define DB_ALIGNMENT		16
> -
> -/**
> - * @brief Data buffer alignment
> - *
> - * i.MX27 requires a four byte alignment for transmit and 16 bits
> - * alignment for receive so take 16
> - * Note: Valid for member data_pointer in struct buffer_descriptor
> - */
> -#define DB_DATA_ALIGNMENT	16
> -
> -/**
>   * @brief Receive & Transmit Buffer Descriptor definitions
>   *
>   * Note: The first BD must be aligned (see DB_ALIGNMENT)
> @@ -282,8 +266,7 @@ struct fec_priv {
>  	struct fec_bd *tbd_base;	/* TBD ring */
>  	int tbd_index;			/* next transmit BD to write */
>  	bd_t *bd;
> -	void *rdb_ptr;
> -	void *base_ptr;
> +	uint8_t *tdb_ptr;
>  	int dev_id;
>  	int phy_id;
>  	struct mii_dev *bus;
Uh, this was tough.
Wolfgang Denk - March 5, 2012, 8:19 p.m.
Dear Marek Vasut,

In message <201203052106.42334.marex@denx.de> you wrote:
> Dear Eric Nelson,
> 
> > 	ensure that transmit and receive buffers are cache-line aligned
> >         invalidate cache after each packet received
> >         flush cache before transmitting
> > 
> > 	Original patch by Marek:
> > 		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html
> 
> Would be cool to Cc me :-p

Actually this should happen automatically, because your Signed-off-by:
line shouldbe there ??


Best regards,

Wolfgang Denk
Marek Vasut - March 5, 2012, 8:23 p.m.
Dear Wolfgang Denk,

> Dear Marek Vasut,
> 
> In message <201203052106.42334.marex@denx.de> you wrote:
> > Dear Eric Nelson,
> > 
> > > 	ensure that transmit and receive buffers are cache-line aligned
> > > 	
> > >         invalidate cache after each packet received
> > >         flush cache before transmitting
> > > 	
> > > 	Original patch by Marek:
> > > 		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html
> > 
> > Would be cool to Cc me :-p
> 
> Actually this should happen automatically, because your Signed-off-by:
> line shouldbe there ??
> 

You mean anywhere close to that piece of crap code I produced? I was ashamed to 
put it there ;-) But now that it stared looking humane, I'd maybe like to get a 
tiny bit of credit too ;-)

> 
> Best regards,
> 
> Wolfgang Denk

Best regards,
Marek Vasut
Eric Nelson - March 6, 2012, 5:06 p.m.
On 03/05/2012 01:06 PM, Marek Vasut wrote:
> Dear Eric Nelson,
>
>> 	ensure that transmit and receive buffers are cache-line aligned
>>          invalidate cache after each packet received
>>          flush cache before transmitting
>>
>> 	Original patch by Marek:
>> 		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html
>
> Would be cool to Cc me :-p
>

Sorry about that.

> All right, let's review this!
>
>>
>> Signed-off-by: Eric Nelson<eric.nelson@boundarydevices.com>
>> ---
>>   drivers/net/fec_mxc.c |  243
>> +++++++++++++++++++++++++++++++++++++------------ drivers/net/fec_mxc.h |
>>   19 +----
>>   2 files changed, 184 insertions(+), 78 deletions(-)
>>
>> diff --git a/drivers/net/fec_mxc.c b/drivers/net/fec_mxc.c
>> index 1fdd071..0e35377 100644
>> --- a/drivers/net/fec_mxc.c
>> +++ b/drivers/net/fec_mxc.c
>> @@ -50,6 +50,33 @@ DECLARE_GLOBAL_DATA_PTR;
>>   #define	CONFIG_FEC_MXC_SWAP_PACKET
>>   #endif
>>
>> +#ifndef	CONFIG_FEC_DESC_ALIGNMENT
>> +#define	CONFIG_FEC_DESC_ALIGNMENT	ARCH_DMA_MINALIGN
>> +#endif
>> +
>> +#ifndef	CONFIG_FEC_DATA_ALIGNMENT
>> +#define	CONFIG_FEC_DATA_ALIGNMENT	ARCH_DMA_MINALIGN
>> +#endif
>
> The above shoul be the bigger of CONFIG_SYS_CACHELINE_SIZE and ARCH_DMA_ALIGN
> (use max() ).
>
> Eventually, we should unify cache and DMA stuff and make only one macro (ALIGN)
> and be done with it.
>
> btw. I can't seem to find a platform using different alignment for DATA and
> DESC, let's make it one ( ... #define CONFIG_FEC_ALIGN max(ARCH_DMA_MINALIGN,
> CONFIG_SYS_CACHELINE_SIZE), ok?
>

I was thinking the same thing but concerned that I missed something.

>> +
>> +/* Check various alignment issues at compile time */
>> +#if ((CONFIG_FEC_DESC_ALIGNMENT<  16) || (CONFIG_FEC_DESC_ALIGNMENT % 16
>> != 0)) +#error	"CONFIG_FEC_DESC_ALIGNMENT must be multiple of 16!"
>> +#endif
>> +
>> +#if ((CONFIG_FEC_DATA_ALIGNMENT<  16) || (CONFIG_FEC_DATA_ALIGNMENT % 16
>> != 0)) +#error	"CONFIG_FEC_DATA_ALIGNMENT must be multiple of 16!"
>> +#endif
>> +
>> +#if ((PKTALIGN<  CONFIG_FEC_DATA_ALIGNMENT) || \
>> +	(PKTALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
>> +#error	"PKTALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
>> +#endif
>> +
>> +#if ((PKTSIZE_ALIGN<  CONFIG_FEC_DATA_ALIGNMENT) || \
>> +	(PKTSIZE_ALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
>> +#error	"PKTSIZE_ALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
>> +#endif
>> +
>>   #undef DEBUG
>>
>>   struct nbuf {
>> @@ -259,43 +286,52 @@ static int fec_tx_task_disable(struct fec_priv *fec)
>>    * Initialize receive task's buffer descriptors
>>    * @param[in] fec all we know about the device yet
>>    * @param[in] count receive buffer count to be allocated
>> - * @param[in] size size of each receive buffer
>> + * @param[in] dsize desired size of each receive buffer
>>    * @return 0 on success
>>    *
>>    * For this task we need additional memory for the data buffers. And each
>>    * data buffer requires some alignment. Thy must be aligned to a specific
>> - * boundary each (DB_DATA_ALIGNMENT).
>> + * boundary each.
>>    */
>> -static int fec_rbd_init(struct fec_priv *fec, int count, int size)
>> +static int fec_rbd_init(struct fec_priv *fec, int count, int dsize)
>>   {
>> -	int ix;
>> -	uint32_t p = 0;
>> -
>> -	/* reserve data memory and consider alignment */
>> -	if (fec->rdb_ptr == NULL)
>> -		fec->rdb_ptr = malloc(size * count + DB_DATA_ALIGNMENT);
>> -	p = (uint32_t)fec->rdb_ptr;
>> -	if (!p) {
>> -		puts("fec_mxc: not enough malloc memory\n");
>> -		return -ENOMEM;
>> -	}
>> -	memset((void *)p, 0, size * count + DB_DATA_ALIGNMENT);
>> -	p += DB_DATA_ALIGNMENT-1;
>> -	p&= ~(DB_DATA_ALIGNMENT-1);
>> -
>> -	for (ix = 0; ix<  count; ix++) {
>> -		writel(p,&fec->rbd_base[ix].data_pointer);
>> -		p += size;
>> -		writew(FEC_RBD_EMPTY,&fec->rbd_base[ix].status);
>> -		writew(0,&fec->rbd_base[ix].data_length);
>> -	}
>> +	uint32_t size;
>> +	int i;
>> +
>>   	/*
>> -	 * mark the last RBD to close the ring
>> +	 * Allocate memory for the buffers. This allocation respects the
>> +	 * alignment
>>   	 */
>> -	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY,&fec->rbd_base[ix - 1].status);
>> +	size = roundup(dsize, CONFIG_FEC_DATA_ALIGNMENT);
>> +	for (i = 0; i<  count; i++) {
>> +		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
>> +		if (0 == data_ptr) {
>
> We should fix this yoda condition while at it.
>

Wise, Yoda was...

I've been in the habit of putting constants on the left ever since reading
"Code Complete" many years ago.

That said, I'll swap it in V3.

>> +			uint8_t *data = memalign(CONFIG_FEC_DATA_ALIGNMENT,
>> +						 size);
>
> NOTE[1]: This is good, each RECEIVE DATA BUFFER is properly aligned at
> allocation time and therefore IS CACHE SAFE.
>
>> +			if (!data) {
>> +				printf("%s: error allocating rxbuf %d\n",
>> +				       __func__, i);
>> +				goto err;
>> +			}
>> +			writel((uint32_t)data,&fec->rbd_base[i].data_pointer);
>
> I think this "writel()" call is bogus and should be removed in subsequent patch
> and replaced with simple assignment. It was here probably due to cache issues on
> PPC?
>

The RBD has me puzzled. Do we treat them like registers and use readx/writex
or like in-RAM data structures...

>> +		} /* needs allocation */
>> +		writew(FEC_RBD_EMPTY,&fec->rbd_base[i].status);
>> +		writew(0,&fec->rbd_base[i].data_length);
>> +	}
>> +
>> +	/* Mark the last RBD to close the ring. */
>> +	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY,&fec->rbd_base[i - 1].status);
>>   	fec->rbd_index = 0;
>>
>>   	return 0;
>> +
>> +err:
>> +	for (; i>= 0; i--) {
>> +		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
>> +		free((void *)data_ptr);
>> +	}
>> +
>> +	return -ENOMEM;
>>   }
>>
>>   /**
>> @@ -387,12 +423,25 @@ static int fec_open(struct eth_device *edev)
>>   {
>>   	struct fec_priv *fec = (struct fec_priv *)edev->priv;
>>   	int speed;
>> +	uint32_t addr, size;
>> +	int i;
>>
>>   	debug("fec_open: fec_open(dev)\n");
>>   	/* full-duplex, heartbeat disabled */
>>   	writel(1<<  2,&fec->eth->x_cntrl);
>>   	fec->rbd_index = 0;
>>
>> +	/* Invalidate all descriptors */
>> +	for (i = 0; i<  FEC_RBD_NUM - 1; i++)
>> +		fec_rbd_clean(0,&fec->rbd_base[i]);
>> +	fec_rbd_clean(1,&fec->rbd_base[i]);
>> +
>> +	/* Flush the descriptors into RAM */
>> +	size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
>> +			CONFIG_FEC_DATA_ALIGNMENT);
>> +	addr = (uint32_t)fec->rbd_base;
>> +	flush_dcache_range(addr, addr + size);
>> +
>>   #ifdef FEC_QUIRK_ENET_MAC
>>   	/* Enable ENET HW endian SWAP */
>>   	writel(readl(&fec->eth->ecntrl) | FEC_ECNTRL_DBSWAP,
>> @@ -478,38 +527,44 @@ static int fec_open(struct eth_device *edev)
>>
>>   static int fec_init(struct eth_device *dev, bd_t* bd)
>>   {
>> -	uint32_t base;
>>   	struct fec_priv *fec = (struct fec_priv *)dev->priv;
>>   	uint32_t mib_ptr = (uint32_t)&fec->eth->rmon_t_drop;
>>   	uint32_t rcntrl;
>> -	int i;
>> +	uint32_t size;
>> +	int i, ret;
>>
>>   	/* Initialize MAC address */
>>   	fec_set_hwaddr(dev);
>>
>>   	/*
>> -	 * reserve memory for both buffer descriptor chains at once
>> -	 * Datasheet forces the startaddress of each chain is 16 byte
>> -	 * aligned
>> +	 * Allocate transmit descriptors, there are two in total. This
>> +	 * allocation respects cache alignment.
>>   	 */
>> -	if (fec->base_ptr == NULL)
>> -		fec->base_ptr = malloc((2 + FEC_RBD_NUM) *
>> -				sizeof(struct fec_bd) + DB_ALIGNMENT);
>> -	base = (uint32_t)fec->base_ptr;
>> -	if (!base) {
>> -		puts("fec_mxc: not enough malloc memory\n");
>> -		return -ENOMEM;
>> +	if (!fec->tbd_base) {
>> +		size = roundup(2 * sizeof(struct fec_bd),
>> +				CONFIG_FEC_DATA_ALIGNMENT);
>> +		fec->tbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);
>
> NOTE[2]: This allocates exactly 2 * 8 bytes == 2 * sizeof(struct fec_bd). This
> means on systems with cache size of 32 bytes, we need to be especially careful.
> This matter is handled by the up-alignment of this data, so this part IS CACHE
> SAFE.
>
>> +		if (!fec->tbd_base) {
>> +			ret = -ENOMEM;
>> +			goto err1;
>> +		}
>> +		memset(fec->tbd_base, 0, size);
>
> We might want to flush the descriptors to memory after they have been inited?
>

Good catch.

>>   	}
>> -	memset((void *)base, 0, (2 + FEC_RBD_NUM) *
>> -			sizeof(struct fec_bd) + DB_ALIGNMENT);
>> -	base += (DB_ALIGNMENT-1);
>> -	base&= ~(DB_ALIGNMENT-1);
>> -
>> -	fec->rbd_base = (struct fec_bd *)base;
>>
>> -	base += FEC_RBD_NUM * sizeof(struct fec_bd);
>> -
>> -	fec->tbd_base = (struct fec_bd *)base;
>> +	/*
>> +	 * Allocate receive descriptors. This allocation respects cache
>> +	 * alignment.
>> +	 */
>> +	if (!fec->rbd_base) {
>> +		size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
>> +				CONFIG_FEC_DATA_ALIGNMENT);
>> +		fec->rbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);
>
> NOTE[3]: This allocates exactly 64 * 8 bytes == 64 * sizeof(struct fec_bd). This
> means on systems with cache size of 32 bytes, we need to be especially careful.
> This matter is handled by the up-alignment of this data, so this part IS CACHE
> SAFE.
>
>> +		if (!fec->rbd_base) {
>> +			ret = -ENOMEM;
>> +			goto err2;
>> +		}
>> +		memset(fec->rbd_base, 0, size);
>> +	}
>
> We might want to flush the descriptors to memory after they have been inited?
 >

Again, good catch.

On this topic (initialization of RBD), I had a bit of a concern
regarding the initialization of things.

In fec_open, the receive buffer descriptors are initialized and the
last one set is to 'wrap'. If this code were to execute when the
controller is live, bad things would surely happen.

I traced through all of the paths I can see, and I believe that
we're safe. It appears that fec_halt() will be called prior to
any call to fec_init() and fec_open().

In fec_open() a number of calls to fec_rbd_clean() are made and
a single flush_dcache() is made afterwards.

While this works and causes less thrashing than per-RBD flushes,
I think the code would be simpler if fec_rbd_clean just did the
flush itself. This would also remove the need for a separate
flush in fec_recv.

>>
>>   	/*
>>   	 * Set interrupt mask register
>> @@ -570,9 +625,8 @@ static int fec_init(struct eth_device *dev, bd_t* bd)
>>   	 * Initialize RxBD/TxBD rings
>>   	 */
>>   	if (fec_rbd_init(fec, FEC_RBD_NUM, FEC_MAX_PKT_SIZE)<  0) {
>> -		free(fec->base_ptr);
>> -		fec->base_ptr = NULL;
>> -		return -ENOMEM;
>> +		ret = -ENOMEM;
>> +		goto err3;
>>   	}
>>   	fec_tbd_init(fec);
>>
>> @@ -583,6 +637,13 @@ static int fec_init(struct eth_device *dev, bd_t* bd)
>>   #endif
>>   	fec_open(dev);
>>   	return 0;
>> +
>> +err3:
>> +	free(fec->rbd_base);
>> +err2:
>> +	free(fec->tbd_base);
>> +err1:
>> +	return ret;
>>   }
>>
>
> #####
> To sum it up, so far, we have allocated and properly aligned both the
> descriptors and the receive data buffers.
>
> Important remark, there will be likely more descriptors in one cacheline!
> #####
>
>>   /**
>> @@ -631,9 +692,11 @@ static void fec_halt(struct eth_device *dev)
>>    * @param[in] length Data count in bytes
>>    * @return 0 on success
>>    */
>> -static int fec_send(struct eth_device *dev, volatile void* packet, int
>> length) +static int fec_send(struct eth_device *dev, void *packet, int
>> length) {
>>   	unsigned int status;
>> +	uint32_t size;
>> +	uint32_t addr;
>>
>>   	/*
>>   	 * This routine transmits one frame.  This routine only accepts
>> @@ -650,15 +713,21 @@ static int fec_send(struct eth_device *dev, volatile
>> void* packet, int length) }
>>
>>   	/*
>> -	 * Setup the transmit buffer
>> -	 * Note: We are always using the first buffer for transmission,
>> -	 * the second will be empty and only used to stop the DMA engine
>> +	 * Setup the transmit buffer. We are always using the first buffer for
>> +	 * transmission, the second will be empty and only used to stop the DMA
>> +	 * engine. We also flush the packet to RAM here to avoid cache trouble.
>>   	 */
>>   #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
>>   	swap_packet((uint32_t *)packet, length);
>>   #endif
>> +
>> +	addr = (uint32_t)packet;
>> +	size = roundup(length, CONFIG_FEC_DATA_ALIGNMENT);
>> +	flush_dcache_range(addr, addr + size);
>> +
>
> ERROR[1]: The above seems wrong, it might flush more than necessary. Drop the
> roundup() call here so flush_dcache_range() can scream about cache issues and so
> we can find them.
>

I did this and found that all of the places which transmit packets are
aligned to PKT_ALIGN (32 bytes).

>>   	writew(length,&fec->tbd_base[fec->tbd_index].data_length);
>> -	writel((uint32_t)packet,&fec->tbd_base[fec->tbd_index].data_pointer);
>> +	writel(addr,&fec->tbd_base[fec->tbd_index].data_pointer);
>> +
>>   	/*
>>   	 * update BD's status now
>>   	 * This block:
>> @@ -672,16 +741,30 @@ static int fec_send(struct eth_device *dev, volatile
>> void* packet, int length) writew(status,
>> &fec->tbd_base[fec->tbd_index].status);
>>
>>   	/*
>> +	 * Flush data cache. This code flushes both TX descriptors to RAM.
>> +	 * After this code this code, the descritors will be safely in RAM
>
> Please fix "this code this code".
>
Done. Also 'descritors'.

>> +	 * and we can start DMA.
>> +	 */
>> +	size = roundup(2 * sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
>> +	addr = (uint32_t)fec->tbd_base;
>> +	flush_dcache_range(addr, addr + size);
>
> NOTE[4]: This is correct, we must flush both TX descriptors here.
>
>> +
>> +	/*
>>   	 * Enable SmartDMA transmit task
>>   	 */
>>   	fec_tx_task_enable(fec);
>>
>>   	/*
>> -	 * wait until frame is sent .
>> +	 * Wait until frame is sent. On each turn of the wait cycle, we must
>> +	 * invalidate data cache to see what's really in RAM. Also, we need
>> +	 * barrier here.
>>   	 */
>> +	invalidate_dcache_range(addr, addr + size);
>>   	while (readw(&fec->tbd_base[fec->tbd_index].status)&  FEC_TBD_READY) {
>>   		udelay(1);
>> +		invalidate_dcache_range(addr, addr + size);
>>   	}
>
> NOTE[5]: This is correct, we must invalidate cache every time we want to read
> fresh version of the descriptor.
>
>> +
>>   	debug("fec_send: status 0x%x index %d\n",
>>   			readw(&fec->tbd_base[fec->tbd_index].status),
>>   			fec->tbd_index);
>> @@ -707,6 +790,8 @@ static int fec_recv(struct eth_device *dev)
>>   	int frame_length, len = 0;
>>   	struct nbuf *frame;
>>   	uint16_t bd_status;
>> +	uint32_t addr, size;
>> +	int i;
>>   	uchar buff[FEC_MAX_PKT_SIZE];
>>
>>   	/*
>> @@ -737,8 +822,23 @@ static int fec_recv(struct eth_device *dev)
>>   	}
>>
>>   	/*
>> -	 * ensure reading the right buffer status
>> +	 * Read the buffer status. Before the status can be read, the data cache
>> +	 * must be invalidated, because the data in RAM might have been changed
>> +	 * by DMA. The descriptors are properly aligned to cachelines so there's
>> +	 * no need to worry they'd overlap.
>> +	 *
>> +	 * WARNING: By invalidating the descriptor here, we also invalidate
>> +	 * the descriptors surrounding this one. Therefore we can NOT change the
>> +	 * contents of this descriptor nor the surrounding ones. The problem is
>> +	 * that in order to mark the descriptor as processed, we need to change
>> +	 * the descriptor. The solution is to mark the whole cache line when all
>> +	 * descriptors in the cache line are processed.
>>   	 */
>> +	addr = (uint32_t)rbd;
>> +	addr&= ~(CONFIG_FEC_DATA_ALIGNMENT - 1);
>
> Aka some kind of round_down()
>
>> +	size = roundup(sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
>
> Aka roundup.
>
>> +	invalidate_dcache_range(addr, addr + size);
>> +
>
> The idea here is the following (demo uses 32byte long cachelines):
>
> [DESC1][DESC2][DESC3][DESC4][DESC5][DESC6]
> `------- cacheline --------'
>
> We want to start retrieving contents of DESC3, therefore "addr" points to DESC1,
> "size" is size of cacheline (I hope there's no hardware with 8 byte cachelines,
> but this should be ok here).
>
> NOTE[5]: Here we can invalidate the whole cacheline, because the descriptors so
> far were modified only be hardware, not by us. We are not writing anything there
> so we won't loose any information.
>
> NOTE[6]: This invalidation ensures that we always have a fresh copy of the
> cacheline containing all the descriptors, therefore we always have a fresh
> status of the descriptors we are about to pick. Since this is a sequential
> execution, the cache eviction should not kick in here (or might it?).
>
>
Another way to look at this is this:

	After fec_open(), the hardware owns the rbd, and all we should do
	is read it. In order to make sure we don't have a stale copy, we
	need to call invalidate() before looking at the values.

Tracing the code to find out whether this is true, the only write I see
is within fec_recv() when the last descriptor is full, when the driver
takes ownership of **all** of the descriptors, calling fec_rbd_clean()
on each.

The only thing that looks funky is this:

		size = (CONFIG_FEC_ALIGN / sizeof(struct fec_bd)) - 1;
		if ((fec->rbd_index & size) == size) {

Wouldn't a test of rbd_index against FEC_RBD_NUM be more appropriate?
i.e.
		if (fec->rbd_index == FEC_RBD_NUM-1) {

>>   	bd_status = readw(&rbd->status);
>>   	debug("fec_recv: status 0x%x\n", bd_status);
>>
>> @@ -751,6 +851,13 @@ static int fec_recv(struct eth_device *dev)
>>   			frame = (struct nbuf *)readl(&rbd->data_pointer);
>>   			frame_length = readw(&rbd->data_length) - 4;
>
> NOTE[7]: We received a frame, we know how long it is. The frame is loaded into
> the rbd->data_pointer, which IS CACHE ALIGNED.
>
> I detect a problem here that frame_length might overflow, but that's not related
> to caches and might be just false accusation.
>
>>   			/*
>> +			 * Invalidate data cache over the buffer
>> +			 */
>> +			addr = (uint32_t)frame;
>> +			size = roundup(frame_length, CONFIG_FEC_DATA_ALIGNMENT);
>> +			invalidate_dcache_range(addr, addr + size);
>
> NOTE[8]: The roundup here is a valid operation, we can flush up to the size of
> rbd->data_pointer, which is cache aligned and considering frame_length is less
> or equal to the memory allocated for rbd->data_pointer, the invalidation of
> cache here IS SAFE.
>
>> +
>> +			/*
>>   			 *  Fill the buffer and pass it to upper layers
>>   			 */
>>   #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
>> @@ -765,11 +872,27 @@ static int fec_recv(struct eth_device *dev)
>>   						(ulong)rbd->data_pointer,
>>   						bd_status);
>>   		}
>> +
>>   		/*
>> -		 * free the current buffer, restart the engine
>> -		 * and move forward to the next buffer
>> +		 * Free the current buffer, restart the engine and move forward
>> +		 * to the next buffer. Here we check if the whole cacheline of
>> +		 * descriptors was already processed and if so, we mark it free
>> +		 * as whole.
>>   		 */
>
> NOTE[9]: This is the most problematic part, handling the marking of RX
> descriptors with a flag that they are ready again. Obviously, directly writing
> to the desciptor won't work, because:
>
> 1) There are multiple descriptors on a cacheline, therefore by writing one, we'd
> need to flush the whole cacheline back into DRAM immediatelly so the hardware
> knows about it. But that'd mean we can loose some information from the hardware,
> refer to demo before NOTE[5]:
>
> We modify DESC2 and hardware is concurently changing DESC3. We flush DESC2 and
> the whole cacheline, we loose part of DESC3.
>
> 2) Cache eviction algorithm might flush data for us, therefore causing loss of
> information as well.
>
> The solution is the following:
>
> 1) Compute how many descriptors are per-cache line
> 2) Make sure FEC_RBD_NUM * sizeof(struct fec_bd) is at least 2 *
> CONFIG_FEC_DATA_ALIGNMENT in size, see NOTE[11].
> 3) Once the last RX buffer in the cacheline is processed, mark them all clean
> and flush them all, see NOTE[10].
>
> NOTE[10]: This is legal, because the hardware won't use RX descriptors that it
> marked dirty (which means not picked up by software yet). We clean the
> desciptors in an order the hardware would pick them up again so there's no
> problem with race condition either. The only possible issue here is if there was
> hardware with cacheline size smaller than descriptor size (we should add a check
> for this at the begining of the file).
>
> NOTE[11]: This is because we want the FEC to overwrite descriptors below the
> other cacheline while we're marking the one containing retrieved descriptors
> clean.
>

Ahah! Now I see what the size calculation is doing.

A well-named constant, maybe "RXDESC_PER_CACHELINE" would be useful here.

	#define RXDESC_PER_CACHELINE	(CONFIG_FEC_ALIGN/sizeof(struct fec_bd))

>> -		fec_rbd_clean(fec->rbd_index == (FEC_RBD_NUM - 1) ? 1 : 0, rbd);
>> +		size = (CONFIG_FEC_DATA_ALIGNMENT / sizeof(struct fec_bd)) - 1;
		size = RXDESC_PER_CACHELINE-1;
>> +		if ((fec->rbd_index&  size) == size) {

The line above only works if RXDESC_PER_CACHELINE is a multiple of 2, which
is likely to work because sizeof(struct fec_bd) == 8.

>> +			i = fec->rbd_index - size;
>> +			addr = (uint32_t)&fec->rbd_base[i];
>> +			for (; i<= fec->rbd_index + size; i++) {

This flushes too many descriptors! This should be:
			for (; i<= fec->rbd_index; i++) {

>> +				if (i == (FEC_RBD_NUM - 1))
>> +					fec_rbd_clean(1,&fec->rbd_base[i]);
>> +				else
>> +					fec_rbd_clean(0,&fec->rbd_base[i]);
>> +			}
>> +			flush_dcache_range(addr,
>> +				addr + CONFIG_FEC_DATA_ALIGNMENT);
>> +		}
>> +
>>   		fec_rx_task_enable(fec);
>>   		fec->rbd_index = (fec->rbd_index + 1) % FEC_RBD_NUM;
>>   	}
>> diff --git a/drivers/net/fec_mxc.h b/drivers/net/fec_mxc.h
>> index 2eb7803..852b2e0 100644
>> --- a/drivers/net/fec_mxc.h
>> +++ b/drivers/net/fec_mxc.h
>> @@ -234,22 +234,6 @@ struct ethernet_regs {
>>   #endif
>>
>>   /**
>> - * @brief Descriptor buffer alignment
>> - *
>> - * i.MX27 requires a 16 byte alignment (but for the first element only)
>> - */
>> -#define DB_ALIGNMENT		16
>> -
>> -/**
>> - * @brief Data buffer alignment
>> - *
>> - * i.MX27 requires a four byte alignment for transmit and 16 bits
>> - * alignment for receive so take 16
>> - * Note: Valid for member data_pointer in struct buffer_descriptor
>> - */
>> -#define DB_DATA_ALIGNMENT	16
>> -
>> -/**
>>    * @brief Receive&  Transmit Buffer Descriptor definitions
>>    *
>>    * Note: The first BD must be aligned (see DB_ALIGNMENT)
>> @@ -282,8 +266,7 @@ struct fec_priv {
>>   	struct fec_bd *tbd_base;	/* TBD ring */
>>   	int tbd_index;			/* next transmit BD to write */
>>   	bd_t *bd;
>> -	void *rdb_ptr;
>> -	void *base_ptr;
>> +	uint8_t *tdb_ptr;
>>   	int dev_id;
>>   	int phy_id;
>>   	struct mii_dev *bus;
> Uh, this was tough.

How bad do we want cache?
Eric Nelson - March 6, 2012, 6:08 p.m.
On 03/06/2012 10:06 AM, Eric Nelson wrote:
> On 03/05/2012 01:06 PM, Marek Vasut wrote:
>> Dear Eric Nelson,
>>
 >> <snip>
 >>
>>> + if (!fec->rbd_base) {
>>> + ret = -ENOMEM;
>>> + goto err2;
>>> + }
>>> + memset(fec->rbd_base, 0, size);
>>> + }
>>
>> We might want to flush the descriptors to memory after they have been inited?
>

We're also missing a flush after the call to fec_rbd_init().

I'm inclined to move that call to right after the memset and before
a newly-added flush and do the same with the call to tbd_init().
Marek Vasut - March 6, 2012, 7:45 p.m.
Dear Eric Nelson,

> On 03/05/2012 01:06 PM, Marek Vasut wrote:
> > Dear Eric Nelson,
> > 
> >> 	ensure that transmit and receive buffers are cache-line aligned
> >> 	
> >>          invalidate cache after each packet received
> >>          flush cache before transmitting
> >> 	
> >> 	Original patch by Marek:
> >> 		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html
> > 
> > Would be cool to Cc me :-p
> 
> Sorry about that.

It's ok, don't worry about it ;-)

[...]

> > 
> > I think this "writel()" call is bogus and should be removed in subsequent
> > patch and replaced with simple assignment. It was here probably due to
> > cache issues on PPC?
> 
> The RBD has me puzzled. Do we treat them like registers and use
> readx/writex or like in-RAM data structures...

I'd go for the in-RAM data structures, but such conversion should happen in a 
separate patch only AFTER the cache support is in.

[...]

> >> +		if (!fec->rbd_base) {
> >> +			ret = -ENOMEM;
> >> +			goto err2;
> >> +		}
> >> +		memset(fec->rbd_base, 0, size);
> >> +	}
> > 
> > We might want to flush the descriptors to memory after they have been
> > inited?
> 
> Again, good catch.
> 
> On this topic (initialization of RBD), I had a bit of a concern
> regarding the initialization of things.
> 
> In fec_open, the receive buffer descriptors are initialized and the
> last one set is to 'wrap'. If this code were to execute when the
> controller is live, bad things would surely happen.
> 
> I traced through all of the paths I can see, and I believe that
> we're safe. It appears that fec_halt() will be called prior to
> any call to fec_init() and fec_open().

Yes, this will only happen if something went wrong.

> 
> In fec_open() a number of calls to fec_rbd_clean() are made and
> a single flush_dcache() is made afterwards.
> 
> While this works and causes less thrashing than per-RBD flushes,
> I think the code would be simpler if fec_rbd_clean just did the
> flush itself. This would also remove the need for a separate
> flush in fec_recv.

Not really, rbd might be (and likely is) smaller than cache line, therefore you 
won't be able to flush single rbd, unless it's cacheline aligned, which wastes 
space.

[...]

> >> +	invalidate_dcache_range(addr, addr + size);
> >> +
> > 
> > The idea here is the following (demo uses 32byte long cachelines):
> > 
> > [DESC1][DESC2][DESC3][DESC4][DESC5][DESC6]
> > `------- cacheline --------'
> > 
> > We want to start retrieving contents of DESC3, therefore "addr" points to
> > DESC1, "size" is size of cacheline (I hope there's no hardware with 8
> > byte cachelines, but this should be ok here).
> > 
> > NOTE[5]: Here we can invalidate the whole cacheline, because the
> > descriptors so far were modified only be hardware, not by us. We are not
> > writing anything there so we won't loose any information.
> > 
> > NOTE[6]: This invalidation ensures that we always have a fresh copy of
> > the cacheline containing all the descriptors, therefore we always have a
> > fresh status of the descriptors we are about to pick. Since this is a
> > sequential execution, the cache eviction should not kick in here (or
> > might it?).
> 
> Another way to look at this is this:
> 
> 	After fec_open(), the hardware owns the rbd, and all we should do
> 	is read it. In order to make sure we don't have a stale copy, we
> 	need to call invalidate() before looking at the values.
> 
> Tracing the code to find out whether this is true, the only write I see
> is within fec_recv() when the last descriptor is full, when the driver
> takes ownership of **all** of the descriptors, calling fec_rbd_clean()
> on each.
> 
> The only thing that looks funky is this:
> 
> 		size = (CONFIG_FEC_ALIGN / sizeof(struct fec_bd)) - 1;
> 		if ((fec->rbd_index & size) == size) {
> 
> Wouldn't a test of rbd_index against FEC_RBD_NUM be more appropriate?
> i.e.
> 		if (fec->rbd_index == FEC_RBD_NUM-1) {

I believe the FEC doesn't always start from rbd_index == 0, and if you were to 
receive more than 64 rbds between open() and close(), this implementation works, 
your would fail.

> 
> >>   	bd_status = readw(&rbd->status);
> >>   	debug("fec_recv: status 0x%x\n", bd_status);
> >> 
> >> @@ -751,6 +851,13 @@ static int fec_recv(struct eth_device *dev)
> >> 
> >>   			frame = (struct nbuf *)readl(&rbd->data_pointer);
> >>   			frame_length = readw(&rbd->data_length) - 4;
> > 
> > NOTE[7]: We received a frame, we know how long it is. The frame is loaded
> > into the rbd->data_pointer, which IS CACHE ALIGNED.
> > 
> > I detect a problem here that frame_length might overflow, but that's not
> > related to caches and might be just false accusation.
> > 
> >>   			/*
> >> 
> >> +			 * Invalidate data cache over the buffer
> >> +			 */
> >> +			addr = (uint32_t)frame;
> >> +			size = roundup(frame_length, CONFIG_FEC_DATA_ALIGNMENT);
> >> +			invalidate_dcache_range(addr, addr + size);
> > 
> > NOTE[8]: The roundup here is a valid operation, we can flush up to the
> > size of rbd->data_pointer, which is cache aligned and considering
> > frame_length is less or equal to the memory allocated for
> > rbd->data_pointer, the invalidation of cache here IS SAFE.
> > 
> >> +
> >> +			/*
> >> 
> >>   			 *  Fill the buffer and pass it to upper layers
> >>   			 */
> >>   
> >>   #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
> >> 
> >> @@ -765,11 +872,27 @@ static int fec_recv(struct eth_device *dev)
> >> 
> >>   						(ulong)rbd->data_pointer,
> >>   						bd_status);
> >>   		
> >>   		}
> >> 
> >> +
> >> 
> >>   		/*
> >> 
> >> -		 * free the current buffer, restart the engine
> >> -		 * and move forward to the next buffer
> >> +		 * Free the current buffer, restart the engine and move forward
> >> +		 * to the next buffer. Here we check if the whole cacheline of
> >> +		 * descriptors was already processed and if so, we mark it free
> >> +		 * as whole.
> >> 
> >>   		 */
> > 
> > NOTE[9]: This is the most problematic part, handling the marking of RX
> > descriptors with a flag that they are ready again. Obviously, directly
> > writing to the desciptor won't work, because:
> > 
> > 1) There are multiple descriptors on a cacheline, therefore by writing
> > one, we'd need to flush the whole cacheline back into DRAM immediatelly
> > so the hardware knows about it. But that'd mean we can loose some
> > information from the hardware, refer to demo before NOTE[5]:
> > 
> > We modify DESC2 and hardware is concurently changing DESC3. We flush
> > DESC2 and the whole cacheline, we loose part of DESC3.
> > 
> > 2) Cache eviction algorithm might flush data for us, therefore causing
> > loss of information as well.
> > 
> > The solution is the following:
> > 
> > 1) Compute how many descriptors are per-cache line
> > 2) Make sure FEC_RBD_NUM * sizeof(struct fec_bd) is at least 2 *
> > CONFIG_FEC_DATA_ALIGNMENT in size, see NOTE[11].
> > 3) Once the last RX buffer in the cacheline is processed, mark them all
> > clean and flush them all, see NOTE[10].
> > 
> > NOTE[10]: This is legal, because the hardware won't use RX descriptors
> > that it marked dirty (which means not picked up by software yet). We
> > clean the desciptors in an order the hardware would pick them up again
> > so there's no problem with race condition either. The only possible
> > issue here is if there was hardware with cacheline size smaller than
> > descriptor size (we should add a check for this at the begining of the
> > file).
> > 
> > NOTE[11]: This is because we want the FEC to overwrite descriptors below
> > the other cacheline while we're marking the one containing retrieved
> > descriptors clean.
> 
> Ahah! Now I see what the size calculation is doing.
> 
> A well-named constant, maybe "RXDESC_PER_CACHELINE" would be useful here.

Yes

> 
> 	#define RXDESC_PER_CACHELINE	(CONFIG_FEC_ALIGN/sizeof(struct fec_bd))
> 
> >> -		fec_rbd_clean(fec->rbd_index == (FEC_RBD_NUM - 1) ? 1 : 0, rbd);
> >> +		size = (CONFIG_FEC_DATA_ALIGNMENT / sizeof(struct fec_bd)) - 1;
> 
> 		size = RXDESC_PER_CACHELINE-1;
> 
> >> +		if ((fec->rbd_index&  size) == size) {
> 
> The line above only works if RXDESC_PER_CACHELINE is a multiple of 2, which
> is likely to work because sizeof(struct fec_bd) == 8.

Adding such a comment (and maybe CPP check) won't hurt.
> 
> >> +			i = fec->rbd_index - size;
> >> +			addr = (uint32_t)&fec->rbd_base[i];
> >> +			for (; i<= fec->rbd_index + size; i++) {
> 
> This flushes too many descriptors! This should be:
> 			for (; i<= fec->rbd_index; i++) {

Agreed

> 
> >> +				if (i == (FEC_RBD_NUM - 1))
> >> +					fec_rbd_clean(1,&fec->rbd_base[i]);
> >> +				else
> >> +					fec_rbd_clean(0,&fec->rbd_base[i]);
> >> +			}
> >> +			flush_dcache_range(addr,
> >> +				addr + CONFIG_FEC_DATA_ALIGNMENT);
> >> +		}
> >> +
> >> 
> >>   		fec_rx_task_enable(fec);
> >>   		fec->rbd_index = (fec->rbd_index + 1) % FEC_RBD_NUM;
> >>   	
> >>   	}
> >> 
> >> diff --git a/drivers/net/fec_mxc.h b/drivers/net/fec_mxc.h
> >> index 2eb7803..852b2e0 100644
> >> --- a/drivers/net/fec_mxc.h
> >> +++ b/drivers/net/fec_mxc.h
> >> @@ -234,22 +234,6 @@ struct ethernet_regs {
> >> 
> >>   #endif
> >>   
> >>   /**
> >> 
> >> - * @brief Descriptor buffer alignment
> >> - *
> >> - * i.MX27 requires a 16 byte alignment (but for the first element only)
> >> - */
> >> -#define DB_ALIGNMENT		16
> >> -
> >> -/**
> >> - * @brief Data buffer alignment
> >> - *
> >> - * i.MX27 requires a four byte alignment for transmit and 16 bits
> >> - * alignment for receive so take 16
> >> - * Note: Valid for member data_pointer in struct buffer_descriptor
> >> - */
> >> -#define DB_DATA_ALIGNMENT	16
> >> -
> >> -/**
> >> 
> >>    * @brief Receive&  Transmit Buffer Descriptor definitions
> >>    *
> >>    * Note: The first BD must be aligned (see DB_ALIGNMENT)
> >> 
> >> @@ -282,8 +266,7 @@ struct fec_priv {
> >> 
> >>   	struct fec_bd *tbd_base;	/* TBD ring */
> >>   	int tbd_index;			/* next transmit BD to write */
> >>   	bd_t *bd;
> >> 
> >> -	void *rdb_ptr;
> >> -	void *base_ptr;
> >> +	uint8_t *tdb_ptr;
> >> 
> >>   	int dev_id;
> >>   	int phy_id;
> >>   	struct mii_dev *bus;
> > 
> > Uh, this was tough.
> 
> How bad do we want cache?

We're almost there, why do you ask? :-)
Marek Vasut - March 6, 2012, 7:49 p.m.
Dear Eric Nelson,

> On 03/06/2012 10:06 AM, Eric Nelson wrote:
> > On 03/05/2012 01:06 PM, Marek Vasut wrote:
> >> Dear Eric Nelson,
> >> 
>  >> <snip>
>  >> 
> >>> + if (!fec->rbd_base) {
> >>> + ret = -ENOMEM;
> >>> + goto err2;
> >>> + }
> >>> + memset(fec->rbd_base, 0, size);
> >>> + }
> >> 
> >> We might want to flush the descriptors to memory after they have been
> >> inited?
> 
> We're also missing a flush after the call to fec_rbd_init().

I think we need only one flush in the whole allocation sequence. But you're 
probably right here.
> 
> I'm inclined to move that call to right after the memset and before
> a newly-added flush and do the same with the call to tbd_init().

You mean into fec_rbd_init() ?

Best regards,
Marek Vasut

Patch

diff --git a/drivers/net/fec_mxc.c b/drivers/net/fec_mxc.c
index 1fdd071..0e35377 100644
--- a/drivers/net/fec_mxc.c
+++ b/drivers/net/fec_mxc.c
@@ -50,6 +50,33 @@  DECLARE_GLOBAL_DATA_PTR;
 #define	CONFIG_FEC_MXC_SWAP_PACKET
 #endif
 
+#ifndef	CONFIG_FEC_DESC_ALIGNMENT
+#define	CONFIG_FEC_DESC_ALIGNMENT	ARCH_DMA_MINALIGN
+#endif
+
+#ifndef	CONFIG_FEC_DATA_ALIGNMENT
+#define	CONFIG_FEC_DATA_ALIGNMENT	ARCH_DMA_MINALIGN
+#endif
+
+/* Check various alignment issues at compile time */
+#if ((CONFIG_FEC_DESC_ALIGNMENT < 16) || (CONFIG_FEC_DESC_ALIGNMENT % 16 != 0))
+#error	"CONFIG_FEC_DESC_ALIGNMENT must be multiple of 16!"
+#endif
+
+#if ((CONFIG_FEC_DATA_ALIGNMENT < 16) || (CONFIG_FEC_DATA_ALIGNMENT % 16 != 0))
+#error	"CONFIG_FEC_DATA_ALIGNMENT must be multiple of 16!"
+#endif
+
+#if ((PKTALIGN < CONFIG_FEC_DATA_ALIGNMENT) || \
+	(PKTALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
+#error	"PKTALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
+#endif
+
+#if ((PKTSIZE_ALIGN < CONFIG_FEC_DATA_ALIGNMENT) || \
+	(PKTSIZE_ALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
+#error	"PKTSIZE_ALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
+#endif
+
 #undef DEBUG
 
 struct nbuf {
@@ -259,43 +286,52 @@  static int fec_tx_task_disable(struct fec_priv *fec)
  * Initialize receive task's buffer descriptors
  * @param[in] fec all we know about the device yet
  * @param[in] count receive buffer count to be allocated
- * @param[in] size size of each receive buffer
+ * @param[in] dsize desired size of each receive buffer
  * @return 0 on success
  *
  * For this task we need additional memory for the data buffers. And each
  * data buffer requires some alignment. Thy must be aligned to a specific
- * boundary each (DB_DATA_ALIGNMENT).
+ * boundary each.
  */
-static int fec_rbd_init(struct fec_priv *fec, int count, int size)
+static int fec_rbd_init(struct fec_priv *fec, int count, int dsize)
 {
-	int ix;
-	uint32_t p = 0;
-
-	/* reserve data memory and consider alignment */
-	if (fec->rdb_ptr == NULL)
-		fec->rdb_ptr = malloc(size * count + DB_DATA_ALIGNMENT);
-	p = (uint32_t)fec->rdb_ptr;
-	if (!p) {
-		puts("fec_mxc: not enough malloc memory\n");
-		return -ENOMEM;
-	}
-	memset((void *)p, 0, size * count + DB_DATA_ALIGNMENT);
-	p += DB_DATA_ALIGNMENT-1;
-	p &= ~(DB_DATA_ALIGNMENT-1);
-
-	for (ix = 0; ix < count; ix++) {
-		writel(p, &fec->rbd_base[ix].data_pointer);
-		p += size;
-		writew(FEC_RBD_EMPTY, &fec->rbd_base[ix].status);
-		writew(0, &fec->rbd_base[ix].data_length);
-	}
+	uint32_t size;
+	int i;
+
 	/*
-	 * mark the last RBD to close the ring
+	 * Allocate memory for the buffers. This allocation respects the
+	 * alignment
 	 */
-	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY, &fec->rbd_base[ix - 1].status);
+	size = roundup(dsize, CONFIG_FEC_DATA_ALIGNMENT);
+	for (i = 0; i < count; i++) {
+		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
+		if (0 == data_ptr) {
+			uint8_t *data = memalign(CONFIG_FEC_DATA_ALIGNMENT,
+						 size);
+			if (!data) {
+				printf("%s: error allocating rxbuf %d\n",
+				       __func__, i);
+				goto err;
+			}
+			writel((uint32_t)data, &fec->rbd_base[i].data_pointer);
+		} /* needs allocation */
+		writew(FEC_RBD_EMPTY, &fec->rbd_base[i].status);
+		writew(0, &fec->rbd_base[i].data_length);
+	}
+
+	/* Mark the last RBD to close the ring. */
+	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY, &fec->rbd_base[i - 1].status);
 	fec->rbd_index = 0;
 
 	return 0;
+
+err:
+	for (; i >= 0; i--) {
+		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
+		free((void *)data_ptr);
+	}
+
+	return -ENOMEM;
 }
 
 /**
@@ -387,12 +423,25 @@  static int fec_open(struct eth_device *edev)
 {
 	struct fec_priv *fec = (struct fec_priv *)edev->priv;
 	int speed;
+	uint32_t addr, size;
+	int i;
 
 	debug("fec_open: fec_open(dev)\n");
 	/* full-duplex, heartbeat disabled */
 	writel(1 << 2, &fec->eth->x_cntrl);
 	fec->rbd_index = 0;
 
+	/* Invalidate all descriptors */
+	for (i = 0; i < FEC_RBD_NUM - 1; i++)
+		fec_rbd_clean(0, &fec->rbd_base[i]);
+	fec_rbd_clean(1, &fec->rbd_base[i]);
+
+	/* Flush the descriptors into RAM */
+	size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
+			CONFIG_FEC_DATA_ALIGNMENT);
+	addr = (uint32_t)fec->rbd_base;
+	flush_dcache_range(addr, addr + size);
+
 #ifdef FEC_QUIRK_ENET_MAC
 	/* Enable ENET HW endian SWAP */
 	writel(readl(&fec->eth->ecntrl) | FEC_ECNTRL_DBSWAP,
@@ -478,38 +527,44 @@  static int fec_open(struct eth_device *edev)
 
 static int fec_init(struct eth_device *dev, bd_t* bd)
 {
-	uint32_t base;
 	struct fec_priv *fec = (struct fec_priv *)dev->priv;
 	uint32_t mib_ptr = (uint32_t)&fec->eth->rmon_t_drop;
 	uint32_t rcntrl;
-	int i;
+	uint32_t size;
+	int i, ret;
 
 	/* Initialize MAC address */
 	fec_set_hwaddr(dev);
 
 	/*
-	 * reserve memory for both buffer descriptor chains at once
-	 * Datasheet forces the startaddress of each chain is 16 byte
-	 * aligned
+	 * Allocate transmit descriptors, there are two in total. This
+	 * allocation respects cache alignment.
 	 */
-	if (fec->base_ptr == NULL)
-		fec->base_ptr = malloc((2 + FEC_RBD_NUM) *
-				sizeof(struct fec_bd) + DB_ALIGNMENT);
-	base = (uint32_t)fec->base_ptr;
-	if (!base) {
-		puts("fec_mxc: not enough malloc memory\n");
-		return -ENOMEM;
+	if (!fec->tbd_base) {
+		size = roundup(2 * sizeof(struct fec_bd),
+				CONFIG_FEC_DATA_ALIGNMENT);
+		fec->tbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);
+		if (!fec->tbd_base) {
+			ret = -ENOMEM;
+			goto err1;
+		}
+		memset(fec->tbd_base, 0, size);
 	}
-	memset((void *)base, 0, (2 + FEC_RBD_NUM) *
-			sizeof(struct fec_bd) + DB_ALIGNMENT);
-	base += (DB_ALIGNMENT-1);
-	base &= ~(DB_ALIGNMENT-1);
-
-	fec->rbd_base = (struct fec_bd *)base;
 
-	base += FEC_RBD_NUM * sizeof(struct fec_bd);
-
-	fec->tbd_base = (struct fec_bd *)base;
+	/*
+	 * Allocate receive descriptors. This allocation respects cache
+	 * alignment.
+	 */
+	if (!fec->rbd_base) {
+		size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
+				CONFIG_FEC_DATA_ALIGNMENT);
+		fec->rbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);
+		if (!fec->rbd_base) {
+			ret = -ENOMEM;
+			goto err2;
+		}
+		memset(fec->rbd_base, 0, size);
+	}
 
 	/*
 	 * Set interrupt mask register
@@ -570,9 +625,8 @@  static int fec_init(struct eth_device *dev, bd_t* bd)
 	 * Initialize RxBD/TxBD rings
 	 */
 	if (fec_rbd_init(fec, FEC_RBD_NUM, FEC_MAX_PKT_SIZE) < 0) {
-		free(fec->base_ptr);
-		fec->base_ptr = NULL;
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto err3;
 	}
 	fec_tbd_init(fec);
 
@@ -583,6 +637,13 @@  static int fec_init(struct eth_device *dev, bd_t* bd)
 #endif
 	fec_open(dev);
 	return 0;
+
+err3:
+	free(fec->rbd_base);
+err2:
+	free(fec->tbd_base);
+err1:
+	return ret;
 }
 
 /**
@@ -631,9 +692,11 @@  static void fec_halt(struct eth_device *dev)
  * @param[in] length Data count in bytes
  * @return 0 on success
  */
-static int fec_send(struct eth_device *dev, volatile void* packet, int length)
+static int fec_send(struct eth_device *dev, void *packet, int length)
 {
 	unsigned int status;
+	uint32_t size;
+	uint32_t addr;
 
 	/*
 	 * This routine transmits one frame.  This routine only accepts
@@ -650,15 +713,21 @@  static int fec_send(struct eth_device *dev, volatile void* packet, int length)
 	}
 
 	/*
-	 * Setup the transmit buffer
-	 * Note: We are always using the first buffer for transmission,
-	 * the second will be empty and only used to stop the DMA engine
+	 * Setup the transmit buffer. We are always using the first buffer for
+	 * transmission, the second will be empty and only used to stop the DMA
+	 * engine. We also flush the packet to RAM here to avoid cache trouble.
 	 */
 #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
 	swap_packet((uint32_t *)packet, length);
 #endif
+
+	addr = (uint32_t)packet;
+	size = roundup(length, CONFIG_FEC_DATA_ALIGNMENT);
+	flush_dcache_range(addr, addr + size);
+
 	writew(length, &fec->tbd_base[fec->tbd_index].data_length);
-	writel((uint32_t)packet, &fec->tbd_base[fec->tbd_index].data_pointer);
+	writel(addr, &fec->tbd_base[fec->tbd_index].data_pointer);
+
 	/*
 	 * update BD's status now
 	 * This block:
@@ -672,16 +741,30 @@  static int fec_send(struct eth_device *dev, volatile void* packet, int length)
 	writew(status, &fec->tbd_base[fec->tbd_index].status);
 
 	/*
+	 * Flush data cache. This code flushes both TX descriptors to RAM.
+	 * After this code this code, the descritors will be safely in RAM
+	 * and we can start DMA.
+	 */
+	size = roundup(2 * sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
+	addr = (uint32_t)fec->tbd_base;
+	flush_dcache_range(addr, addr + size);
+
+	/*
 	 * Enable SmartDMA transmit task
 	 */
 	fec_tx_task_enable(fec);
 
 	/*
-	 * wait until frame is sent .
+	 * Wait until frame is sent. On each turn of the wait cycle, we must
+	 * invalidate data cache to see what's really in RAM. Also, we need
+	 * barrier here.
 	 */
+	invalidate_dcache_range(addr, addr + size);
 	while (readw(&fec->tbd_base[fec->tbd_index].status) & FEC_TBD_READY) {
 		udelay(1);
+		invalidate_dcache_range(addr, addr + size);
 	}
+
 	debug("fec_send: status 0x%x index %d\n",
 			readw(&fec->tbd_base[fec->tbd_index].status),
 			fec->tbd_index);
@@ -707,6 +790,8 @@  static int fec_recv(struct eth_device *dev)
 	int frame_length, len = 0;
 	struct nbuf *frame;
 	uint16_t bd_status;
+	uint32_t addr, size;
+	int i;
 	uchar buff[FEC_MAX_PKT_SIZE];
 
 	/*
@@ -737,8 +822,23 @@  static int fec_recv(struct eth_device *dev)
 	}
 
 	/*
-	 * ensure reading the right buffer status
+	 * Read the buffer status. Before the status can be read, the data cache
+	 * must be invalidated, because the data in RAM might have been changed
+	 * by DMA. The descriptors are properly aligned to cachelines so there's
+	 * no need to worry they'd overlap.
+	 *
+	 * WARNING: By invalidating the descriptor here, we also invalidate
+	 * the descriptors surrounding this one. Therefore we can NOT change the
+	 * contents of this descriptor nor the surrounding ones. The problem is
+	 * that in order to mark the descriptor as processed, we need to change
+	 * the descriptor. The solution is to mark the whole cache line when all
+	 * descriptors in the cache line are processed.
 	 */
+	addr = (uint32_t)rbd;
+	addr &= ~(CONFIG_FEC_DATA_ALIGNMENT - 1);
+	size = roundup(sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
+	invalidate_dcache_range(addr, addr + size);
+
 	bd_status = readw(&rbd->status);
 	debug("fec_recv: status 0x%x\n", bd_status);
 
@@ -751,6 +851,13 @@  static int fec_recv(struct eth_device *dev)
 			frame = (struct nbuf *)readl(&rbd->data_pointer);
 			frame_length = readw(&rbd->data_length) - 4;
 			/*
+			 * Invalidate data cache over the buffer
+			 */
+			addr = (uint32_t)frame;
+			size = roundup(frame_length, CONFIG_FEC_DATA_ALIGNMENT);
+			invalidate_dcache_range(addr, addr + size);
+
+			/*
 			 *  Fill the buffer and pass it to upper layers
 			 */
 #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
@@ -765,11 +872,27 @@  static int fec_recv(struct eth_device *dev)
 						(ulong)rbd->data_pointer,
 						bd_status);
 		}
+
 		/*
-		 * free the current buffer, restart the engine
-		 * and move forward to the next buffer
+		 * Free the current buffer, restart the engine and move forward
+		 * to the next buffer. Here we check if the whole cacheline of
+		 * descriptors was already processed and if so, we mark it free
+		 * as whole.
 		 */
-		fec_rbd_clean(fec->rbd_index == (FEC_RBD_NUM - 1) ? 1 : 0, rbd);
+		size = (CONFIG_FEC_DATA_ALIGNMENT / sizeof(struct fec_bd)) - 1;
+		if ((fec->rbd_index & size) == size) {
+			i = fec->rbd_index - size;
+			addr = (uint32_t)&fec->rbd_base[i];
+			for (; i <= fec->rbd_index + size; i++) {
+				if (i == (FEC_RBD_NUM - 1))
+					fec_rbd_clean(1, &fec->rbd_base[i]);
+				else
+					fec_rbd_clean(0, &fec->rbd_base[i]);
+			}
+			flush_dcache_range(addr,
+				addr + CONFIG_FEC_DATA_ALIGNMENT);
+		}
+
 		fec_rx_task_enable(fec);
 		fec->rbd_index = (fec->rbd_index + 1) % FEC_RBD_NUM;
 	}
diff --git a/drivers/net/fec_mxc.h b/drivers/net/fec_mxc.h
index 2eb7803..852b2e0 100644
--- a/drivers/net/fec_mxc.h
+++ b/drivers/net/fec_mxc.h
@@ -234,22 +234,6 @@  struct ethernet_regs {
 #endif
 
 /**
- * @brief Descriptor buffer alignment
- *
- * i.MX27 requires a 16 byte alignment (but for the first element only)
- */
-#define DB_ALIGNMENT		16
-
-/**
- * @brief Data buffer alignment
- *
- * i.MX27 requires a four byte alignment for transmit and 16 bits
- * alignment for receive so take 16
- * Note: Valid for member data_pointer in struct buffer_descriptor
- */
-#define DB_DATA_ALIGNMENT	16
-
-/**
  * @brief Receive & Transmit Buffer Descriptor definitions
  *
  * Note: The first BD must be aligned (see DB_ALIGNMENT)
@@ -282,8 +266,7 @@  struct fec_priv {
 	struct fec_bd *tbd_base;	/* TBD ring */
 	int tbd_index;			/* next transmit BD to write */
 	bd_t *bd;
-	void *rdb_ptr;
-	void *base_ptr;
+	uint8_t *tdb_ptr;
 	int dev_id;
 	int phy_id;
 	struct mii_dev *bus;