diff mbox

[3/3] net: hisilicon: new hip04 ethernet driver

Message ID 1395132017-15928-4-git-send-email-zhangfei.gao@linaro.org
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Zhangfei Gao March 18, 2014, 8:40 a.m. UTC
Support Hisilicon hip04 ethernet driver, including 100M / 1000M controller

Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
---
 drivers/net/ethernet/hisilicon/Makefile    |    2 +-
 drivers/net/ethernet/hisilicon/hip04_eth.c |  717 ++++++++++++++++++++++++++++
 2 files changed, 718 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/hisilicon/hip04_eth.c

Comments

Russell King - ARM Linux March 18, 2014, 10:46 a.m. UTC | #1
I was just browsing this patch when I noticed some of these issues - I
haven't done a full review of this driver, I'm just commenting on the
things I've spotted.

On Tue, Mar 18, 2014 at 04:40:17PM +0800, Zhangfei Gao wrote:
> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
> +{
> +	struct hip04_priv *priv = container_of(napi,
> +			      struct hip04_priv, napi);
> +	struct net_device *ndev = priv->ndev;
> +	struct sk_buff *skb;
> +	struct rx_desc *desc;
> +	unsigned char *buf;
> +	int rx = 0;
> +	unsigned int cnt = hip04_recv_cnt(priv);
> +	unsigned int len, tmp[16];
> +
> +	while (cnt) {
> +		buf = priv->rx_buf[priv->rx_head];
> +		skb = build_skb(buf, priv->rx_buf_size);
> +		if (unlikely(!skb))
> +			net_dbg_ratelimited("build_skb failed\n");
> +		dma_map_single(&ndev->dev, skb->data,
> +			RX_BUF_SIZE, DMA_FROM_DEVICE);

This is incorrect.

buf = buffer alloc()
/* CPU owns buffer and can read/write it, device does not */
dev_addr = dma_map_single(dev, buf, ..., DMA_FROM_DEVICE);
/* Device owns buffer and can write it, CPU does not access it */
dma_unmap_single(dev, dev_addr, ..., DMA_FROM_DEVICE);
/* CPU owns buffer again and can read/write it, device does not */

Please turn on DMA API debugging in the kernel debug options and verify
whether your driver causes it to complain (it will.)

I think you want dma_unmap_single() here.

> +		memcpy(tmp, skb->data, 64);
> +		endian_change((void *)tmp, 64);
> +		desc = (struct rx_desc *)tmp;
> +		len = desc->pkt_len;

This is a rather expensive way to do this.  Presumably the descriptors
are always big endian?  If so, why not:

		desc = skb->data;
		len = be16_to_cpu(desc->pkt_len);

?  You may need to lay the struct out differently for this to work so
the offset which pkt_len accesses is correct.

Also... do you not have any flags which indicate whether the packet
received was in error?

> +
> +		if (len > RX_BUF_SIZE)
> +			len = RX_BUF_SIZE;
> +		if (0 == len)
> +			break;
> +
> +		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
> +		skb_put(skb, len);
> +		skb->protocol = eth_type_trans(skb, ndev);
> +		napi_gro_receive(&priv->napi, skb);
> +
> +		buf = netdev_alloc_frag(priv->rx_buf_size);
> +		if (!buf)
> +			return -ENOMEM;
> +		priv->rx_buf[priv->rx_head] = buf;
> +		dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);
> +		hip04_set_recv_desc(priv, virt_to_phys(buf));

No need for virt_to_phys() here - dma_map_single() returns the device
address.

> +
> +		priv->rx_head = RX_NEXT(priv->rx_head);
> +		if (rx++ >= budget)
> +			break;
> +
> +		if (--cnt == 0)
> +			cnt = hip04_recv_cnt(priv);
> +	}
> +
> +	if (rx < budget) {
> +		napi_gro_flush(napi, false);
> +		__napi_complete(napi);
> +	}
> +
> +	/* enable rx interrupt */
> +	priv->reg_inten |= RCV_INT | RCV_NOBUF;
> +	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);

This doesn't look right - you're supposed to re-enable receive interrupts
when you receive less than "budget" packets.

> +static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
> +{
> +	struct net_device *ndev = (struct net_device *) dev_id;
> +	struct hip04_priv *priv = netdev_priv(ndev);
> +	u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
> +	u32 val = DEF_INT_MASK;
> +
> +	writel_relaxed(val, priv->base + PPE_RINT);
> +
> +	if ((ists & RCV_INT) || (ists & RCV_NOBUF)) {

What you get with this is the compiler generating code to test RCV_INT,
and then if that's false, code to test RCV_NOBUF.  There's no possibility
for the compiler to optimise that because it's part of the language spec
that condition1 || condition2 will always have condition1 evaluated first,
and condition2 will only be evaluated if condition1 was false.

	if (ists & (RCV_INT | RCV_NOBUF)) {

would more than likely be more efficient here.

> +		if (napi_schedule_prep(&priv->napi)) {
> +			/* disable rx interrupt */
> +			priv->reg_inten &= ~(RCV_INT | RCV_NOBUF);
> +			writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
> +			__napi_schedule(&priv->napi);
> +		}
> +	}
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void hip04_tx_reclaim(struct net_device *ndev, bool force)
> +{
> +	struct hip04_priv *priv = netdev_priv(ndev);
> +	unsigned tx_head = priv->tx_head;
> +	unsigned tx_tail = priv->tx_tail;
> +	struct tx_desc *desc = priv->td_ring[priv->tx_tail];
> +
> +	spin_lock_irq(&priv->txlock);

Do you know for certain that interrupts were (and always will be) definitely
enabled prior to this point?  If not, you should use spin_lock_irqsave()..
spin_unlock_irqrestore().

> +	while (tx_tail != tx_head) {
> +		if (desc->send_addr != 0) {
> +			if (force)
> +				desc->send_addr = 0;
> +			else
> +				break;
> +		}

dma_unmap_single(&ndev->dev, dev_addr, skb->len, DMA_TO_DEVICE) ?

It looks like your device zeros the send address when it has finished
transmitting - if this is true, then you will need to store dev_addr
separately for each transmit packet.

> +		dev_kfree_skb_irq(priv->tx_skb[tx_tail]);
> +		priv->tx_skb[tx_tail] = NULL;
> +		tx_tail = TX_NEXT(tx_tail);
> +		priv->tx_count--;

No processing of transmit statistics?

> +	}
> +	priv->tx_tail = tx_tail;
> +	spin_unlock_irq(&priv->txlock);

If you have freed up any packets, then you should call netif_wake_queue().
Do you not get any interrupts when a packet is transmitted?

> +}
> +
> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> +{
> +	struct hip04_priv *priv = netdev_priv(ndev);
> +	struct tx_desc *desc = priv->td_ring[priv->tx_head];
> +	unsigned int tx_head = priv->tx_head;
> +	int ret;
> +
> +	hip04_tx_reclaim(ndev, false);
> +
> +	spin_lock_irq(&priv->txlock);

Same comment here...

> +	if (priv->tx_count++ >= TX_DESC_NUM) {
> +		net_dbg_ratelimited("no TX space for packet\n");
> +		netif_stop_queue(ndev);
> +		ret = NETDEV_TX_BUSY;
> +		goto out_unlock;
> +	}

You shouldn't rely on this - you should stop the queue when you put the
last packet to fill the ring before returning from this function.  When
you clean the ring in your hip04_tx_reclaim() function, to wake the
queue.

> +
> +	priv->tx_skb[tx_head] = skb;
> +	dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> +	memset((void *)desc, 0, sizeof(*desc));
> +	desc->send_addr = (unsigned int)virt_to_phys(skb->data);

Again, dma_map_single() gives you the device address, there's no need
to use virt_to_phys(), and there should be no need for a cast here
either.  Also consider cpu_to_be32() and similar for the other descriptor
writes.

> +	desc->send_size = skb->len;
> +	desc->cfg = DESC_DEF_CFG;
> +	desc->wb_addr = priv->td_phys[tx_head];
> +	endian_change(desc, 64);
> +	skb_tx_timestamp(skb);
> +	hip04_set_xmit_desc(priv, priv->td_phys[tx_head]);
> +
> +	priv->tx_head = TX_NEXT(tx_head);
> +	ret = NETDEV_TX_OK;

As mentioned above, if you have filled the ring, you need to also call
netif_stop_queue() here.

> +static int hip04_mac_probe(struct platform_device *pdev)
> +{
> +	struct device *d = &pdev->dev;
> +	struct device_node *node = d->of_node;
> +	struct net_device *ndev;
> +	struct hip04_priv *priv;
> +	struct resource *res;
> +	unsigned int irq, val;
> +	int ret;
> +
> +	ndev = alloc_etherdev(sizeof(struct hip04_priv));
> +	if (!ndev)
> +		return -ENOMEM;
> +
> +	priv = netdev_priv(ndev);
> +	priv->ndev = ndev;
> +	platform_set_drvdata(pdev, ndev);
> +	spin_lock_init(&priv->txlock);
> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	if (!res) {
> +		ret = -EINVAL;
> +		goto init_fail;
> +	}
> +	ndev->base_addr = res->start;
> +	priv->base = devm_ioremap_resource(d, res);
> +	ret = IS_ERR(priv->base);
> +	if (ret) {
> +		dev_err(d, "devm_ioremap_resource failed\n");
> +		goto init_fail;
> +	}

If you're using devm_ioremap_resource(), you don't need to check the
resource above.  In any case, returning the value from IS_ERR() from
this function is not correct.

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	priv->base = devm_ioremap_resource(d, res);
	if (IS_ERR(priv->base) {
		ret = PTR_ERR(priv->base);
		goto init_fail;
	}

You don't need to fill in ndev->base_addr (many drivers don't.)
Arnd Bergmann March 18, 2014, 11:25 a.m. UTC | #2
On Tuesday 18 March 2014 16:40:17 Zhangfei Gao wrote:

> +
> +static void __iomem *ppebase;

The global 'ppebase' seems hacky. Isn't that a SoC-specific register area, while
the rest of the driver is reusable across SoCs?

What does 'ppe' stand for?

What if there are multiple instances of this, which each have their own ppebase?

> +static void hip04_config_port(struct hip04_priv *priv, u32 speed, u32 duplex)
> +{
> +	u32 val;
> +
> +	priv->speed = speed;
> +	priv->duplex = duplex;
> +
> +	switch (speed) {
> +	case SPEED_1000:
> +		val = 8;
> +		break;
> +	case SPEED_100:
> +		if (priv->id)
> +			val = 7;
> +		else
> +			val = 1;
> +		break;
> +	default:
> +		val = 0;
> +		break;
> +	}
> +	writel_relaxed(val, priv->base + GE_PORT_MODE)

This also seems to encode knowledge about a particular implementation
into the driver. Maybe it's better to add a property for the port
mode?


> +static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
> +{
> +	writel_relaxed(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
> +}
> +
> +static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
> +{
> +	writel_relaxed(phys, ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
> +}
> +
> +static u32 hip04_recv_cnt(struct hip04_priv *priv)
> +{
> +	return readl_relaxed(priv->base + PPE_HIS_RX_PKT_CNT);
> +}

At the very least, the hip04_set_xmit_desc() function needs to use 'writel'
rather than 'writel_relaxed'. Otherwise data that is being sent out
can be stuck in the CPU's write buffers and you send stale data on the wire.

For the receive path, you may or may not need to use 'readl', depending
on how DMA is serialized by this device. If you have MSI interrupts, the
interrupt message should already do the serialization, but if you have
edge or level triggered interrupts, you normally need to have one readl()
from the device register between the IRQ and the data access.


> +static void endian_change(void *p, int size)
> +{
> +	unsigned int *to_cover = (unsigned int *)p;
> +	int i;
> +
> +	size = size >> 2;
> +	for (i = 0; i < size; i++)
> +		*(to_cover+i) = htonl(*(to_cover+i));
> +}
> +
> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
> +{
> +	struct hip04_priv *priv = container_of(napi,
> +			      struct hip04_priv, napi);
> +	struct net_device *ndev = priv->ndev;
> +	struct sk_buff *skb;
> +	struct rx_desc *desc;
> +	unsigned char *buf;
> +	int rx = 0;
> +	unsigned int cnt = hip04_recv_cnt(priv);
> +	unsigned int len, tmp[16];
> +
> +	while (cnt) {
> +		buf = priv->rx_buf[priv->rx_head];
> +		skb = build_skb(buf, priv->rx_buf_size);
> +		if (unlikely(!skb))
> +			net_dbg_ratelimited("build_skb failed\n");
> +		dma_map_single(&ndev->dev, skb->data,
> +			RX_BUF_SIZE, DMA_FROM_DEVICE);
> +		memcpy(tmp, skb->data, 64);
> +		endian_change((void *)tmp, 64);
> +		desc = (struct rx_desc *)tmp;
> +		len = desc->pkt_len;

The dma_map_single() seems misplaced here, for all I can tell, the
data has already been transferred. Maybe you mean dma_unmap_single?

I don't see why you copy 64 bytes out of the buffer using endian_change,
rather than just looking at the first word, which seems to have the
only value you are interested in.

> +		if (len > RX_BUF_SIZE)
> +			len = RX_BUF_SIZE;
> +		if (0 == len)
> +			break;
> +
> +		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
> +		skb_put(skb, len);
> +		skb->protocol = eth_type_trans(skb, ndev);
> +		napi_gro_receive(&priv->napi, skb);
> +
> +		buf = netdev_alloc_frag(priv->rx_buf_size);
> +		if (!buf)
> +			return -ENOMEM;
> +		priv->rx_buf[priv->rx_head] = buf;
> +		dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);

Maybe you mean DMA_FROM_DEVICE? The call here doesn't seem to make any
sense. You also need to use the return value of dma_map_single() every
time you call it.

> +		hip04_set_recv_desc(priv, virt_to_phys(buf));

and put it right here in the next line. virt_to_phys() is not the correct
function call, that is what dma_map_single() is meant for.

> +		priv->rx_head = RX_NEXT(priv->rx_head);
> +		if (rx++ >= budget)
> +			break;
> +
> +		if (--cnt == 0)
> +			cnt = hip04_recv_cnt(priv);
> +	}
> +
> +	if (rx < budget) {
> +		napi_gro_flush(napi, false);
> +		__napi_complete(napi);
> +	}
> +
> +	/* enable rx interrupt */
> +	priv->reg_inten |= RCV_INT | RCV_NOBUF;
> +	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);


Why do you unconditionally turn on interrupts here? Shouldn't you
only do that after calling napi_complete()?


> +
> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> +{
> +	struct hip04_priv *priv = netdev_priv(ndev);
> +	struct tx_desc *desc = priv->td_ring[priv->tx_head];
> +	unsigned int tx_head = priv->tx_head;
> +	int ret;
> +
> +	hip04_tx_reclaim(ndev, false);
> +
> +	spin_lock_irq(&priv->txlock);
> +	if (priv->tx_count++ >= TX_DESC_NUM) {
> +		net_dbg_ratelimited("no TX space for packet\n");
> +		netif_stop_queue(ndev);
> +		ret = NETDEV_TX_BUSY;
> +		goto out_unlock;
> +	}
> +
> +	priv->tx_skb[tx_head] = skb;
> +	dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> +	memset((void *)desc, 0, sizeof(*desc));
> +	desc->send_addr = (unsigned int)virt_to_phys(skb->data);

Just like above: you must not use virt_to_phys here, but rather use
the output of dma_map_single.

IIRC, you can't generally call dma_map_single() under a spinlock, so
better move that ahead. It may also be a slow operation.

> +
> +static int hip04_mac_open(struct net_device *ndev)
> +{
> +	struct hip04_priv *priv = netdev_priv(ndev);
> +	int i;
> +
> +	hip04_reset_ppe(priv);
> +	for (i = 0; i < RX_DESC_NUM; i++) {
> +		dma_map_single(&ndev->dev, priv->rx_buf[i],
> +				RX_BUF_SIZE, DMA_TO_DEVICE);
> +		hip04_set_recv_desc(priv, virt_to_phys(priv->rx_buf[i]));
> +	}

And one more. Also DMA_FROM_DEVICE.

> +static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
> +{
> +	struct hip04_priv *priv = netdev_priv(ndev);
> +	int i;
> +
> +	priv->rx_buf_size = RX_BUF_SIZE +
> +			    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +
> +	priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
> +				SKB_DATA_ALIGN(sizeof(struct tx_desc)),	0);
> +	if (!priv->desc_pool)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < TX_DESC_NUM; i++) {
> +		priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
> +					GFP_ATOMIC, &priv->td_phys[i]);
> +		if (!priv->td_ring[i])
> +			return -ENOMEM;
> +	}

Why do you create a dma pool here, when you do all the allocations upfront?

It looks to me like you could simply turn the td_ring array of pointers
to tx descriptors into a an array of tx descriptors (no pointers) and allocate
that one using dma_alloc_coherent.


> +	if (!ppebase) {
> +		struct device_node *n;
> +
> +		n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
> +		if (!n) {
> +			ret = -EINVAL;
> +			netdev_err(ndev, "not find hisilicon,ppebase\n");
> +			goto init_fail;
> +		}
> +		ppebase = of_iomap(n, 0);
> +	}

How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
a more generic abstraction of the ppe, and stick the port and id in there as
well, e.g.

	ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id 

> +	ret = of_property_read_u32(node, "speed", &val);
> +	if (ret) {
> +		dev_warn(d, "not find speed info\n");
> +		priv->speed = SPEED_1000;
> +	}
> +
> +	if (SPEED_100 == val)
> +		priv->speed = SPEED_100;
> +	else
> +		priv->speed = SPEED_1000;
> +	priv->duplex = DUPLEX_FULL;

Why do you even need the speed here, shouldn't you get that information
from the phy through hip04_adjust_link?

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 20, 2014, 9:51 a.m. UTC | #3
Dear Russell

Thanks for sparing time and giving so many perfect suggestion, really helpful.

On Tue, Mar 18, 2014 at 6:46 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> I was just browsing this patch when I noticed some of these issues - I
> haven't done a full review of this driver, I'm just commenting on the
> things I've spotted.
>
> On Tue, Mar 18, 2014 at 04:40:17PM +0800, Zhangfei Gao wrote:
>> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
>> +{
>> +     struct hip04_priv *priv = container_of(napi,
>> +                           struct hip04_priv, napi);
>> +     struct net_device *ndev = priv->ndev;
>> +     struct sk_buff *skb;
>> +     struct rx_desc *desc;
>> +     unsigned char *buf;
>> +     int rx = 0;
>> +     unsigned int cnt = hip04_recv_cnt(priv);
>> +     unsigned int len, tmp[16];
>> +
>> +     while (cnt) {
>> +             buf = priv->rx_buf[priv->rx_head];
>> +             skb = build_skb(buf, priv->rx_buf_size);
>> +             if (unlikely(!skb))
>> +                     net_dbg_ratelimited("build_skb failed\n");
>> +             dma_map_single(&ndev->dev, skb->data,
>> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
>
> This is incorrect.
>
> buf = buffer alloc()
> /* CPU owns buffer and can read/write it, device does not */
> dev_addr = dma_map_single(dev, buf, ..., DMA_FROM_DEVICE);
> /* Device owns buffer and can write it, CPU does not access it */
> dma_unmap_single(dev, dev_addr, ..., DMA_FROM_DEVICE);
> /* CPU owns buffer again and can read/write it, device does not */
>
> Please turn on DMA API debugging in the kernel debug options and verify
> whether your driver causes it to complain (it will.)

Yes, you are right.
After change to dma_map/unmap_single, however, still get warning like
"DMA-API: device driver failed to check map error", not sure whether
it can be ignored?

>
> I think you want dma_unmap_single() here.
>
>> +             memcpy(tmp, skb->data, 64);
>> +             endian_change((void *)tmp, 64);
>> +             desc = (struct rx_desc *)tmp;
>> +             len = desc->pkt_len;
>
> This is a rather expensive way to do this.  Presumably the descriptors
> are always big endian?  If so, why not:
>
>                 desc = skb->data;
>                 len = be16_to_cpu(desc->pkt_len);
>
> ?  You may need to lay the struct out differently for this to work so
> the offset which pkt_len accesses is correct.

Great, it is what I am looking for.
Not thought of changing layout of struct before.

>
> Also... do you not have any flags which indicate whether the packet
> received was in error?
>
>> +
>> +             if (len > RX_BUF_SIZE)
>> +                     len = RX_BUF_SIZE;
>> +             if (0 == len)
>> +                     break;
>> +
>> +             skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
>> +             skb_put(skb, len);
>> +             skb->protocol = eth_type_trans(skb, ndev);
>> +             napi_gro_receive(&priv->napi, skb);
>> +
>> +             buf = netdev_alloc_frag(priv->rx_buf_size);
>> +             if (!buf)
>> +                     return -ENOMEM;
>> +             priv->rx_buf[priv->rx_head] = buf;
>> +             dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);
>> +             hip04_set_recv_desc(priv, virt_to_phys(buf));
>
> No need for virt_to_phys() here - dma_map_single() returns the device
> address.
Got it.
Use virt_to_phys since find same result come out, it should be
different for iommu case.

In fact, the hardware can help to do the cache flushing, the function
still not be enabled now.
Then dma_map/unmap_single may be ignored.

>
>> +
>> +             priv->rx_head = RX_NEXT(priv->rx_head);
>> +             if (rx++ >= budget)
>> +                     break;
>> +
>> +             if (--cnt == 0)
>> +                     cnt = hip04_recv_cnt(priv);
>> +     }
>> +
>> +     if (rx < budget) {
>> +             napi_gro_flush(napi, false);
>> +             __napi_complete(napi);
>> +     }
>> +
>> +     /* enable rx interrupt */
>> +     priv->reg_inten |= RCV_INT | RCV_NOBUF;
>> +     writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
>
> This doesn't look right - you're supposed to re-enable receive interrupts
> when you receive less than "budget" packets.
Yes, got it.

>
>> +static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
>> +{
>> +     struct net_device *ndev = (struct net_device *) dev_id;
>> +     struct hip04_priv *priv = netdev_priv(ndev);
>> +     u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
>> +     u32 val = DEF_INT_MASK;
>> +
>> +     writel_relaxed(val, priv->base + PPE_RINT);
>> +
>> +     if ((ists & RCV_INT) || (ists & RCV_NOBUF)) {
>
> What you get with this is the compiler generating code to test RCV_INT,
> and then if that's false, code to test RCV_NOBUF.  There's no possibility
> for the compiler to optimise that because it's part of the language spec
> that condition1 || condition2 will always have condition1 evaluated first,
> and condition2 will only be evaluated if condition1 was false.
>
>         if (ists & (RCV_INT | RCV_NOBUF)) {
>
> would more than likely be more efficient here.

Cool, never think about this optimization.

>
>> +             if (napi_schedule_prep(&priv->napi)) {
>> +                     /* disable rx interrupt */
>> +                     priv->reg_inten &= ~(RCV_INT | RCV_NOBUF);
>> +                     writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
>> +                     __napi_schedule(&priv->napi);
>> +             }
>> +     }
>> +
>> +     return IRQ_HANDLED;
>> +}
>> +
>> +static void hip04_tx_reclaim(struct net_device *ndev, bool force)
>> +{
>> +     struct hip04_priv *priv = netdev_priv(ndev);
>> +     unsigned tx_head = priv->tx_head;
>> +     unsigned tx_tail = priv->tx_tail;
>> +     struct tx_desc *desc = priv->td_ring[priv->tx_tail];
>> +
>> +     spin_lock_irq(&priv->txlock);
>
> Do you know for certain that interrupts were (and always will be) definitely
> enabled prior to this point?  If not, you should use spin_lock_irqsave()..
> spin_unlock_irqrestore().
Yes,
After double check, I found spin_lock can be ignored at all here,
since xmit is protected by rcu.

>
>> +     while (tx_tail != tx_head) {
>> +             if (desc->send_addr != 0) {
>> +                     if (force)
>> +                             desc->send_addr = 0;
>> +                     else
>> +                             break;
>> +             }
>
> dma_unmap_single(&ndev->dev, dev_addr, skb->len, DMA_TO_DEVICE) ?
>
> It looks like your device zeros the send address when it has finished
> transmitting - if this is true, then you will need to store dev_addr
> separately for each transmit packet.

Yes, dev_addr is cleared after transmission over, and should be stored
for unmap.

>
>> +             dev_kfree_skb_irq(priv->tx_skb[tx_tail]);
>> +             priv->tx_skb[tx_tail] = NULL;
>> +             tx_tail = TX_NEXT(tx_tail);
>> +             priv->tx_count--;
>
> No processing of transmit statistics?
Will add it.

>
>> +     }
>> +     priv->tx_tail = tx_tail;
>> +     spin_unlock_irq(&priv->txlock);
>
> If you have freed up any packets, then you should call netif_wake_queue().
> Do you not get any interrupts when a packet is transmitted?

Unfortunately, there is no interrupt after packet is transmitted.
Reclaim only in xmit itself, so it may no need to
netif_wake_queue/netif_stop_queue.

>
>> +}
>> +
>> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>> +{
>> +     struct hip04_priv *priv = netdev_priv(ndev);
>> +     struct tx_desc *desc = priv->td_ring[priv->tx_head];
>> +     unsigned int tx_head = priv->tx_head;
>> +     int ret;
>> +
>> +     hip04_tx_reclaim(ndev, false);
>> +
>> +     spin_lock_irq(&priv->txlock);
>
> Same comment here...
>
>> +     if (priv->tx_count++ >= TX_DESC_NUM) {
>> +             net_dbg_ratelimited("no TX space for packet\n");
>> +             netif_stop_queue(ndev);
>> +             ret = NETDEV_TX_BUSY;
>> +             goto out_unlock;
>> +     }
>
> You shouldn't rely on this - you should stop the queue when you put the
> last packet to fill the ring before returning from this function.  When
> you clean the ring in your hip04_tx_reclaim() function, to wake the
> queue.
Since no finished interrupt to trigger again, and to wake the queue,
so only check in xmit.

>
>> +
>> +     priv->tx_skb[tx_head] = skb;
>> +     dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>> +     memset((void *)desc, 0, sizeof(*desc));
>> +     desc->send_addr = (unsigned int)virt_to_phys(skb->data);
>
> Again, dma_map_single() gives you the device address, there's no need
> to use virt_to_phys(), and there should be no need for a cast here
> either.  Also consider cpu_to_be32() and similar for the other descriptor
> writes.
Yes, cpu_to_be32 can do save the memcpy.

>
>> +     desc->send_size = skb->len;
>> +     desc->cfg = DESC_DEF_CFG;
>> +     desc->wb_addr = priv->td_phys[tx_head];
>> +     endian_change(desc, 64);
>> +     skb_tx_timestamp(skb);
>> +     hip04_set_xmit_desc(priv, priv->td_phys[tx_head]);
>> +
>> +     priv->tx_head = TX_NEXT(tx_head);
>> +     ret = NETDEV_TX_OK;
>
> As mentioned above, if you have filled the ring, you need to also call
> netif_stop_queue() here.
For rx, the basic idea is always have RX_DESC_NUM buffers in the pool.
When buffer is used, immediately alloc a new one and add to the end of pool.

>
>> +static int hip04_mac_probe(struct platform_device *pdev)
>> +{
>> +     struct device *d = &pdev->dev;
>> +     struct device_node *node = d->of_node;
>> +     struct net_device *ndev;
>> +     struct hip04_priv *priv;
>> +     struct resource *res;
>> +     unsigned int irq, val;
>> +     int ret;
>> +
>> +     ndev = alloc_etherdev(sizeof(struct hip04_priv));
>> +     if (!ndev)
>> +             return -ENOMEM;
>> +
>> +     priv = netdev_priv(ndev);
>> +     priv->ndev = ndev;
>> +     platform_set_drvdata(pdev, ndev);
>> +     spin_lock_init(&priv->txlock);
>> +     res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>> +     if (!res) {
>> +             ret = -EINVAL;
>> +             goto init_fail;
>> +     }
>> +     ndev->base_addr = res->start;
>> +     priv->base = devm_ioremap_resource(d, res);
>> +     ret = IS_ERR(priv->base);
>> +     if (ret) {
>> +             dev_err(d, "devm_ioremap_resource failed\n");
>> +             goto init_fail;
>> +     }
>
> If you're using devm_ioremap_resource(), you don't need to check the
> resource above.  In any case, returning the value from IS_ERR() from
> this function is not correct.

Got it, good idea.

>
>         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>         priv->base = devm_ioremap_resource(d, res);
>         if (IS_ERR(priv->base) {
>                 ret = PTR_ERR(priv->base);
>                 goto init_fail;
>         }
>
> You don't need to fill in ndev->base_addr (many drivers don't.)
OK, got it.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 20, 2014, 2 p.m. UTC | #4
Dear Arnd

On Tue, Mar 18, 2014 at 7:25 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Tuesday 18 March 2014 16:40:17 Zhangfei Gao wrote:
>
>> +
>> +static void __iomem *ppebase;
>
> The global 'ppebase' seems hacky. Isn't that a SoC-specific register area, while
> the rest of the driver is reusable across SoCs?
>
> What does 'ppe' stand for?
>
> What if there are multiple instances of this, which each have their own ppebase?

In this specific platform,
ppe is the only module controlling all the fifos for all the net controller.
And each controller connect to specific port.
ppe has 2048 channels, sharing by all the controller, only if not overlapped.

So the static ppebase is required, which I don't like too.
Two inputs required, one is port, which is connect to the controller.
The other is start channel, currently I use id, and start channel is
RX_DESC_NUM * priv->id;  /* start_addr */

>
>> +static void hip04_config_port(struct hip04_priv *priv, u32 speed, u32 duplex)
>> +{
>> +     u32 val;
>> +
>> +     priv->speed = speed;
>> +     priv->duplex = duplex;
>> +
>> +     switch (speed) {
>> +     case SPEED_1000:
>> +             val = 8;
>> +             break;
>> +     case SPEED_100:
>> +             if (priv->id)
>> +                     val = 7;
>> +             else
>> +                     val = 1;
>> +             break;
>> +     default:
>> +             val = 0;
>> +             break;
>> +     }
>> +     writel_relaxed(val, priv->base + GE_PORT_MODE)
>
> This also seems to encode knowledge about a particular implementation
> into the driver. Maybe it's better to add a property for the port
> mode?

After check Documentation/devicetree/bindings/net/ethernet.txt,
I think phy-mode is more suitable here.

        switch (priv->phy_mode) {
        case PHY_INTERFACE_MODE_SGMII:
                if (speed == SPEED_1000)
                        val = 8;
                else
                        val = 7;
                break;
        case PHY_INTERFACE_MODE_MII:
                val = 1;        /* SPEED_100 */
                break;
        default:
                val = 0;
                break;
        }
        writel_relaxed(val, priv->base + GE_PORT_MODE);

probe:
priv->phy_mode = of_get_phy_mode(node);

>
>
>> +static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
>> +{
>> +     writel_relaxed(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
>> +}
>> +
>> +static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
>> +{
>> +     writel_relaxed(phys, ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
>> +}
>> +
>> +static u32 hip04_recv_cnt(struct hip04_priv *priv)
>> +{
>> +     return readl_relaxed(priv->base + PPE_HIS_RX_PKT_CNT);
>> +}
>
> At the very least, the hip04_set_xmit_desc() function needs to use 'writel'
> rather than 'writel_relaxed'. Otherwise data that is being sent out
> can be stuck in the CPU's write buffers and you send stale data on the wire.
>
> For the receive path, you may or may not need to use 'readl', depending
> on how DMA is serialized by this device. If you have MSI interrupts, the
> interrupt message should already do the serialization, but if you have
> edge or level triggered interrupts, you normally need to have one readl()
> from the device register between the IRQ and the data access.

Really, will update to readl/wirtel in xmit and hip04_rx_poll.
I just got impression, use *_relaxed as much as possible for better performance.

>
>
>> +static void endian_change(void *p, int size)
>> +{
>> +     unsigned int *to_cover = (unsigned int *)p;
>> +     int i;
>> +
>> +     size = size >> 2;
>> +     for (i = 0; i < size; i++)
>> +             *(to_cover+i) = htonl(*(to_cover+i));
>> +}
>> +
>> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
>> +{
>> +     struct hip04_priv *priv = container_of(napi,
>> +                           struct hip04_priv, napi);
>> +     struct net_device *ndev = priv->ndev;
>> +     struct sk_buff *skb;
>> +     struct rx_desc *desc;
>> +     unsigned char *buf;
>> +     int rx = 0;
>> +     unsigned int cnt = hip04_recv_cnt(priv);
>> +     unsigned int len, tmp[16];
>> +
>> +     while (cnt) {
>> +             buf = priv->rx_buf[priv->rx_head];
>> +             skb = build_skb(buf, priv->rx_buf_size);
>> +             if (unlikely(!skb))
>> +                     net_dbg_ratelimited("build_skb failed\n");
>> +             dma_map_single(&ndev->dev, skb->data,
>> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
>> +             memcpy(tmp, skb->data, 64);
>> +             endian_change((void *)tmp, 64);
>> +             desc = (struct rx_desc *)tmp;
>> +             len = desc->pkt_len;
>
> The dma_map_single() seems misplaced here, for all I can tell, the
> data has already been transferred. Maybe you mean dma_unmap_single?
>
> I don't see why you copy 64 bytes out of the buffer using endian_change,
> rather than just looking at the first word, which seems to have the
> only value you are interested in.
Russell suggested using be16_to_cpu etc to replace memcpy.

>
>> +             if (len > RX_BUF_SIZE)
>> +                     len = RX_BUF_SIZE;
>> +             if (0 == len)
>> +                     break;
>> +
>> +             skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
>> +             skb_put(skb, len);
>> +             skb->protocol = eth_type_trans(skb, ndev);
>> +             napi_gro_receive(&priv->napi, skb);
>> +
>> +             buf = netdev_alloc_frag(priv->rx_buf_size);
>> +             if (!buf)
>> +                     return -ENOMEM;
>> +             priv->rx_buf[priv->rx_head] = buf;
>> +             dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);
>
> Maybe you mean DMA_FROM_DEVICE? The call here doesn't seem to make any
> sense. You also need to use the return value of dma_map_single() every
> time you call it.
>
>> +             hip04_set_recv_desc(priv, virt_to_phys(buf));
>
> and put it right here in the next line. virt_to_phys() is not the correct
> function call, that is what dma_map_single() is meant for.

OK.
>
>> +             priv->rx_head = RX_NEXT(priv->rx_head);
>> +             if (rx++ >= budget)
>> +                     break;
>> +
>> +             if (--cnt == 0)
>> +                     cnt = hip04_recv_cnt(priv);
>> +     }
>> +
>> +     if (rx < budget) {
>> +             napi_gro_flush(napi, false);
>> +             __napi_complete(napi);
>> +     }
>> +
>> +     /* enable rx interrupt */
>> +     priv->reg_inten |= RCV_INT | RCV_NOBUF;
>> +     writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
>
>
> Why do you unconditionally turn on interrupts here? Shouldn't you
> only do that after calling napi_complete()?
Yes, right.

>
>
>> +
>> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>> +{
>> +     struct hip04_priv *priv = netdev_priv(ndev);
>> +     struct tx_desc *desc = priv->td_ring[priv->tx_head];
>> +     unsigned int tx_head = priv->tx_head;
>> +     int ret;
>> +
>> +     hip04_tx_reclaim(ndev, false);
>> +
>> +     spin_lock_irq(&priv->txlock);
>> +     if (priv->tx_count++ >= TX_DESC_NUM) {
>> +             net_dbg_ratelimited("no TX space for packet\n");
>> +             netif_stop_queue(ndev);
>> +             ret = NETDEV_TX_BUSY;
>> +             goto out_unlock;
>> +     }
>> +
>> +     priv->tx_skb[tx_head] = skb;
>> +     dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>> +     memset((void *)desc, 0, sizeof(*desc));
>> +     desc->send_addr = (unsigned int)virt_to_phys(skb->data);
>
> Just like above: you must not use virt_to_phys here, but rather use
> the output of dma_map_single.
>
> IIRC, you can't generally call dma_map_single() under a spinlock, so
> better move that ahead. It may also be a slow operation.
The spinlock will be removed here, since it is protected by rcu outside.

>
>> +
>> +static int hip04_mac_open(struct net_device *ndev)
>> +{
>> +     struct hip04_priv *priv = netdev_priv(ndev);
>> +     int i;
>> +
>> +     hip04_reset_ppe(priv);
>> +     for (i = 0; i < RX_DESC_NUM; i++) {
>> +             dma_map_single(&ndev->dev, priv->rx_buf[i],
>> +                             RX_BUF_SIZE, DMA_TO_DEVICE);
>> +             hip04_set_recv_desc(priv, virt_to_phys(priv->rx_buf[i]));
>> +     }
>
> And one more. Also DMA_FROM_DEVICE.
>
>> +static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
>> +{
>> +     struct hip04_priv *priv = netdev_priv(ndev);
>> +     int i;
>> +
>> +     priv->rx_buf_size = RX_BUF_SIZE +
>> +                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> +
>> +     priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
>> +                             SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
>> +     if (!priv->desc_pool)
>> +             return -ENOMEM;
>> +
>> +     for (i = 0; i < TX_DESC_NUM; i++) {
>> +             priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
>> +                                     GFP_ATOMIC, &priv->td_phys[i]);
>> +             if (!priv->td_ring[i])
>> +                     return -ENOMEM;
>> +     }
>
> Why do you create a dma pool here, when you do all the allocations upfront?
>
> It looks to me like you could simply turn the td_ring array of pointers
> to tx descriptors into a an array of tx descriptors (no pointers) and allocate
> that one using dma_alloc_coherent.

dma pool used here mainly because of alignment,
the desc has requirement of SKB_DATA_ALIGN,
so use the simplest way
        priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
                                SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);

>
>
>> +     if (!ppebase) {
>> +             struct device_node *n;
>> +
>> +             n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
>> +             if (!n) {
>> +                     ret = -EINVAL;
>> +                     netdev_err(ndev, "not find hisilicon,ppebase\n");
>> +                     goto init_fail;
>> +             }
>> +             ppebase = of_iomap(n, 0);
>> +     }
>
> How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
> a more generic abstraction of the ppe, and stick the port and id in there as
> well, e.g.
>
>         ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id

To be honest, I am not familiar with syscon_regmap_lookup_by_phandle.

Florian has suggested
              ppe: ppe@28c0000 {
                        compatible = "hisilicon,hip04-ppe";
                        reg = <0x28c0000 0x10000>;
                        #address-cells = <1>;
                        #size-cells = <0>;

                        eth0_port: port@1f {
                                reg = <31>;
                        };

                        eth1_port: port@0 {
                                reg = <0>;
                        };

                        eth2_port: port@8 {
                                reg = <8>;
                        };
                };
                fe: ethernet@28b0000 {
                        compatible = "hisilicon,hip04-mac";
                        reg = <0x28b0000 0x10000>;
                        interrupts = <0 413 4>;
                        phy-mode = "mii";
                        port-handle = <&eth0_port>;
                };
And the port info can be found from port-handle
n = of_parse_phandle(node, "port-handle", 0);
ret = of_property_read_u32(n, "reg", &priv->port);

The id is the controller start channel in ppe, either use alias or
another property in the port.

>
>> +     ret = of_property_read_u32(node, "speed", &val);
>> +     if (ret) {
>> +             dev_warn(d, "not find speed info\n");
>> +             priv->speed = SPEED_1000;
>> +     }
>> +
>> +     if (SPEED_100 == val)
>> +             priv->speed = SPEED_100;
>> +     else
>> +             priv->speed = SPEED_1000;
>> +     priv->duplex = DUPLEX_FULL;
>
> Why do you even need the speed here, shouldn't you get that information
> from the phy through hip04_adjust_link?

There still 100M mac does not have phy, as well adjust_link.
Will use phy-mode instead of speed.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann March 20, 2014, 2:31 p.m. UTC | #5
On Thursday 20 March 2014, Zhangfei Gao wrote:
> On Tue, Mar 18, 2014 at 7:25 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> > On Tuesday 18 March 2014 16:40:17 Zhangfei Gao wrote:
> >
> >> +
> >> +static void __iomem *ppebase;
> >
> > The global 'ppebase' seems hacky. Isn't that a SoC-specific register area, while
> > the rest of the driver is reusable across SoCs?
> >
> > What does 'ppe' stand for?
> >
> > What if there are multiple instances of this, which each have their own ppebase?
> 
> In this specific platform,
> ppe is the only module controlling all the fifos for all the net controller.
> And each controller connect to specific port.
> ppe has 2048 channels, sharing by all the controller, only if not overlapped.
> 
> So the static ppebase is required, which I don't like too.
> Two inputs required, one is port, which is connect to the controller.
> The other is start channel, currently I use id, and start channel is
> RX_DESC_NUM * priv->id;  /* start_addr */

Ok, thanks for the explanation!

> > This also seems to encode knowledge about a particular implementation
> > into the driver. Maybe it's better to add a property for the port
> > mode?
> 
> After check Documentation/devicetree/bindings/net/ethernet.txt,
> I think phy-mode is more suitable here.
> 
>         switch (priv->phy_mode) {
>         case PHY_INTERFACE_MODE_SGMII:
>                 if (speed == SPEED_1000)
>                         val = 8;
>                 else
>                         val = 7;
>                 break;
>         case PHY_INTERFACE_MODE_MII:
>                 val = 1;        /* SPEED_100 */
>                 break;
>         default:
>                 val = 0;
>                 break;
>         }
>         writel_relaxed(val, priv->base + GE_PORT_MODE);
> 
> probe:
> priv->phy_mode = of_get_phy_mode(node);

Yes, this looks better, but where does 'speed' come from now? I assume
even in SGMII mode, you should allow autonegotiation and set this correctly
from the PHY code. Is that what you do here?

> >> +static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
> >> +{
> >> +     writel_relaxed(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
> >> +}
> >> +
> >> +static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
> >> +{
> >> +     writel_relaxed(phys, ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
> >> +}
> >> +
> >> +static u32 hip04_recv_cnt(struct hip04_priv *priv)
> >> +{
> >> +     return readl_relaxed(priv->base + PPE_HIS_RX_PKT_CNT);
> >> +}
> >
> > At the very least, the hip04_set_xmit_desc() function needs to use 'writel'
> > rather than 'writel_relaxed'. Otherwise data that is being sent out
> > can be stuck in the CPU's write buffers and you send stale data on the wire.
> >
> > For the receive path, you may or may not need to use 'readl', depending
> > on how DMA is serialized by this device. If you have MSI interrupts, the
> > interrupt message should already do the serialization, but if you have
> > edge or level triggered interrupts, you normally need to have one readl()
> > from the device register between the IRQ and the data access.
> 
> Really, will update to readl/wirtel in xmit and hip04_rx_poll.
> I just got impression, use *_relaxed as much as possible for better performance.

Ok. The _relaxed() versions are really meant for people that understand
the ordering requirements. The regular readl/writel accessors contain
extra barriers to make them equivalent to what x86 does.

> >> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
> >> +{
> >> +     struct hip04_priv *priv = container_of(napi,
> >> +                           struct hip04_priv, napi);
> >> +     struct net_device *ndev = priv->ndev;
> >> +     struct sk_buff *skb;
> >> +     struct rx_desc *desc;
> >> +     unsigned char *buf;
> >> +     int rx = 0;
> >> +     unsigned int cnt = hip04_recv_cnt(priv);
> >> +     unsigned int len, tmp[16];
> >> +
> >> +     while (cnt) {
> >> +             buf = priv->rx_buf[priv->rx_head];
> >> +             skb = build_skb(buf, priv->rx_buf_size);
> >> +             if (unlikely(!skb))
> >> +                     net_dbg_ratelimited("build_skb failed\n");
> >> +             dma_map_single(&ndev->dev, skb->data,
> >> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
> >> +             memcpy(tmp, skb->data, 64);
> >> +             endian_change((void *)tmp, 64);
> >> +             desc = (struct rx_desc *)tmp;
> >> +             len = desc->pkt_len;
> >
> > The dma_map_single() seems misplaced here, for all I can tell, the
> > data has already been transferred. Maybe you mean dma_unmap_single?
> >
> > I don't see why you copy 64 bytes out of the buffer using endian_change,
> > rather than just looking at the first word, which seems to have the
> > only value you are interested in.
> Russell suggested using be16_to_cpu etc to replace memcpy.

Right, but doesn't it have to be be32_to_cpu?

> >> +static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
> >> +{
> >> +     struct hip04_priv *priv = netdev_priv(ndev);
> >> +     int i;
> >> +
> >> +     priv->rx_buf_size = RX_BUF_SIZE +
> >> +                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >> +
> >> +     priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
> >> +                             SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
> >> +     if (!priv->desc_pool)
> >> +             return -ENOMEM;
> >> +
> >> +     for (i = 0; i < TX_DESC_NUM; i++) {
> >> +             priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
> >> +                                     GFP_ATOMIC, &priv->td_phys[i]);
> >> +             if (!priv->td_ring[i])
> >> +                     return -ENOMEM;
> >> +     }
> >
> > Why do you create a dma pool here, when you do all the allocations upfront?
> >
> > It looks to me like you could simply turn the td_ring array of pointers
> > to tx descriptors into a an array of tx descriptors (no pointers) and allocate
> > that one using dma_alloc_coherent.
> 
> dma pool used here mainly because of alignment,
> the desc has requirement of SKB_DATA_ALIGN,
> so use the simplest way
>         priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
>                                 SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);

dma_alloc_coherent() will actually give you PAGE_SIZE alignment, so that's
still easier.

> >
> >
> >> +     if (!ppebase) {
> >> +             struct device_node *n;
> >> +
> >> +             n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
> >> +             if (!n) {
> >> +                     ret = -EINVAL;
> >> +                     netdev_err(ndev, "not find hisilicon,ppebase\n");
> >> +                     goto init_fail;
> >> +             }
> >> +             ppebase = of_iomap(n, 0);
> >> +     }
> >
> > How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
> > a more generic abstraction of the ppe, and stick the port and id in there as
> > well, e.g.
> >
> >         ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id
> 
> To be honest, I am not familiar with syscon_regmap_lookup_by_phandle.
> 
> Florian has suggested
>               ppe: ppe@28c0000 {
>                         compatible = "hisilicon,hip04-ppe";
>                         reg = <0x28c0000 0x10000>;
>                         #address-cells = <1>;
>                         #size-cells = <0>;
> 
>                         eth0_port: port@1f {
>                                 reg = <31>;

		minor comment: I'd use 0x1f for the reg too.

>                         };
> 
>                         eth1_port: port@0 {
>                                 reg = <0>;
>                         };
> 
>                         eth2_port: port@8 {
>                                 reg = <8>;
>                         };
>                 };
>                 fe: ethernet@28b0000 {
>                         compatible = "hisilicon,hip04-mac";
>                         reg = <0x28b0000 0x10000>;
>                         interrupts = <0 413 4>;
>                         phy-mode = "mii";
>                         port-handle = <&eth0_port>;
>                 };
> And the port info can be found from port-handle
> n = of_parse_phandle(node, "port-handle", 0);
> ret = of_property_read_u32(n, "reg", &priv->port);
> 
> The id is the controller start channel in ppe, either use alias or
> another property in the port.

Yes, this seems fine as well, as long as you are sure that the ppe
device is only used by hip04 ethernet devices, I think it
would get messy if your hardware shares them with other units.

It's probably a little simpler to avoid the sub-nodes and instead do

>               ppe: ppe@28c0000 {
>                         compatible = "hisilicon,hip04-ppe";
>                         reg = <0x28c0000 0x10000>;
>                         #address-cells = <1>;
>                         #size-cells = <0>;
>                 };
>                 fe: ethernet@28b0000 {
>                         compatible = "hisilicon,hip04-mac";
>                         reg = <0x28b0000 0x10000>;
>                         interrupts = <0 413 4>;
>                         phy-mode = "mii";
>                         port-handle = <&ppe 31>;
>                 };

In the code, I would create a simple ppe driver that exports
a few functions. you need. In the ethernet driver probe() function,
you should get a handle to the ppe using

	/* look up "port-handle" property of the current device, find ppe and port */
	struct hip04_ppe *ppe = hip04_ppe_get(dev->of_node); 
	if (IS_ERR(ppe))
		return PTR_ERR(ptr); /* this handles -EPROBE_DEFER */

and then in other code you can just do

	hip04_ppe_set_foo(priv->ppe, foo_config);

This is a somewhat more high-level abstraction that syscon, which
just gives you a 'struct regmap' structure for doing register-level
configuration like you have today.

> >> +     ret = of_property_read_u32(node, "speed", &val);
> >> +     if (ret) {
> >> +             dev_warn(d, "not find speed info\n");
> >> +             priv->speed = SPEED_1000;
> >> +     }
> >> +
> >> +     if (SPEED_100 == val)
> >> +             priv->speed = SPEED_100;
> >> +     else
> >> +             priv->speed = SPEED_1000;
> >> +     priv->duplex = DUPLEX_FULL;
> >
> > Why do you even need the speed here, shouldn't you get that information
> > from the phy through hip04_adjust_link?
> 
> There still 100M mac does not have phy, as well adjust_link.
> Will use phy-mode instead of speed.

Ok.

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 21, 2014, 5:19 a.m. UTC | #6
Dear Arnd

On Thu, Mar 20, 2014 at 10:31 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> > This also seems to encode knowledge about a particular implementation
>> > into the driver. Maybe it's better to add a property for the port
>> > mode?
>>
>> After check Documentation/devicetree/bindings/net/ethernet.txt,
>> I think phy-mode is more suitable here.
>>
>>         switch (priv->phy_mode) {
>>         case PHY_INTERFACE_MODE_SGMII:
>>                 if (speed == SPEED_1000)
>>                         val = 8;
>>                 else
>>                         val = 7;
>>                 break;
>>         case PHY_INTERFACE_MODE_MII:
>>                 val = 1;        /* SPEED_100 */
>>                 break;
>>         default:
>>                 val = 0;
>>                 break;
>>         }
>>         writel_relaxed(val, priv->base + GE_PORT_MODE);
>>
>> probe:
>> priv->phy_mode = of_get_phy_mode(node);
>
> Yes, this looks better, but where does 'speed' come from now? I assume
> even in SGMII mode, you should allow autonegotiation and set this correctly
> from the PHY code. Is that what you do here?

Yes, for the SGMII, there is the auto negotiation.
'speed' coming from adjust_link.
While the MII mode, I will directly set 100M at probe, since no auto
negotiation.

>
>> >> +static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
>> >> +{
>> >> +     writel_relaxed(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
>> >> +}
>> >> +
>> >> +static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
>> >> +{
>> >> +     writel_relaxed(phys, ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
>> >> +}
>> >> +
>> >> +static u32 hip04_recv_cnt(struct hip04_priv *priv)
>> >> +{
>> >> +     return readl_relaxed(priv->base + PPE_HIS_RX_PKT_CNT);
>> >> +}
>> >
>> > At the very least, the hip04_set_xmit_desc() function needs to use 'writel'
>> > rather than 'writel_relaxed'. Otherwise data that is being sent out
>> > can be stuck in the CPU's write buffers and you send stale data on the wire.
>> >
>> > For the receive path, you may or may not need to use 'readl', depending
>> > on how DMA is serialized by this device. If you have MSI interrupts, the
>> > interrupt message should already do the serialization, but if you have
>> > edge or level triggered interrupts, you normally need to have one readl()
>> > from the device register between the IRQ and the data access.
>>
>> Really, will update to readl/wirtel in xmit and hip04_rx_poll.
>> I just got impression, use *_relaxed as much as possible for better performance.
>
> Ok. The _relaxed() versions are really meant for people that understand
> the ordering requirements. The regular readl/writel accessors contain
> extra barriers to make them equivalent to what x86 does.

Thanks for the knowledge.
So generally readl/writel should be put in critical process, where we
want the immediate result,
like irq handler?

>
>> >> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
>> >> +{
>> >> +     struct hip04_priv *priv = container_of(napi,
>> >> +                           struct hip04_priv, napi);
>> >> +     struct net_device *ndev = priv->ndev;
>> >> +     struct sk_buff *skb;
>> >> +     struct rx_desc *desc;
>> >> +     unsigned char *buf;
>> >> +     int rx = 0;
>> >> +     unsigned int cnt = hip04_recv_cnt(priv);
>> >> +     unsigned int len, tmp[16];
>> >> +
>> >> +     while (cnt) {
>> >> +             buf = priv->rx_buf[priv->rx_head];
>> >> +             skb = build_skb(buf, priv->rx_buf_size);
>> >> +             if (unlikely(!skb))
>> >> +                     net_dbg_ratelimited("build_skb failed\n");
>> >> +             dma_map_single(&ndev->dev, skb->data,
>> >> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
>> >> +             memcpy(tmp, skb->data, 64);
>> >> +             endian_change((void *)tmp, 64);
>> >> +             desc = (struct rx_desc *)tmp;
>> >> +             len = desc->pkt_len;
>> >
>> > The dma_map_single() seems misplaced here, for all I can tell, the
>> > data has already been transferred. Maybe you mean dma_unmap_single?
>> >
>> > I don't see why you copy 64 bytes out of the buffer using endian_change,
>> > rather than just looking at the first word, which seems to have the
>> > only value you are interested in.
>> Russell suggested using be16_to_cpu etc to replace memcpy.
>
> Right, but doesn't it have to be be32_to_cpu?

The reason I got frustrated before is there are u16 and u32 in the structure.
While Russell reminded changing the structure layout.
So switching u16 & be16_to_cpu, and using be32_to_cpu for u32 works out.

>
>> >> +static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
>> >> +{
>> >> +     struct hip04_priv *priv = netdev_priv(ndev);
>> >> +     int i;
>> >> +
>> >> +     priv->rx_buf_size = RX_BUF_SIZE +
>> >> +                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> >> +
>> >> +     priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
>> >> +                             SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
>> >> +     if (!priv->desc_pool)
>> >> +             return -ENOMEM;
>> >> +
>> >> +     for (i = 0; i < TX_DESC_NUM; i++) {
>> >> +             priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
>> >> +                                     GFP_ATOMIC, &priv->td_phys[i]);
>> >> +             if (!priv->td_ring[i])
>> >> +                     return -ENOMEM;
>> >> +     }
>> >
>> > Why do you create a dma pool here, when you do all the allocations upfront?
>> >
>> > It looks to me like you could simply turn the td_ring array of pointers
>> > to tx descriptors into a an array of tx descriptors (no pointers) and allocate
>> > that one using dma_alloc_coherent.
>>
>> dma pool used here mainly because of alignment,
>> the desc has requirement of SKB_DATA_ALIGN,
>> so use the simplest way
>>         priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
>>                                 SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
>
> dma_alloc_coherent() will actually give you PAGE_SIZE alignment, so that's
> still easier.
However since the alignment requirement, it can not simply use desc[i]
to get each desc.
desc = dma_alloc_coherent(d, size, &phys, GFP_KERNEL);
desc[i] is not what we want.
So still prefer using dma_pool_alloc here.
>
>> >
>> >
>> >> +     if (!ppebase) {
>> >> +             struct device_node *n;
>> >> +
>> >> +             n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
>> >> +             if (!n) {
>> >> +                     ret = -EINVAL;
>> >> +                     netdev_err(ndev, "not find hisilicon,ppebase\n");
>> >> +                     goto init_fail;
>> >> +             }
>> >> +             ppebase = of_iomap(n, 0);
>> >> +     }
>> >
>> > How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
>> > a more generic abstraction of the ppe, and stick the port and id in there as
>> > well, e.g.
>> >
>> >         ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id
>>
>> To be honest, I am not familiar with syscon_regmap_lookup_by_phandle.
>>
>> Florian has suggested
>>               ppe: ppe@28c0000 {
>>                         compatible = "hisilicon,hip04-ppe";
>>                         reg = <0x28c0000 0x10000>;
>>                         #address-cells = <1>;
>>                         #size-cells = <0>;
>>
>>                         eth0_port: port@1f {
>>                                 reg = <31>;
>
>                 minor comment: I'd use 0x1f for the reg too.
>
>>                         };
>>
>>                         eth1_port: port@0 {
>>                                 reg = <0>;
>>                         };
>>
>>                         eth2_port: port@8 {
>>                                 reg = <8>;
>>                         };
>>                 };
>>                 fe: ethernet@28b0000 {
>>                         compatible = "hisilicon,hip04-mac";
>>                         reg = <0x28b0000 0x10000>;
>>                         interrupts = <0 413 4>;
>>                         phy-mode = "mii";
>>                         port-handle = <&eth0_port>;
>>                 };
>> And the port info can be found from port-handle
>> n = of_parse_phandle(node, "port-handle", 0);
>> ret = of_property_read_u32(n, "reg", &priv->port);
>>
>> The id is the controller start channel in ppe, either use alias or
>> another property in the port.
>
> Yes, this seems fine as well, as long as you are sure that the ppe
> device is only used by hip04 ethernet devices, I think it
> would get messy if your hardware shares them with other units.

Checked syscon_regmap, looks it specially useful for the accessing
common registers.
Here ppe is ethernet "accelerator", used specifically in this driver.
Since here only have several places access the ppebase, it maybe
simpler access directly.

>
> It's probably a little simpler to avoid the sub-nodes and instead do
>
>>               ppe: ppe@28c0000 {
>>                         compatible = "hisilicon,hip04-ppe";
>>                         reg = <0x28c0000 0x10000>;
>>                         #address-cells = <1>;
>>                         #size-cells = <0>;
>>                 };
>>                 fe: ethernet@28b0000 {
>>                         compatible = "hisilicon,hip04-mac";
>>                         reg = <0x28b0000 0x10000>;
>>                         interrupts = <0 413 4>;
>>                         phy-mode = "mii";
>>                         port-handle = <&ppe 31>;
>>                 };
>
> In the code, I would create a simple ppe driver that exports
> a few functions. you need. In the ethernet driver probe() function,
> you should get a handle to the ppe using
>
>         /* look up "port-handle" property of the current device, find ppe and port */
>         struct hip04_ppe *ppe = hip04_ppe_get(dev->of_node);
>         if (IS_ERR(ppe))
>                 return PTR_ERR(ptr); /* this handles -EPROBE_DEFER */
>
> and then in other code you can just do
>
>         hip04_ppe_set_foo(priv->ppe, foo_config);
>
> This is a somewhat more high-level abstraction that syscon, which
> just gives you a 'struct regmap' structure for doing register-level
> configuration like you have today.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann March 21, 2014, 7:37 a.m. UTC | #7
On Friday 21 March 2014 13:19:07 Zhangfei Gao wrote:
> On Thu, Mar 20, 2014 at 10:31 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>
> > Yes, this looks better, but where does 'speed' come from now? I assume
> > even in SGMII mode, you should allow autonegotiation and set this correctly
> > from the PHY code. Is that what you do here?
> 
> Yes, for the SGMII, there is the auto negotiation.
> 'speed' coming from adjust_link.
> While the MII mode, I will directly set 100M at probe, since no auto
> negotiation.

Ok, good.

> >
> >> >> +static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
> >> >> +{
> >> >> +     writel_relaxed(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
> >> >> +}
> >> >> +
> >> >> +static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
> >> >> +{
> >> >> +     writel_relaxed(phys, ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
> >> >> +}
> >> >> +
> >> >> +static u32 hip04_recv_cnt(struct hip04_priv *priv)
> >> >> +{
> >> >> +     return readl_relaxed(priv->base + PPE_HIS_RX_PKT_CNT);
> >> >> +}
> >> >
> >> > At the very least, the hip04_set_xmit_desc() function needs to use 'writel'
> >> > rather than 'writel_relaxed'. Otherwise data that is being sent out
> >> > can be stuck in the CPU's write buffers and you send stale data on the wire.
> >> >
> >> > For the receive path, you may or may not need to use 'readl', depending
> >> > on how DMA is serialized by this device. If you have MSI interrupts, the
> >> > interrupt message should already do the serialization, but if you have
> >> > edge or level triggered interrupts, you normally need to have one readl()
> >> > from the device register between the IRQ and the data access.
> >>
> >> Really, will update to readl/wirtel in xmit and hip04_rx_poll.
> >> I just got impression, use *_relaxed as much as possible for better performance.
> >
> > Ok. The _relaxed() versions are really meant for people that understand
> > the ordering requirements. The regular readl/writel accessors contain
> > extra barriers to make them equivalent to what x86 does.
> 
> Thanks for the knowledge.
> So generally readl/writel should be put in critical process, where we
> want the immediate result,
> like irq handler?

Strictly speaking you only need one writel after preparing an
outbound DMA buffer, and and one readl before reading an inbound
DMA buffer.

> >> >> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
> >> >> +{
> >> >> +     struct hip04_priv *priv = container_of(napi,
> >> >> +                           struct hip04_priv, napi);
> >> >> +     struct net_device *ndev = priv->ndev;
> >> >> +     struct sk_buff *skb;
> >> >> +     struct rx_desc *desc;
> >> >> +     unsigned char *buf;
> >> >> +     int rx = 0;
> >> >> +     unsigned int cnt = hip04_recv_cnt(priv);
> >> >> +     unsigned int len, tmp[16];
> >> >> +
> >> >> +     while (cnt) {
> >> >> +             buf = priv->rx_buf[priv->rx_head];
> >> >> +             skb = build_skb(buf, priv->rx_buf_size);
> >> >> +             if (unlikely(!skb))
> >> >> +                     net_dbg_ratelimited("build_skb failed\n");
> >> >> +             dma_map_single(&ndev->dev, skb->data,
> >> >> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
> >> >> +             memcpy(tmp, skb->data, 64);
> >> >> +             endian_change((void *)tmp, 64);
> >> >> +             desc = (struct rx_desc *)tmp;
> >> >> +             len = desc->pkt_len;
> >> >
> >> > The dma_map_single() seems misplaced here, for all I can tell, the
> >> > data has already been transferred. Maybe you mean dma_unmap_single?
> >> >
> >> > I don't see why you copy 64 bytes out of the buffer using endian_change,
> >> > rather than just looking at the first word, which seems to have the
> >> > only value you are interested in.
> >> Russell suggested using be16_to_cpu etc to replace memcpy.
> >
> > Right, but doesn't it have to be be32_to_cpu?
> 
> The reason I got frustrated before is there are u16 and u32 in the structure.
> While Russell reminded changing the structure layout.
> So switching u16 & be16_to_cpu, and using be32_to_cpu for u32 works out.

Ok. The endian_change() function did multiple 32-bit swaps, so I assumed
that was what the design required. Swapping each field individually with
its actual length is more sensible though, so if that works, it is probably
the correct solution and the endian_change() function was in fact wrong.

> >> >> +static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
> >> >> +{
> >> >> +     struct hip04_priv *priv = netdev_priv(ndev);
> >> >> +     int i;
> >> >> +
> >> >> +     priv->rx_buf_size = RX_BUF_SIZE +
> >> >> +                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >> >> +
> >> >> +     priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
> >> >> +                             SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
> >> >> +     if (!priv->desc_pool)
> >> >> +             return -ENOMEM;
> >> >> +
> >> >> +     for (i = 0; i < TX_DESC_NUM; i++) {
> >> >> +             priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
> >> >> +                                     GFP_ATOMIC, &priv->td_phys[i]);
> >> >> +             if (!priv->td_ring[i])
> >> >> +                     return -ENOMEM;
> >> >> +     }
> >> >
> >> > Why do you create a dma pool here, when you do all the allocations upfront?
> >> >
> >> > It looks to me like you could simply turn the td_ring array of pointers
> >> > to tx descriptors into a an array of tx descriptors (no pointers) and allocate
> >> > that one using dma_alloc_coherent.
> >>
> >> dma pool used here mainly because of alignment,
> >> the desc has requirement of SKB_DATA_ALIGN,
> >> so use the simplest way
> >>         priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
> >>                                 SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
> >
> > dma_alloc_coherent() will actually give you PAGE_SIZE alignment, so that's
> > still easier.
> However since the alignment requirement, it can not simply use desc[i]
> to get each desc.
> desc = dma_alloc_coherent(d, size, &phys, GFP_KERNEL);
> desc[i] is not what we want.
> So still prefer using dma_pool_alloc here.

Ah, I see what you mean: struct tx_desc is actually smaller than the
required alignment. You can fix that by marking the structure
"____cacheline_aligned". 

> >>                         };
> >>
> >>                         eth1_port: port@0 {
> >>                                 reg = <0>;
> >>                         };
> >>
> >>                         eth2_port: port@8 {
> >>                                 reg = <8>;
> >>                         };
> >>                 };
> >>                 fe: ethernet@28b0000 {
> >>                         compatible = "hisilicon,hip04-mac";
> >>                         reg = <0x28b0000 0x10000>;
> >>                         interrupts = <0 413 4>;
> >>                         phy-mode = "mii";
> >>                         port-handle = <&eth0_port>;
> >>                 };
> >> And the port info can be found from port-handle
> >> n = of_parse_phandle(node, "port-handle", 0);
> >> ret = of_property_read_u32(n, "reg", &priv->port);
> >>
> >> The id is the controller start channel in ppe, either use alias or
> >> another property in the port.
> >
> > Yes, this seems fine as well, as long as you are sure that the ppe
> > device is only used by hip04 ethernet devices, I think it
> > would get messy if your hardware shares them with other units.
> 
> Checked syscon_regmap, looks it specially useful for the accessing
> common registers.
> Here ppe is ethernet "accelerator", used specifically in this driver.
> Since here only have several places access the ppebase, it maybe
> simpler access directly.

Right. I was just bringing it up because some similar designs
(e.g. TI keystone or APM x-gene) share an accelarator between
multiple users, not just the ethernet devices, and in that case
you might need a more general solution.

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 21, 2014, 7:56 a.m. UTC | #8
Dear Arnd

On Fri, Mar 21, 2014 at 3:37 PM, Arnd Bergmann <arnd@arndb.de> wrote:

>> >> >> +static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
>> >> >> +{
>> >> >> +     struct hip04_priv *priv = netdev_priv(ndev);
>> >> >> +     int i;
>> >> >> +
>> >> >> +     priv->rx_buf_size = RX_BUF_SIZE +
>> >> >> +                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> >> >> +
>> >> >> +     priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
>> >> >> +                             SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
>> >> >> +     if (!priv->desc_pool)
>> >> >> +             return -ENOMEM;
>> >> >> +
>> >> >> +     for (i = 0; i < TX_DESC_NUM; i++) {
>> >> >> +             priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
>> >> >> +                                     GFP_ATOMIC, &priv->td_phys[i]);
>> >> >> +             if (!priv->td_ring[i])
>> >> >> +                     return -ENOMEM;
>> >> >> +     }
>> >> >
>> >> > Why do you create a dma pool here, when you do all the allocations upfront?
>> >> >
>> >> > It looks to me like you could simply turn the td_ring array of pointers
>> >> > to tx descriptors into a an array of tx descriptors (no pointers) and allocate
>> >> > that one using dma_alloc_coherent.
>> >>
>> >> dma pool used here mainly because of alignment,
>> >> the desc has requirement of SKB_DATA_ALIGN,
>> >> so use the simplest way
>> >>         priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
>> >>                                 SKB_DATA_ALIGN(sizeof(struct tx_desc)), 0);
>> >
>> > dma_alloc_coherent() will actually give you PAGE_SIZE alignment, so that's
>> > still easier.
>> However since the alignment requirement, it can not simply use desc[i]
>> to get each desc.
>> desc = dma_alloc_coherent(d, size, &phys, GFP_KERNEL);
>> desc[i] is not what we want.
>> So still prefer using dma_pool_alloc here.
>
> Ah, I see what you mean: struct tx_desc is actually smaller than the
> required alignment. You can fix that by marking the structure
> "____cacheline_aligned".
>

Yes, after recheck the method carefully, it works with __aligned(0x40).
"____cacheline_aligned" is much better.
The desc can be get with either table or pointer.

Will update accordingly, it is simpler.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 24, 2014, 8:17 a.m. UTC | #9
Dear Arnd

On Thu, Mar 20, 2014 at 10:31 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thursday 20 March 2014, Zhangfei Gao wrote:
>> On Tue, Mar 18, 2014 at 7:25 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> > On Tuesday 18 March 2014 16:40:17 Zhangfei Gao wrote:
>> >
>> >> +
>> >> +static void __iomem *ppebase;
>> >
>> > The global 'ppebase' seems hacky. Isn't that a SoC-specific register area, while
>> > the rest of the driver is reusable across SoCs?
>> >
>> > What does 'ppe' stand for?
>> >
>> > What if there are multiple instances of this, which each have their own ppebase?
>>
>> In this specific platform,
>> ppe is the only module controlling all the fifos for all the net controller.
>> And each controller connect to specific port.
>> ppe has 2048 channels, sharing by all the controller, only if not overlapped.
>>
>> So the static ppebase is required, which I don't like too.
>> Two inputs required, one is port, which is connect to the controller.
>> The other is start channel, currently I use id, and start channel is
>> RX_DESC_NUM * priv->id;  /* start_addr */
>
> Ok, thanks for the explanation!
>
I thought you are fine with "static void __iomem *ppebase" here.

>> >
>> >> +     if (!ppebase) {
>> >> +             struct device_node *n;
>> >> +
>> >> +             n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
>> >> +             if (!n) {
>> >> +                     ret = -EINVAL;
>> >> +                     netdev_err(ndev, "not find hisilicon,ppebase\n");
>> >> +                     goto init_fail;
>> >> +             }
>> >> +             ppebase = of_iomap(n, 0);
>> >> +     }
>> >
>> > How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
>> > a more generic abstraction of the ppe, and stick the port and id in there as
>> > well, e.g.
>> >
>> >         ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id

Even if using syscon_regmap_lookup_by_phandle, there still have static
struct regmap, since three controllers
share one regmap.

> It's probably a little simpler to avoid the sub-nodes and instead do
>
>>               ppe: ppe@28c0000 {
>>                         compatible = "hisilicon,hip04-ppe";
>>                         reg = <0x28c0000 0x10000>;
>>                         #address-cells = <1>;
>>                         #size-cells = <0>;
>>                 };
>>                 fe: ethernet@28b0000 {
>>                         compatible = "hisilicon,hip04-mac";
>>                         reg = <0x28b0000 0x10000>;
>>                         interrupts = <0 413 4>;
>>                         phy-mode = "mii";
>>                         port-handle = <&ppe 31>;
>>                 };
>
> In the code, I would create a simple ppe driver that exports
> a few functions. you need. In the ethernet driver probe() function,
> you should get a handle to the ppe using
>
>         /* look up "port-handle" property of the current device, find ppe and port */
>         struct hip04_ppe *ppe = hip04_ppe_get(dev->of_node);
>         if (IS_ERR(ppe))
>                 return PTR_ERR(ptr); /* this handles -EPROBE_DEFER */
>
> and then in other code you can just do
>
>         hip04_ppe_set_foo(priv->ppe, foo_config);
>
> This is a somewhat more high-level abstraction that syscon, which
> just gives you a 'struct regmap' structure for doing register-level
> configuration like you have today.
>

Do you mean create one additional file like ppe.c with some exported
function to remove the static ppebase?
Since the ppe is specifically bounded with ethernet, and does not used
anywhere else,
the exported function may not be used anywhere else.
Is it make it more complicated since there are probe, remove etc.

So I may still prefer using "static void __iomem *ppebase", as it is simpler.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann March 24, 2014, 10:02 a.m. UTC | #10
On Monday 24 March 2014 16:17:42 Zhangfei Gao wrote:
> On Thu, Mar 20, 2014 at 10:31 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> >> >
> >> >> +     if (!ppebase) {
> >> >> +             struct device_node *n;
> >> >> +
> >> >> +             n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
> >> >> +             if (!n) {
> >> >> +                     ret = -EINVAL;
> >> >> +                     netdev_err(ndev, "not find hisilicon,ppebase\n");
> >> >> +                     goto init_fail;
> >> >> +             }
> >> >> +             ppebase = of_iomap(n, 0);
> >> >> +     }
> >> >
> >> > How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
> >> > a more generic abstraction of the ppe, and stick the port and id in there as
> >> > well, e.g.
> >> >
> >> >         ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id
> 
> Even if using syscon_regmap_lookup_by_phandle, there still have static
> struct regmap, since three controllers
> share one regmap.

The regmap is then owned by the syscon driver, while each controller takes
a reference to the regmap that it can store in its own private data
structure. However, as we discussed using a ppe driver sounds nicer than
regmap.

> > It's probably a little simpler to avoid the sub-nodes and instead do
> >
> >>               ppe: ppe@28c0000 {
> >>                         compatible = "hisilicon,hip04-ppe";
> >>                         reg = <0x28c0000 0x10000>;
> >>                         #address-cells = <1>;
> >>                         #size-cells = <0>;
> >>                 };
> >>                 fe: ethernet@28b0000 {
> >>                         compatible = "hisilicon,hip04-mac";
> >>                         reg = <0x28b0000 0x10000>;
> >>                         interrupts = <0 413 4>;
> >>                         phy-mode = "mii";
> >>                         port-handle = <&ppe 31>;
> >>                 };
> >
> > In the code, I would create a simple ppe driver that exports
> > a few functions. you need. In the ethernet driver probe() function,
> > you should get a handle to the ppe using
> >
> >         /* look up "port-handle" property of the current device, find ppe and port */
> >         struct hip04_ppe *ppe = hip04_ppe_get(dev->of_node);
> >         if (IS_ERR(ppe))
> >                 return PTR_ERR(ptr); /* this handles -EPROBE_DEFER */
> >
> > and then in other code you can just do
> >
> >         hip04_ppe_set_foo(priv->ppe, foo_config);
> >
> > This is a somewhat more high-level abstraction that syscon, which
> > just gives you a 'struct regmap' structure for doing register-level
> > configuration like you have today.
> >
> 
> Do you mean create one additional file like ppe.c with some exported
> function to remove the static ppebase?

It doesn't have to be a separate file, as long as you register a
separate platform_driver for the ppe.

> Since the ppe is specifically bounded with ethernet, and does not used
> anywhere else,
> the exported function may not be used anywhere else.
> Is it make it more complicated since there are probe, remove etc.
> 
> So I may still prefer using "static void __iomem *ppebase", as it is simpler.

The trouble is that the driver should not rely on being only there
for a single instance, that's not how we write drivers.

I'm fine with either a syscon instance (which would be simpler) or a
separate ppe driver as part of the hip04-mac driver (which would be
a nicer abstraction).

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 24, 2014, 1:23 p.m. UTC | #11
On Mon, Mar 24, 2014 at 6:02 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Monday 24 March 2014 16:17:42 Zhangfei Gao wrote:
>> On Thu, Mar 20, 2014 at 10:31 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> >> >
>> >> >> +     if (!ppebase) {
>> >> >> +             struct device_node *n;
>> >> >> +
>> >> >> +             n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
>> >> >> +             if (!n) {
>> >> >> +                     ret = -EINVAL;
>> >> >> +                     netdev_err(ndev, "not find hisilicon,ppebase\n");
>> >> >> +                     goto init_fail;
>> >> >> +             }
>> >> >> +             ppebase = of_iomap(n, 0);
>> >> >> +     }
>> >> >
>> >> > How about using syscon_regmap_lookup_by_phandle() here? That way, you can have
>> >> > a more generic abstraction of the ppe, and stick the port and id in there as
>> >> > well, e.g.
>> >> >
>> >> >         ppe-syscon = <&hip04ppe 1 4>; // ppe, port, id
>>
>> Even if using syscon_regmap_lookup_by_phandle, there still have static
>> struct regmap, since three controllers
>> share one regmap.
>
> The regmap is then owned by the syscon driver, while each controller takes
> a reference to the regmap that it can store in its own private data
> structure. However, as we discussed using a ppe driver sounds nicer than
> regmap.
>
>> > It's probably a little simpler to avoid the sub-nodes and instead do
>> >
>> >>               ppe: ppe@28c0000 {
>> >>                         compatible = "hisilicon,hip04-ppe";
>> >>                         reg = <0x28c0000 0x10000>;
>> >>                         #address-cells = <1>;
>> >>                         #size-cells = <0>;
>> >>                 };
>> >>                 fe: ethernet@28b0000 {
>> >>                         compatible = "hisilicon,hip04-mac";
>> >>                         reg = <0x28b0000 0x10000>;
>> >>                         interrupts = <0 413 4>;
>> >>                         phy-mode = "mii";
>> >>                         port-handle = <&ppe 31>;
>> >>                 };
>> >
>> > In the code, I would create a simple ppe driver that exports
>> > a few functions. you need. In the ethernet driver probe() function,
>> > you should get a handle to the ppe using
>> >
>> >         /* look up "port-handle" property of the current device, find ppe and port */
>> >         struct hip04_ppe *ppe = hip04_ppe_get(dev->of_node);
>> >         if (IS_ERR(ppe))
>> >                 return PTR_ERR(ptr); /* this handles -EPROBE_DEFER */
>> >
>> > and then in other code you can just do
>> >
>> >         hip04_ppe_set_foo(priv->ppe, foo_config);
>> >
>> > This is a somewhat more high-level abstraction that syscon, which
>> > just gives you a 'struct regmap' structure for doing register-level
>> > configuration like you have today.
>> >
>>
>> Do you mean create one additional file like ppe.c with some exported
>> function to remove the static ppebase?
>
> It doesn't have to be a separate file, as long as you register a
> separate platform_driver for the ppe.
>
>> Since the ppe is specifically bounded with ethernet, and does not used
>> anywhere else,
>> the exported function may not be used anywhere else.
>> Is it make it more complicated since there are probe, remove etc.
>>
>> So I may still prefer using "static void __iomem *ppebase", as it is simpler.
>
> The trouble is that the driver should not rely on being only there
> for a single instance, that's not how we write drivers.
>
> I'm fine with either a syscon instance (which would be simpler) or a
> separate ppe driver as part of the hip04-mac driver (which would be
> a nicer abstraction).
>

Understand now.
Will update with syscon, as it is simpler.
Thanks for your patience.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rob Herring March 24, 2014, 2:17 p.m. UTC | #12
On Thu, Mar 20, 2014 at 4:51 AM, Zhangfei Gao <zhangfei.gao@gmail.com> wrote:
> Dear Russell
>
> Thanks for sparing time and giving so many perfect suggestion, really helpful.
>
> On Tue, Mar 18, 2014 at 6:46 PM, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
>> I was just browsing this patch when I noticed some of these issues - I
>> haven't done a full review of this driver, I'm just commenting on the
>> things I've spotted.

[snip]

>>> +             dma_map_single(&ndev->dev, skb->data,
>>> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
>>
>> This is incorrect.
>>
>> buf = buffer alloc()
>> /* CPU owns buffer and can read/write it, device does not */
>> dev_addr = dma_map_single(dev, buf, ..., DMA_FROM_DEVICE);
>> /* Device owns buffer and can write it, CPU does not access it */
>> dma_unmap_single(dev, dev_addr, ..., DMA_FROM_DEVICE);
>> /* CPU owns buffer again and can read/write it, device does not */
>>
>> Please turn on DMA API debugging in the kernel debug options and verify
>> whether your driver causes it to complain (it will.)
>
> Yes, you are right.
> After change to dma_map/unmap_single, however, still get warning like
> "DMA-API: device driver failed to check map error", not sure whether
> it can be ignored?

If it could be ignored, there would be no warning. So yes you should
check the error. I guess correct error handling would be throwing away
the packet.


>>> +             dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);
>>> +             hip04_set_recv_desc(priv, virt_to_phys(buf));
>>
>> No need for virt_to_phys() here - dma_map_single() returns the device
>> address.
> Got it.
> Use virt_to_phys since find same result come out, it should be
> different for iommu case.
>
> In fact, the hardware can help to do the cache flushing, the function
> still not be enabled now.
> Then dma_map/unmap_single may be ignored.

If you don't need cache flushing, you should setup different
dma_map_ops for the device such as arm_coherent_dma_ops. The driver
should always have the dma_map calls. See highbank and mvebu for
examples.

Rob
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhangfei Gao March 26, 2014, 2:22 p.m. UTC | #13
Dear Rob

On Mon, Mar 24, 2014 at 10:17 PM, Rob Herring <robherring2@gmail.com> wrote:

>
>>>> +             dma_map_single(&ndev->dev, skb->data,
>>>> +                     RX_BUF_SIZE, DMA_FROM_DEVICE);
>>>
>>> This is incorrect.
>>>
>>> buf = buffer alloc()
>>> /* CPU owns buffer and can read/write it, device does not */
>>> dev_addr = dma_map_single(dev, buf, ..., DMA_FROM_DEVICE);
>>> /* Device owns buffer and can write it, CPU does not access it */
>>> dma_unmap_single(dev, dev_addr, ..., DMA_FROM_DEVICE);
>>> /* CPU owns buffer again and can read/write it, device does not */
>>>
>>> Please turn on DMA API debugging in the kernel debug options and verify
>>> whether your driver causes it to complain (it will.)
>>
>> Yes, you are right.
>> After change to dma_map/unmap_single, however, still get warning like
>> "DMA-API: device driver failed to check map error", not sure whether
>> it can be ignored?
>
> If it could be ignored, there would be no warning. So yes you should
> check the error. I guess correct error handling would be throwing away
> the packet.

The warning is solved via adding dma_mapping_error check after every
dma_map_single.

>
>
>>>> +             dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);
>>>> +             hip04_set_recv_desc(priv, virt_to_phys(buf));
>>>
>>> No need for virt_to_phys() here - dma_map_single() returns the device
>>> address.
>> Got it.
>> Use virt_to_phys since find same result come out, it should be
>> different for iommu case.
>>
>> In fact, the hardware can help to do the cache flushing, the function
>> still not be enabled now.
>> Then dma_map/unmap_single may be ignored.
>
> If you don't need cache flushing, you should setup different
> dma_map_ops for the device such as arm_coherent_dma_ops. The driver
> should always have the dma_map calls. See highbank and mvebu for
> examples.

That would be very helpful, still not sure how to enable this feature.
Could you help clarify which file.
Not find highbank and mvebu under drivers/iommu/.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/hisilicon/Makefile b/drivers/net/ethernet/hisilicon/Makefile
index 1d6eb6e..e6fe7af 100644
--- a/drivers/net/ethernet/hisilicon/Makefile
+++ b/drivers/net/ethernet/hisilicon/Makefile
@@ -2,4 +2,4 @@ 
 # Makefile for the HISILICON network device drivers.
 #
 
-obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o
+obj-$(CONFIG_HIP04_ETH) += hip04_eth.o hip04_mdio.o
diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
new file mode 100644
index 0000000..b12e0df
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -0,0 +1,717 @@ 
+
+/* Copyright (c) 2014 Linaro Ltd.
+ * Copyright (c) 2014 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/of_address.h>
+#include <linux/dmapool.h>
+#include <linux/phy.h>
+#include <linux/of_mdio.h>
+
+#define PPE_CFG_RX_CFF_ADDR		0x100
+#define PPE_CFG_POOL_GRP		0x300
+#define PPE_CFG_RX_BUF_SIZE		0x400
+#define PPE_CFG_RX_FIFO_SIZE		0x500
+#define PPE_CURR_BUF_CNT_REG		0xa200
+
+#define GE_DUPLEX_TYPE			0x8
+#define GE_MAX_FRM_SIZE_REG		0x3c
+#define GE_PORT_MODE			0x40
+#define GE_PORT_EN			0x44
+#define GE_SHORT_RUNTS_THR_REG		0x50
+#define GE_TX_LOCAL_PAGE_REG		0x5c
+#define GE_TRANSMIT_CONTROL_REG		0x60
+#define GE_CF_CRC_STRIP_REG		0x1b0
+#define GE_MODE_CHANGE_EN		0x1b4
+#define GE_RECV_CONTROL_REG		0x1e0
+#define GE_STATION_MAC_ADDRESS		0x210
+#define PPE_CFG_TX_PKT_BD_ADDR		0x420
+#define PPE_CFG_MAX_FRAME_LEN_REG	0x408
+#define PPE_CFG_BUS_CTRL_REG		0x424
+#define PPE_CFG_RX_CTRL_REG		0x428
+#define PPE_CFG_RX_PKT_MODE_REG		0x438
+#define PPE_CFG_QOS_VMID_GEN		0x500
+#define PPE_CFG_RX_PKT_INT		0x538
+#define PPE_INTEN			0x600
+#define PPE_INTSTS			0x608
+#define PPE_RINT			0x604
+#define PPE_CFG_STS_MODE		0x700
+#define PPE_HIS_RX_PKT_CNT		0x804
+
+/* REG_INTERRUPT */
+#define RCV_INT				BIT(10)
+#define RCV_NOBUF			BIT(8)
+#define DEF_INT_MASK			0x41fdf
+
+#define RX_DESC_NUM			64
+#define TX_DESC_NUM			64
+#define TX_NEXT(N)			(((N) + 1) & (TX_DESC_NUM-1))
+#define RX_NEXT(N)			(((N) + 1) & (RX_DESC_NUM-1))
+
+#define GMAC_PPE_RX_PKT_MAX_LEN		379
+#define GMAC_MAX_PKT_LEN		1516
+#define DESC_DEF_CFG			0x14
+#define RX_BUF_SIZE			1600
+#define TX_TIMEOUT			(6 * HZ)
+
+#define DRV_NAME			"hip04-ether"
+
+struct tx_desc {
+	u32 send_addr;
+	u16 send_size;
+	u16 reserved[3];
+	u32 cfg;
+	u32 wb_addr;
+};
+
+struct rx_desc {
+	u16 pkt_len;
+	u16 reserved_16;
+	u32 reserve[8];
+};
+
+struct hip04_priv {
+	void __iomem *base;
+	unsigned int port;
+	unsigned int speed;
+	unsigned int duplex;
+	unsigned int id;
+	unsigned int reg_inten;
+
+	struct napi_struct napi;
+	struct net_device *ndev;
+	struct dma_pool *desc_pool;
+
+	struct sk_buff *tx_skb[TX_DESC_NUM];
+	struct tx_desc *td_ring[TX_DESC_NUM];
+	dma_addr_t td_phys[TX_DESC_NUM];
+	spinlock_t txlock;
+	unsigned int tx_head;
+	unsigned int tx_tail;
+	unsigned int tx_count;
+
+	unsigned char *rx_buf[RX_DESC_NUM];
+	unsigned int rx_head;
+	unsigned int rx_buf_size;
+
+	struct device_node *phy_node;
+	struct phy_device *phy;
+};
+
+static void __iomem *ppebase;
+
+static void hip04_config_port(struct hip04_priv *priv, u32 speed, u32 duplex)
+{
+	u32 val;
+
+	priv->speed = speed;
+	priv->duplex = duplex;
+
+	switch (speed) {
+	case SPEED_1000:
+		val = 8;
+		break;
+	case SPEED_100:
+		if (priv->id)
+			val = 7;
+		else
+			val = 1;
+		break;
+	default:
+		val = 0;
+		break;
+	}
+	writel_relaxed(val, priv->base + GE_PORT_MODE);
+
+	val = (duplex) ? BIT(0) : 0;
+	writel_relaxed(val, priv->base + GE_DUPLEX_TYPE);
+
+	val = BIT(0);
+	writel_relaxed(val, priv->base + GE_MODE_CHANGE_EN);
+}
+
+static void hip04_reset_ppe(struct hip04_priv *priv)
+{
+	u32 val;
+
+	do {
+		val =
+		readl_relaxed(ppebase + priv->port * 4 + PPE_CURR_BUF_CNT_REG);
+		readl_relaxed(ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
+	} while (val & 0xfff);
+}
+
+static void hip04_config_fifo(struct hip04_priv *priv)
+{
+	u32 val;
+
+	val = readl_relaxed(priv->base + PPE_CFG_STS_MODE);
+	val |= BIT(12);			/* PPE_HIS_RX_PKT_CNT read clear */
+	writel_relaxed(val, priv->base + PPE_CFG_STS_MODE);
+
+	val = BIT(priv->port);
+	writel_relaxed(val, ppebase + priv->port * 4 + PPE_CFG_POOL_GRP);
+
+	val = priv->port << 8;
+	val |= BIT(14);
+	writel_relaxed(val, priv->base + PPE_CFG_QOS_VMID_GEN);
+
+	val = RX_BUF_SIZE;
+	writel_relaxed(val, ppebase + priv->port * 4 + PPE_CFG_RX_BUF_SIZE);
+
+	val = RX_DESC_NUM << 16;	/* depth */
+	val |= BIT(11);			/* seq: first set first ues */
+	val |= RX_DESC_NUM * priv->id;	/* start_addr */
+	writel_relaxed(val, ppebase + priv->port * 4 + PPE_CFG_RX_FIFO_SIZE);
+
+	/* pkt store format */
+	val = NET_IP_ALIGN << 11;	/* align */
+	writel_relaxed(val, priv->base + PPE_CFG_RX_CTRL_REG);
+
+	/* following cfg required for 1000M */
+	/* pkt mode */
+	val = BIT(18);			/* align */
+	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_MODE_REG);
+
+	/* set bus ctrl */
+	val = BIT(14);			/* buffer locally release */
+	val |= BIT(0);			/* big endian */
+	writel_relaxed(val, priv->base + PPE_CFG_BUS_CTRL_REG);
+
+	/* set max pkt len, curtail if exceed */
+	val = GMAC_PPE_RX_PKT_MAX_LEN;	/* max buffer len */
+	writel_relaxed(val, priv->base + PPE_CFG_MAX_FRAME_LEN_REG);
+
+	/* set max len of each pkt */
+	val = GMAC_MAX_PKT_LEN;		/* max buffer len */
+	writel_relaxed(val, priv->base + GE_MAX_FRM_SIZE_REG);
+
+	/* set min len of each pkt */
+	val = 31;			/* min buffer len */
+	writel_relaxed(val, priv->base + GE_SHORT_RUNTS_THR_REG);
+
+	/* tx */
+	val = readl_relaxed(priv->base + GE_TRANSMIT_CONTROL_REG);
+	val |= BIT(5);			/* tx auto neg */
+	val |= BIT(6);			/* tx add crc */
+	val |= BIT(7);			/* tx short pad through */
+	writel_relaxed(val, priv->base + GE_TRANSMIT_CONTROL_REG);
+
+	/* rx crc */
+	val = BIT(0);			/* rx strip crc */
+	writel_relaxed(val, priv->base + GE_CF_CRC_STRIP_REG);
+
+	/* rx */
+	val = readl_relaxed(priv->base + GE_RECV_CONTROL_REG);
+	val |= BIT(3);			/* rx strip pad */
+	val |= BIT(4);			/* run pkt en */
+	writel_relaxed(val, priv->base + GE_RECV_CONTROL_REG);
+
+	/* auto neg control */
+	val = BIT(0);
+	writel_relaxed(val, priv->base + GE_TX_LOCAL_PAGE_REG);
+}
+
+static void hip04_mac_enable(struct net_device *ndev, bool enable)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	if (enable) {
+		/* enable tx & rx */
+		val = readl_relaxed(priv->base + GE_PORT_EN);
+		val |= BIT(1);		/* rx*/
+		val |= BIT(2);		/* tx*/
+		writel_relaxed(val, priv->base + GE_PORT_EN);
+
+		/* enable interrupt */
+		priv->reg_inten = DEF_INT_MASK;
+		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+
+		/* clear rx int */
+		val = RCV_INT;
+		writel_relaxed(val, priv->base + PPE_RINT);
+
+		/* config recv int*/
+		val = BIT(6);		/* int threshold 1 package */
+		val |= 0x4;		/* recv timeout */
+		writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_INT);
+	} else {
+		/* disable int */
+		priv->reg_inten &= ~(RCV_INT | RCV_NOBUF);
+		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+
+		/* disable tx & rx */
+		val = readl_relaxed(priv->base + GE_PORT_EN);
+		val &= ~(BIT(1));	/* rx*/
+		val &= ~(BIT(2));	/* tx*/
+		writel_relaxed(val, priv->base + GE_PORT_EN);
+	}
+}
+
+static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
+{
+	writel_relaxed(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
+}
+
+static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
+{
+	writel_relaxed(phys, ppebase + priv->port * 4 + PPE_CFG_RX_CFF_ADDR);
+}
+
+static u32 hip04_recv_cnt(struct hip04_priv *priv)
+{
+	return readl_relaxed(priv->base + PPE_HIS_RX_PKT_CNT);
+}
+
+static void hip04_update_mac_address(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+
+	writel_relaxed(((ndev->dev_addr[0] << 8) | (ndev->dev_addr[1])),
+			priv->base + GE_STATION_MAC_ADDRESS);
+	writel_relaxed(((ndev->dev_addr[2] << 24) | (ndev->dev_addr[3] << 16) |
+			(ndev->dev_addr[4] << 8) | (ndev->dev_addr[5])),
+			priv->base + GE_STATION_MAC_ADDRESS + 4);
+}
+
+static int hip04_set_mac_address(struct net_device *ndev, void *addr)
+{
+	eth_mac_addr(ndev, addr);
+	hip04_update_mac_address(ndev);
+	return 0;
+}
+
+static void endian_change(void *p, int size)
+{
+	unsigned int *to_cover = (unsigned int *)p;
+	int i;
+
+	size = size >> 2;
+	for (i = 0; i < size; i++)
+		*(to_cover+i) = htonl(*(to_cover+i));
+}
+
+static int hip04_rx_poll(struct napi_struct *napi, int budget)
+{
+	struct hip04_priv *priv = container_of(napi,
+			      struct hip04_priv, napi);
+	struct net_device *ndev = priv->ndev;
+	struct sk_buff *skb;
+	struct rx_desc *desc;
+	unsigned char *buf;
+	int rx = 0;
+	unsigned int cnt = hip04_recv_cnt(priv);
+	unsigned int len, tmp[16];
+
+	while (cnt) {
+		buf = priv->rx_buf[priv->rx_head];
+		skb = build_skb(buf, priv->rx_buf_size);
+		if (unlikely(!skb))
+			net_dbg_ratelimited("build_skb failed\n");
+		dma_map_single(&ndev->dev, skb->data,
+			RX_BUF_SIZE, DMA_FROM_DEVICE);
+		memcpy(tmp, skb->data, 64);
+		endian_change((void *)tmp, 64);
+		desc = (struct rx_desc *)tmp;
+		len = desc->pkt_len;
+
+		if (len > RX_BUF_SIZE)
+			len = RX_BUF_SIZE;
+		if (0 == len)
+			break;
+
+		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+		skb_put(skb, len);
+		skb->protocol = eth_type_trans(skb, ndev);
+		napi_gro_receive(&priv->napi, skb);
+
+		buf = netdev_alloc_frag(priv->rx_buf_size);
+		if (!buf)
+			return -ENOMEM;
+		priv->rx_buf[priv->rx_head] = buf;
+		dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_TO_DEVICE);
+		hip04_set_recv_desc(priv, virt_to_phys(buf));
+
+		priv->rx_head = RX_NEXT(priv->rx_head);
+		if (rx++ >= budget)
+			break;
+
+		if (--cnt == 0)
+			cnt = hip04_recv_cnt(priv);
+	}
+
+	if (rx < budget) {
+		napi_gro_flush(napi, false);
+		__napi_complete(napi);
+	}
+
+	/* enable rx interrupt */
+	priv->reg_inten |= RCV_INT | RCV_NOBUF;
+	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+
+	return rx;
+}
+
+static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
+{
+	struct net_device *ndev = (struct net_device *) dev_id;
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
+	u32 val = DEF_INT_MASK;
+
+	writel_relaxed(val, priv->base + PPE_RINT);
+
+	if ((ists & RCV_INT) || (ists & RCV_NOBUF)) {
+		if (napi_schedule_prep(&priv->napi)) {
+			/* disable rx interrupt */
+			priv->reg_inten &= ~(RCV_INT | RCV_NOBUF);
+			writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+			__napi_schedule(&priv->napi);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void hip04_tx_reclaim(struct net_device *ndev, bool force)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	unsigned tx_head = priv->tx_head;
+	unsigned tx_tail = priv->tx_tail;
+	struct tx_desc *desc = priv->td_ring[priv->tx_tail];
+
+	spin_lock_irq(&priv->txlock);
+	while (tx_tail != tx_head) {
+		if (desc->send_addr != 0) {
+			if (force)
+				desc->send_addr = 0;
+			else
+				break;
+		}
+		dev_kfree_skb_irq(priv->tx_skb[tx_tail]);
+		priv->tx_skb[tx_tail] = NULL;
+		tx_tail = TX_NEXT(tx_tail);
+		priv->tx_count--;
+	}
+	priv->tx_tail = tx_tail;
+	spin_unlock_irq(&priv->txlock);
+}
+
+static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct tx_desc *desc = priv->td_ring[priv->tx_head];
+	unsigned int tx_head = priv->tx_head;
+	int ret;
+
+	hip04_tx_reclaim(ndev, false);
+
+	spin_lock_irq(&priv->txlock);
+	if (priv->tx_count++ >= TX_DESC_NUM) {
+		net_dbg_ratelimited("no TX space for packet\n");
+		netif_stop_queue(ndev);
+		ret = NETDEV_TX_BUSY;
+		goto out_unlock;
+	}
+
+	priv->tx_skb[tx_head] = skb;
+	dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
+	memset((void *)desc, 0, sizeof(*desc));
+	desc->send_addr = (unsigned int)virt_to_phys(skb->data);
+	desc->send_size = skb->len;
+	desc->cfg = DESC_DEF_CFG;
+	desc->wb_addr = priv->td_phys[tx_head];
+	endian_change(desc, 64);
+	skb_tx_timestamp(skb);
+	hip04_set_xmit_desc(priv, priv->td_phys[tx_head]);
+
+	priv->tx_head = TX_NEXT(tx_head);
+	ret = NETDEV_TX_OK;
+out_unlock:
+	spin_unlock_irq(&priv->txlock);
+
+	return ret;
+}
+
+static void hip04_adjust_link(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct phy_device *phy = priv->phy;
+
+	if ((priv->speed != phy->speed) || (priv->duplex != phy->duplex)) {
+		hip04_config_port(priv, phy->speed, phy->duplex);
+		phy_print_status(phy);
+	}
+}
+
+static int hip04_mac_open(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	hip04_reset_ppe(priv);
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		dma_map_single(&ndev->dev, priv->rx_buf[i],
+				RX_BUF_SIZE, DMA_TO_DEVICE);
+		hip04_set_recv_desc(priv, virt_to_phys(priv->rx_buf[i]));
+	}
+	priv->rx_head = 0;
+	priv->tx_head = 0;
+	priv->tx_tail = 0;
+	priv->tx_count = 0;
+
+	if (priv->phy_node) {
+		priv->phy = of_phy_connect(ndev, priv->phy_node,
+			&hip04_adjust_link, 0, PHY_INTERFACE_MODE_GMII);
+		if (!priv->phy)
+			return -ENODEV;
+		phy_start(priv->phy);
+	}
+
+	netif_start_queue(ndev);
+	hip04_mac_enable(ndev, true);
+	napi_enable(&priv->napi);
+	return 0;
+}
+
+static int hip04_mac_stop(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+
+	if (priv->phy)
+		phy_disconnect(priv->phy);
+	priv->phy = NULL;
+
+	napi_disable(&priv->napi);
+	netif_stop_queue(ndev);
+	hip04_mac_enable(ndev, false);
+	hip04_tx_reclaim(ndev, true);
+	hip04_reset_ppe(priv);
+	return 0;
+}
+
+static void hip04_timeout(struct net_device *ndev)
+{
+	netif_wake_queue(ndev);
+	return;
+}
+
+static struct net_device_ops hip04_netdev_ops = {
+	.ndo_open		= hip04_mac_open,
+	.ndo_stop		= hip04_mac_stop,
+	.ndo_start_xmit		= hip04_mac_start_xmit,
+	.ndo_set_mac_address	= hip04_set_mac_address,
+	.ndo_tx_timeout         = hip04_timeout,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+};
+
+static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	priv->rx_buf_size = RX_BUF_SIZE +
+			    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	priv->desc_pool = dma_pool_create(DRV_NAME, d, sizeof(struct tx_desc),
+				SKB_DATA_ALIGN(sizeof(struct tx_desc)),	0);
+	if (!priv->desc_pool)
+		return -ENOMEM;
+
+	for (i = 0; i < TX_DESC_NUM; i++) {
+		priv->td_ring[i] = dma_pool_alloc(priv->desc_pool,
+					GFP_ATOMIC, &priv->td_phys[i]);
+		if (!priv->td_ring[i])
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		priv->rx_buf[i] = netdev_alloc_frag(priv->rx_buf_size);
+		if (!priv->rx_buf[i])
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void hip04_free_ring(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	for (i = 0; i < RX_DESC_NUM; i++)
+		if (priv->rx_buf[i])
+			put_page(virt_to_head_page(priv->rx_buf[i]));
+
+	for (i = 0; i < TX_DESC_NUM; i++) {
+		if (priv->tx_skb[i])
+			dev_kfree_skb_any(priv->tx_skb[i]);
+		if ((priv->desc_pool) && (priv->td_ring[i]))
+			dma_pool_free(priv->desc_pool, priv->td_ring[i],
+					priv->td_phys[i]);
+	}
+
+	if (priv->desc_pool)
+		dma_pool_destroy(priv->desc_pool);
+}
+
+static int hip04_mac_probe(struct platform_device *pdev)
+{
+	struct device *d = &pdev->dev;
+	struct device_node *node = d->of_node;
+	struct net_device *ndev;
+	struct hip04_priv *priv;
+	struct resource *res;
+	unsigned int irq, val;
+	int ret;
+
+	ndev = alloc_etherdev(sizeof(struct hip04_priv));
+	if (!ndev)
+		return -ENOMEM;
+
+	priv = netdev_priv(ndev);
+	priv->ndev = ndev;
+	platform_set_drvdata(pdev, ndev);
+	spin_lock_init(&priv->txlock);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		ret = -EINVAL;
+		goto init_fail;
+	}
+	ndev->base_addr = res->start;
+	priv->base = devm_ioremap_resource(d, res);
+	ret = IS_ERR(priv->base);
+	if (ret) {
+		dev_err(d, "devm_ioremap_resource failed\n");
+		goto init_fail;
+	}
+
+	if (!ppebase) {
+		struct device_node *n;
+
+		n = of_find_compatible_node(NULL, NULL, "hisilicon,hip04-ppebase");
+		if (!n) {
+			ret = -EINVAL;
+			netdev_err(ndev, "not find hisilicon,ppebase\n");
+			goto init_fail;
+		}
+		ppebase = of_iomap(n, 0);
+	}
+
+	ret = of_property_read_u32(node, "port", &val);
+	if (ret) {
+		dev_warn(d, "not find port info\n");
+		goto init_fail;
+	}
+	priv->port = val & 0x1f;
+
+	ret = of_property_read_u32(node, "speed", &val);
+	if (ret) {
+		dev_warn(d, "not find speed info\n");
+		priv->speed = SPEED_1000;
+	}
+
+	if (SPEED_100 == val)
+		priv->speed = SPEED_100;
+	else
+		priv->speed = SPEED_1000;
+	priv->duplex = DUPLEX_FULL;
+
+	ret = of_property_read_u32(node, "id", &priv->id);
+	if (ret) {
+		dev_warn(d, "not find id info\n");
+		goto init_fail;
+	}
+	irq = platform_get_irq(pdev, 0);
+	if (irq <= 0) {
+		ret = -EINVAL;
+		goto init_fail;
+	}
+	ether_setup(ndev);
+	ndev->netdev_ops = &hip04_netdev_ops;
+	ndev->watchdog_timeo = TX_TIMEOUT;
+	ndev->priv_flags |= IFF_UNICAST_FLT;
+	ndev->irq = irq;
+	netif_napi_add(ndev, &priv->napi, hip04_rx_poll, RX_DESC_NUM);
+	SET_NETDEV_DEV(ndev, &pdev->dev);
+
+	hip04_reset_ppe(priv);
+	hip04_config_port(priv, priv->speed, priv->duplex);
+	hip04_config_fifo(priv);
+	random_ether_addr(ndev->dev_addr);
+	hip04_update_mac_address(ndev);
+
+	ret = hip04_alloc_ring(ndev, d);
+	if (ret) {
+		netdev_err(ndev, "alloc ring fail\n");
+		goto alloc_fail;
+	}
+
+	ret = devm_request_irq(d, irq, hip04_mac_interrupt,
+					0, pdev->name, ndev);
+	if (ret) {
+		netdev_err(ndev, "devm_request_irq failed\n");
+		goto alloc_fail;
+	}
+
+	priv->phy_node = of_parse_phandle(node, "phy-handle", 0);
+
+	ret = register_netdev(ndev);
+	if (ret) {
+		free_netdev(ndev);
+		goto alloc_fail;
+	}
+
+	return 0;
+alloc_fail:
+	hip04_free_ring(ndev);
+init_fail:
+	of_node_put(priv->phy_node);
+	free_netdev(ndev);
+	return ret;
+}
+
+static int hip04_remove(struct platform_device *pdev)
+{
+	struct net_device *ndev = platform_get_drvdata(pdev);
+	struct hip04_priv *priv = netdev_priv(ndev);
+
+	hip04_free_ring(ndev);
+	unregister_netdev(ndev);
+	free_irq(ndev->irq, ndev);
+	of_node_put(priv->phy_node);
+	free_netdev(ndev);
+
+	return 0;
+}
+
+static const struct of_device_id hip04_mac_match[] = {
+	{ .compatible = "hisilicon,hip04-mac" },
+	{ }
+};
+
+static struct platform_driver hip04_mac_driver = {
+	.probe	= hip04_mac_probe,
+	.remove	= hip04_remove,
+	.driver	= {
+		.name		= DRV_NAME,
+		.owner		= THIS_MODULE,
+		.of_match_table	= hip04_mac_match,
+	},
+};
+module_platform_driver(hip04_mac_driver);
+
+MODULE_DESCRIPTION("HISILICON P04 Ethernet driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:hip04-ether");