
[3/3] net: hisilicon: new hip04 ethernet driver

Message ID: 1396358832-15828-4-git-send-email-zhangfei.gao@linaro.org
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Zhangfei Gao April 1, 2014, 1:27 p.m. UTC
Add support for the Hisilicon hip04 ethernet driver, including the 100M / 1000M controller

Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
---
 drivers/net/ethernet/hisilicon/Makefile    |    2 +-
 drivers/net/ethernet/hisilicon/hip04_eth.c |  771 ++++++++++++++++++++++++++++
 2 files changed, 772 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/hisilicon/hip04_eth.c

Comments

Arnd Bergmann April 2, 2014, 9:21 a.m. UTC | #1
On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)

While it looks like there are no serious functionality bugs left, this
function is rather inefficient, as has been pointed out before:

> +{
> +       struct hip04_priv *priv = netdev_priv(ndev);
> +       struct net_device_stats *stats = &ndev->stats;
> +       unsigned int tx_head = priv->tx_head;
> +       struct tx_desc *desc = &priv->tx_desc[tx_head];
> +       dma_addr_t phys;
> +
> +       hip04_tx_reclaim(ndev, false);
> +       mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
> +
> +       if (priv->tx_count >= TX_DESC_NUM) {
> +               netif_stop_queue(ndev);
> +               return NETDEV_TX_BUSY;
> +       }

This is where you have two problems:

- if the descriptor ring is full, you wait for RECLAIM_PERIOD,
  which is far too long at 500ms, because during that time you
  are not able to add further data to the stopped queue.

- As David Laight pointed out earlier, you must also ensure that
  you don't have too much /data/ pending in the descriptor ring
  when you stop the queue. For a 10mbit connection, you have already
  tested (as we discussed on IRC) that 64 descriptors with 1500 byte
  frames gives you a 68ms round-trip ping time, which is too much.
  Conversely, on 1gbit, having only 64 descriptors actually seems
  a little low, and you may be able to get better throughput if
  you extend the ring to e.g. 512 descriptors.

> +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> +       if (dma_mapping_error(&ndev->dev, phys)) {
> +               dev_kfree_skb(skb);
> +               return NETDEV_TX_OK;
> +       }
> +
> +       priv->tx_skb[tx_head] = skb;
> +       priv->tx_phys[tx_head] = phys;
> +       desc->send_addr = cpu_to_be32(phys);
> +       desc->send_size = cpu_to_be16(skb->len);
> +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
> +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
> +       desc->wb_addr = cpu_to_be32(phys);

One detail: since you don't have cache-coherent DMA, "desc" will
reside in uncached memory, so you try to minimize the number of accesses.
It's probably faster if you build the descriptor on the stack and
then atomically copy it over, rather than assigning each member at
a time.

The same would be true for the rx descriptors.
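For illustration, a minimal sketch of that idea, reusing the tx_desc layout
from this patch; hip04_fill_tx_desc() is a hypothetical helper, not part of
the submission:

/* Build the descriptor in cached (stack) memory, then copy it to the
 * uncached ring entry in one burst instead of field by field.
 */
static void hip04_fill_tx_desc(struct hip04_priv *priv, unsigned int tx_head,
			       dma_addr_t phys, unsigned int len)
{
	struct tx_desc d = { 0 };	/* cached, on the stack */

	d.send_addr = cpu_to_be32(phys);
	d.send_size = cpu_to_be16(len);
	d.cfg = cpu_to_be32(DESC_DEF_CFG);
	d.wb_addr = cpu_to_be32(priv->tx_desc_dma +
				tx_head * sizeof(struct tx_desc));

	/* one copy into the uncached descriptor ring */
	memcpy(&priv->tx_desc[tx_head], &d, sizeof(d));
}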

> +       skb_tx_timestamp(skb);
> +
> +       /* Don't wait up for transmitted skbs to be freed. */
> +       skb_orphan(skb);
> +
> +       hip04_set_xmit_desc(priv, phys);
> +       priv->tx_head = TX_NEXT(tx_head);
> +
> +       stats->tx_bytes += skb->len;
> +       stats->tx_packets++;
> +       priv->tx_count++;
> +
> +       return NETDEV_TX_OK;
> +}

	Arnd
Zhangfei Gao April 2, 2014, 9:51 a.m. UTC | #2
Dear Arnd

On 04/02/2014 05:21 PM, Arnd Bergmann wrote:
> On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
>> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>
> While it looks like there are no serious functionality bugs left, this
> function is rather inefficient, as has been pointed out before:

Yes, more performance tuning is still needed in the next step.
We need to enable the hardware cache-flush feature, with the help of the
arm-smmu, so that dma_map_single etc. can be removed.

>
>> +{
>> +       struct hip04_priv *priv = netdev_priv(ndev);
>> +       struct net_device_stats *stats = &ndev->stats;
>> +       unsigned int tx_head = priv->tx_head;
>> +       struct tx_desc *desc = &priv->tx_desc[tx_head];
>> +       dma_addr_t phys;
>> +
>> +       hip04_tx_reclaim(ndev, false);
>> +       mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
>> +
>> +       if (priv->tx_count >= TX_DESC_NUM) {
>> +               netif_stop_queue(ndev);
>> +               return NETDEV_TX_BUSY;
>> +       }
>
> This is where you have two problems:
>
> - if the descriptor ring is full, you wait for RECLAIM_PERIOD,
>    which is far too long at 500ms, because during that time you
>    are not able to add further data to the stopped queue.

Understood.
The idea here is to avoid using the timer as much as possible.
Experiments show that the best throughput is achieved when only xmit
reclaims the buffers.

>
> - As David Laight pointed out earlier, you must also ensure that
>    you don't have too much /data/ pending in the descriptor ring
>    when you stop the queue. For a 10mbit connection, you have already
>    tested (as we discussed on IRC) that 64 descriptors with 1500 byte
>    frames gives you a 68ms round-trip ping time, which is too much.

When iperf & ping running together and only ping, it is 0.7 ms.

>    Conversely, on 1gbit, having only 64 descriptors actually seems
>    a little low, and you may be able to get better throughput if
>    you extend the ring to e.g. 512 descriptors.

OK, I will check the throughput with more xmit descriptors.
But wasn't it said not to use too many descriptors for xmit, since there is
no xmit interrupt?

>
>> +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>> +       if (dma_mapping_error(&ndev->dev, phys)) {
>> +               dev_kfree_skb(skb);
>> +               return NETDEV_TX_OK;
>> +       }
>> +
>> +       priv->tx_skb[tx_head] = skb;
>> +       priv->tx_phys[tx_head] = phys;
>> +       desc->send_addr = cpu_to_be32(phys);
>> +       desc->send_size = cpu_to_be16(skb->len);
>> +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
>> +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
>> +       desc->wb_addr = cpu_to_be32(phys);
>
> One detail: since you don't have cache-coherent DMA, "desc" will
> reside in uncached memory, so you try to minimize the number of accesses.
> It's probably faster if you build the descriptor on the stack and
> then atomically copy it over, rather than assigning each member at
> a time.

I am sorry, I do not quite understand, could you clarify more?
The phys address and size etc. of skb->data change, so they need to be assigned.
If the member contents were constant, they could be set at initialization.

>
> The same would be true for the rx descriptors.
>

Thanks
David Laight April 2, 2014, 10:04 a.m. UTC | #3
From: Arnd Bergmann
> On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
> > +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> 
> While it looks like there are no serious functionality bugs left, this
> function is rather inefficient, as has been pointed out before:
> 
> > +{
> > +       struct hip04_priv *priv = netdev_priv(ndev);
> > +       struct net_device_stats *stats = &ndev->stats;
> > +       unsigned int tx_head = priv->tx_head;
> > +       struct tx_desc *desc = &priv->tx_desc[tx_head];
> > +       dma_addr_t phys;
> > +
> > +       hip04_tx_reclaim(ndev, false);
> > +       mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
> > +
> > +       if (priv->tx_count >= TX_DESC_NUM) {
> > +               netif_stop_queue(ndev);
> > +               return NETDEV_TX_BUSY;
> > +       }
> 
> This is where you have two problems:
> 
> - if the descriptor ring is full, you wait for RECLAIM_PERIOD,
>   which is far too long at 500ms, because during that time you
>   are not able to add further data to the stopped queue.

Best to have some idea how long it will take for the ring to empty.
IIRC you need a count of the bytes in the tx ring anyway.
There isn't much point waking up until most of the queued
transmits have had time to complete.

> - As David Laight pointed out earlier, you must also ensure that
>   you don't have too much /data/ pending in the descriptor ring
>   when you stop the queue. For a 10mbit connection, you have already
>   tested (as we discussed on IRC) that 64 descriptors with 1500 byte
>   frames gives you a 68ms round-trip ping time, which is too much.
>   Conversely, on 1gbit, having only 64 descriptors actually seems
>   a little low, and you may be able to get better throughput if
>   you extend the ring to e.g. 512 descriptors.

The descriptor count matters most for small packets.
There are workloads (I've got one) that can send 1000s of small packets/sec
on a single TCP connection (there will be receive traffic).

> > +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> > +       if (dma_mapping_error(&ndev->dev, phys)) {
> > +               dev_kfree_skb(skb);
> > +               return NETDEV_TX_OK;
> > +       }
> > +
> > +       priv->tx_skb[tx_head] = skb;
> > +       priv->tx_phys[tx_head] = phys;
> > +       desc->send_addr = cpu_to_be32(phys);
> > +       desc->send_size = cpu_to_be16(skb->len);
> > +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
> > +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
> > +       desc->wb_addr = cpu_to_be32(phys);
> 
> One detail: since you don't have cache-coherent DMA, "desc" will
> reside in uncached memory, so you try to minimize the number of accesses.
> It's probably faster if you build the descriptor on the stack and
> then atomically copy it over, rather than assigning each member at
> a time.

I'm not sure, the writes to uncached memory will probably be
asynchronous, but you may avoid a stall by separating the
cycles in time.
What you need to avoid is reads from uncached memory.
It may well be beneficial for the tx reclaim code to first
check whether all the transmits have completed (likely)
instead of testing each descriptor in turn.
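A minimal sketch of that fast path, assuming the hardware completes
descriptors in submission order and clears send_addr on write-back as in
this patch (hip04_tx_all_done() is a hypothetical helper):

static bool hip04_tx_all_done(struct hip04_priv *priv)
{
	/* index of the most recently submitted descriptor */
	unsigned int last = (priv->tx_head - 1) & (TX_DESC_NUM - 1);

	if (!priv->tx_count)
		return true;

	/* one uncached read: if the newest descriptor has been written
	 * back, every older one has completed as well
	 */
	return priv->tx_desc[last].send_addr == 0;
}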

> The same would be true for the rx descriptors.

Actually it is reasonably feasible to put the rx descriptors
in cacheable memory and to flush the cache lines after adding
new entries.
You just need to add the entries one cache line full at a time
(and ensure that the rx processing code doesn't dirty the line).

Without cache-coherent memory cached tx descriptors are much harder work.

	David





Arnd Bergmann April 2, 2014, 3:24 p.m. UTC | #4
On Wednesday 02 April 2014 17:51:54 zhangfei wrote:
> Dear Arnd
> 
> On 04/02/2014 05:21 PM, Arnd Bergmann wrote:
> > On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
> >> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> >
> > While it looks like there are no serious functionality bugs left, this
> > function is rather inefficient, as has been pointed out before:
> 
> Yes, more performance tuning is still needed in the next step.
> We need to enable the hardware cache-flush feature, with the help of the
> arm-smmu, so that dma_map_single etc. can be removed.

You cannot remove the dma_map_single call here, but the implementation
of that function will be different when you use the iommu_coherent_ops:
Instead of flushing the caches, it will create or remove an iommu entry
and return the bus address.

I remember you mentioned before that using the iommu on this particular
SoC actually gives you cache-coherent DMA, so you may also be able
to use arm_coherent_dma_ops if you can set up a static 1:1 mapping 
between bus and phys addresses.

> >> +{
> >> +       struct hip04_priv *priv = netdev_priv(ndev);
> >> +       struct net_device_stats *stats = &ndev->stats;
> >> +       unsigned int tx_head = priv->tx_head;
> >> +       struct tx_desc *desc = &priv->tx_desc[tx_head];
> >> +       dma_addr_t phys;
> >> +
> >> +       hip04_tx_reclaim(ndev, false);
> >> +       mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
> >> +
> >> +       if (priv->tx_count >= TX_DESC_NUM) {
> >> +               netif_stop_queue(ndev);
> >> +               return NETDEV_TX_BUSY;
> >> +       }
> >
> > This is where you have two problems:
> >
> > - if the descriptor ring is full, you wait for RECLAIM_PERIOD,
> >    which is far too long at 500ms, because during that time you
> >    are not able to add further data to the stopped queue.
> 
> Understood.
> The idea here is to avoid using the timer as much as possible.
> Experiments show that the best throughput is achieved when only xmit
> reclaims the buffers.

I'm only talking about the case where that doesn't work: once you stop
the queue, the xmit function won't get called again until the timer
causes the reclaim to be done and restarts the queue.

> > - As David Laight pointed out earlier, you must also ensure that
> >    you don't have too much /data/ pending in the descriptor ring
> >    when you stop the queue. For a 10mbit connection, you have already
> >    tested (as we discussed on IRC) that 64 descriptors with 1500 byte
> >    frames gives you a 68ms round-trip ping time, which is too much.
> 
> When iperf & ping running together and only ping, it is 0.7 ms.
> 
> >    Conversely, on 1gbit, having only 64 descriptors actually seems
> >    a little low, and you may be able to get better throughput if
> >    you extend the ring to e.g. 512 descriptors.
> 
> OK, I will check the throughput with more xmit descriptors.
> But wasn't it said not to use too many descriptors for xmit, since there is
> no xmit interrupt?

The important part is to limit the time that data spends in the queue,
which is a function of the interface tx speed and the number of bytes
in the queue.

> >> +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> >> +       if (dma_mapping_error(&ndev->dev, phys)) {
> >> +               dev_kfree_skb(skb);
> >> +               return NETDEV_TX_OK;
> >> +       }
> >> +
> >> +       priv->tx_skb[tx_head] = skb;
> >> +       priv->tx_phys[tx_head] = phys;
> >> +       desc->send_addr = cpu_to_be32(phys);
> >> +       desc->send_size = cpu_to_be16(skb->len);
> >> +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
> >> +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
> >> +       desc->wb_addr = cpu_to_be32(phys);
> >
> > One detail: since you don't have cache-coherent DMA, "desc" will
> > reside in uncached memory, so you try to minimize the number of accesses.
> > It's probably faster if you build the descriptor on the stack and
> > then atomically copy it over, rather than assigning each member at
> > a time.
> 
> I am sorry, I do not quite understand, could you clarify more?
> The phys address and size etc. of skb->data change, so they need to be assigned.
> If the member contents were constant, they could be set at initialization.

I meant you should use 64-bit accesses here instead of multiple 32 and
16 bit accesses, but as David noted, it's actually not as big a deal
for the writes as it is for the reads from uncached memory.

The important part is to avoid the line where you do 'if (desc->send_addr
!= 0)' as much as possible.

	Arnd
Arnd Bergmann April 2, 2014, 3:49 p.m. UTC | #5
On Wednesday 02 April 2014 10:04:34 David Laight wrote:
> From: Arnd Bergmann
> > On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
> > > +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> > > +       if (dma_mapping_error(&ndev->dev, phys)) {
> > > +               dev_kfree_skb(skb);
> > > +               return NETDEV_TX_OK;
> > > +       }
> > > +
> > > +       priv->tx_skb[tx_head] = skb;
> > > +       priv->tx_phys[tx_head] = phys;
> > > +       desc->send_addr = cpu_to_be32(phys);
> > > +       desc->send_size = cpu_to_be16(skb->len);
> > > +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
> > > +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
> > > +       desc->wb_addr = cpu_to_be32(phys);
> > 
> > One detail: since you don't have cache-coherent DMA, "desc" will
> > reside in uncached memory, so you try to minimize the number of accesses.
> > It's probably faster if you build the descriptor on the stack and
> > then atomically copy it over, rather than assigning each member at
> > a time.
> 
> I'm not sure, the writes to uncached memory will probably be
> asynchronous, but you may avoid a stall by separating the
> cycles in time.

Right.

> What you need to avoid is reads from uncached memory.
> It may well be beneficial for the tx reclaim code to first
> check whether all the transmits have completed (likely)
> instead of testing each descriptor in turn.

Good point, reading from noncached memory is actually the
part that matters. For slow networks (e.g. 10mbit), checking if
all of the descriptors have finished is not quite as likely to succeed
as for fast (gbit), especially if the timeout is set to expire
before all descriptors have completed.

If it makes a lot of difference to performance, one could use
a binary search over the outstanding descriptors rather than looking
just at the last one.
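A minimal sketch of that search, under the same in-order-completion
assumption (hip04_tx_done_count() is a hypothetical helper returning how
many of the outstanding descriptors are safe to reclaim):

static unsigned int hip04_tx_done_count(struct hip04_priv *priv)
{
	unsigned int lo = 0, hi = priv->tx_count;

	/* completed descriptors form a prefix of the outstanding range
	 * starting at tx_tail, so O(log n) uncached reads find its length
	 */
	while (lo < hi) {
		unsigned int mid = lo + (hi - lo + 1) / 2;
		unsigned int idx = (priv->tx_tail + mid - 1) &
				   (TX_DESC_NUM - 1);

		if (priv->tx_desc[idx].send_addr == 0)
			lo = mid;	/* first mid entries are done */
		else
			hi = mid - 1;
	}
	return lo;
}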

> > The same would be true for the rx descriptors.
> 
> Actually it is reasonably feasible to put the rx descriptors
> in cacheable memory and to flush the cache lines after adding
> new entries.
> You just need to add the entries one cache line full at a time
> (and ensure that the rx processing code doesn't dirty the line).

rx descriptors are already using the streaming mapping, so ignore
what I said about them.

	Arnd
Zhangfei Gao April 3, 2014, 6:24 a.m. UTC | #6
On Wed, Apr 2, 2014 at 11:49 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Wednesday 02 April 2014 10:04:34 David Laight wrote:
>> From: Arnd Bergmann
>> > On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
>> > > +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>> > > +       if (dma_mapping_error(&ndev->dev, phys)) {
>> > > +               dev_kfree_skb(skb);
>> > > +               return NETDEV_TX_OK;
>> > > +       }
>> > > +
>> > > +       priv->tx_skb[tx_head] = skb;
>> > > +       priv->tx_phys[tx_head] = phys;
>> > > +       desc->send_addr = cpu_to_be32(phys);
>> > > +       desc->send_size = cpu_to_be16(skb->len);
>> > > +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
>> > > +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
>> > > +       desc->wb_addr = cpu_to_be32(phys);
>> >
>> > One detail: since you don't have cache-coherent DMA, "desc" will
>> > reside in uncached memory, so you try to minimize the number of accesses.
>> > It's probably faster if you build the descriptor on the stack and
>> > then atomically copy it over, rather than assigning each member at
>> > a time.
>>
>> I'm not sure, the writes to uncached memory will probably be
>> asynchronous, but you may avoid a stall by separating the
>> cycles in time.
>
> Right.
>
>> What you need to avoid is reads from uncached memory.
>> It may well be beneficial for the tx reclaim code to first
>> check whether all the transmits have completed (likely)
>> instead of testing each descriptor in turn.
>
> Good point, reading from noncached memory is actually the
> part that matters. For slow networks (e.g. 10mbit), checking if
> all of the descriptors have finished is not quite as likely to succeed
> as for fast (gbit), especially if the timeout is set to expire
> before all descriptors have completed.
>
> If it makes a lot of difference to performance, one could use
> a binary search over the outstanding descriptors rather than looking
> just at the last one.
>

I am afraid there may be no simple way to check whether all transmits have completed.

I still want to enable the cache-coherent feature first.
Then there are two benefits:
1. the dma buffer becomes cacheable.
2. the descriptors can use cacheable memory directly, so the performance
concern here may be solved accordingly.

So how about using this version as the first version, while tuning the
performance in the next step?
Currently, the gbit interface can reach 420 Mbit/s in iperf, and the
100M interface can reach 94 Mbit/s.

Thanks
Arnd Bergmann April 3, 2014, 8:35 a.m. UTC | #7
On Thursday 03 April 2014 14:24:25 Zhangfei Gao wrote:
> On Wed, Apr 2, 2014 at 11:49 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> > On Wednesday 02 April 2014 10:04:34 David Laight wrote:
> >> What you need to avoid is reads from uncached memory.
> >> It may well be beneficial for the tx reclaim code to first
> >> check whether all the transmits have completed (likely)
> >> instead of testing each descriptor in turn.
> >
> > Good point, reading from noncached memory is actually the
> > part that matters. For slow networks (e.g. 10mbit), checking if
> > all of the descriptors have finished is not quite as likely to succeed
> > as for fast (gbit), especially if the timeout is set to expire
> > before all descriptors have completed.
> >
> > If it makes a lot of difference to performance, one could use
> > a binary search over the outstanding descriptors rather than looking
> > just at the last one.
> >
> 
> I am afraid there may be no simple way to check whether all transmits have completed.

Why can't you do the trivial change that David suggested above? It
sounds like a three line change to your current code. No need to do
the binary-search change at first, just see what difference it makes.

> I still want to enable the cache-coherent feature first.
> Then there are two benefits:
> 1. the dma buffer becomes cacheable.
> 2. the descriptors can use cacheable memory directly, so the performance
> concern here may be solved accordingly.
> 
> So how about using this version as the first version, while tuning the
> performance in the next step?
> Currently, the gbit interface can reach 420 Mbit/s in iperf, and the
> 100M interface can reach 94 Mbit/s.

It sounds like a very simple thing to try and you'd know immediately
if it helps or not.

Besides, you still have to change the other two issues I mentioned
regarding the tx reclaim, so you can do all three at once.

	Arnd
David Miller April 3, 2014, 3:22 p.m. UTC | #8
From: David Laight <David.Laight@ACULAB.COM>
Date: Wed, 2 Apr 2014 10:04:34 +0000

> From: Arnd Bergmann
>> On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
>> > +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>> 
>> While it looks like there are no serious functionality bugs left, this
>> function is rather inefficient, as has been pointed out before:
>> 
>> > +{
>> > +       struct hip04_priv *priv = netdev_priv(ndev);
>> > +       struct net_device_stats *stats = &ndev->stats;
>> > +       unsigned int tx_head = priv->tx_head;
>> > +       struct tx_desc *desc = &priv->tx_desc[tx_head];
>> > +       dma_addr_t phys;
>> > +
>> > +       hip04_tx_reclaim(ndev, false);
>> > +       mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
>> > +
>> > +       if (priv->tx_count >= TX_DESC_NUM) {
>> > +               netif_stop_queue(ndev);
>> > +               return NETDEV_TX_BUSY;
>> > +       }
>> 
>> This is where you have two problems:
>> 
>> - if the descriptor ring is full, you wait for RECLAIM_PERIOD,
>>   which is far too long at 500ms, because during that time you
>>   are not able to add further data to the stopped queue.
> 
> Best to have some idea how long it will take for the ring to empty.
> IIRC you need a count of the bytes in the tx ring anyway.
> There isn't much point waking up until most of the queued
> transmits have had time to complete.

There is absolutely no doubt in my mind that you cannot use timers
for this, it simply will never work properly.

There needs to be a real notification from the device of some
sort, and I don't care what form that comes in.
Russell King - ARM Linux April 3, 2014, 3:27 p.m. UTC | #9
On Wed, Apr 02, 2014 at 11:21:45AM +0200, Arnd Bergmann wrote:
> - As David Laight pointed out earlier, you must also ensure that
>   you don't have too much /data/ pending in the descriptor ring
>   when you stop the queue. For a 10mbit connection, you have already
>   tested (as we discussed on IRC) that 64 descriptors with 1500 byte
>   frames gives you a 68ms round-trip ping time, which is too much.
>   Conversely, on 1gbit, having only 64 descriptors actually seems
>   a little low, and you may be able to get better throughput if
>   you extend the ring to e.g. 512 descriptors.

You don't manage that by stopping the queue - there's separate interfaces
where you report how many bytes you've queued (netdev_sent_queue()) and
how many bytes/packets you've sent (netdev_tx_completed_queue()).  This
allows the netdev schedulers to limit how much data is held in the queue,
preserving interactivity while allowing the advantages of larger rings.
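A minimal sketch of wiring those calls into this driver: in
hip04_mac_start_xmit(), call netdev_sent_queue(ndev, skb->len) once the
descriptor has been handed to the hardware, and in the reclaim path report
what was freed in one go. This is a hypothetical variant of the reclaim
loop; dma unmapping and locking are omitted for brevity:

static void hip04_tx_reclaim_bql(struct net_device *ndev)
{
	struct hip04_priv *priv = netdev_priv(ndev);
	unsigned int tx_tail = priv->tx_tail;
	unsigned int bytes = 0, pkts = 0;

	while (tx_tail != priv->tx_head) {
		struct tx_desc *desc = &priv->tx_desc[tx_tail];

		if (desc->send_addr != 0)	/* not completed yet */
			break;

		bytes += priv->tx_skb[tx_tail]->len;
		pkts++;
		dev_kfree_skb(priv->tx_skb[tx_tail]);
		priv->tx_skb[tx_tail] = NULL;
		tx_tail = TX_NEXT(tx_tail);
	}

	priv->tx_tail = tx_tail;
	priv->tx_count -= pkts;

	/* lets the stack bound the bytes queued in the ring (BQL) */
	netdev_completed_queue(ndev, pkts, bytes);
}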

> > +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> > +       if (dma_mapping_error(&ndev->dev, phys)) {
> > +               dev_kfree_skb(skb);
> > +               return NETDEV_TX_OK;
> > +       }
> > +
> > +       priv->tx_skb[tx_head] = skb;
> > +       priv->tx_phys[tx_head] = phys;
> > +       desc->send_addr = cpu_to_be32(phys);
> > +       desc->send_size = cpu_to_be16(skb->len);
> > +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
> > +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
> > +       desc->wb_addr = cpu_to_be32(phys);
> 
> One detail: since you don't have cache-coherent DMA, "desc" will
> reside in uncached memory, so you try to minimize the number of accesses.
> It's probably faster if you build the descriptor on the stack and
> then atomically copy it over, rather than assigning each member at
> a time.

DMA coherent memory is write combining, so multiple writes will be
coalesced.  This also means that barriers may be required to ensure the
descriptors are pushed out in a timely manner if something like writel()
is not used in the transmit-triggering path.
Zhangfei Gao April 3, 2014, 3:38 p.m. UTC | #10
Dear David

On 04/02/2014 06:04 PM, David Laight wrote:
> From: Arnd Bergmann
>> On Tuesday 01 April 2014 21:27:12 Zhangfei Gao wrote:
>>> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>>
>> While it looks like there are no serious functionality bugs left, this
>> function is rather inefficient, as has been pointed out before:
>>
>>> +{
>>> +       struct hip04_priv *priv = netdev_priv(ndev);
>>> +       struct net_device_stats *stats = &ndev->stats;
>>> +       unsigned int tx_head = priv->tx_head;
>>> +       struct tx_desc *desc = &priv->tx_desc[tx_head];
>>> +       dma_addr_t phys;
>>> +
>>> +       hip04_tx_reclaim(ndev, false);
>>> +       mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
>>> +
>>> +       if (priv->tx_count >= TX_DESC_NUM) {
>>> +               netif_stop_queue(ndev);
>>> +               return NETDEV_TX_BUSY;
>>> +       }
>>
>> This is where you have two problems:
>>
>> - if the descriptor ring is full, you wait for RECLAIM_PERIOD,
>>    which is far too long at 500ms, because during that time you
>>    are not able to add further data to the stopped queue.
>
> Best to have some idea how long it will take for the ring to empty.
> IIRC you need a count of the bytes in the tx ring anyway.
> There isn't much point waking up until most of the queued
> transmits have had time to complete.

In fact, there is no good way to check when the packet has been sent out
except checking that the desc becomes 0, and even that is not accurate in
100M mode.
The hardware guys suggest we can assume the data has been sent out once it
is handed to the dma.
Though that works with iperf, I still hesitate to use it.

>
>> - As David Laight pointed out earlier, you must also ensure that
>>    you don't have too much /data/ pending in the descriptor ring
>>    when you stop the queue. For a 10mbit connection, you have already
>>    tested (as we discussed on IRC) that 64 descriptors with 1500 byte
>>    frames gives you a 68ms round-trip ping time, which is too much.
>>    Conversely, on 1gbit, having only 64 descriptors actually seems
>>    a little low, and you may be able to get better throughput if
>>    you extend the ring to e.g. 512 descriptors.
>
> The descriptor count matters most for small packets.
> There are workloads (I've got one) that can send 1000s of small packets/sec
> on a single TCP connection (there will be receive traffic).
>
>>> +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>>> +       if (dma_mapping_error(&ndev->dev, phys)) {
>>> +               dev_kfree_skb(skb);
>>> +               return NETDEV_TX_OK;
>>> +       }
>>> +
>>> +       priv->tx_skb[tx_head] = skb;
>>> +       priv->tx_phys[tx_head] = phys;
>>> +       desc->send_addr = cpu_to_be32(phys);
>>> +       desc->send_size = cpu_to_be16(skb->len);
>>> +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
>>> +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
>>> +       desc->wb_addr = cpu_to_be32(phys);
>>
>> One detail: since you don't have cache-coherent DMA, "desc" will
>> reside in uncached memory, so you try to minimize the number of accesses.
>> It's probably faster if you build the descriptor on the stack and
>> then atomically copy it over, rather than assigning each member at
>> a time.
>
> I'm not sure, the writes to uncached memory will probably be
> asynchronous, but you may avoid a stall by separating the
> cycles in time.
> What you need to avoid is reads from uncached memory.
> It may well be beneficial for the tx reclaim code to first
> check whether all the transmits have completed (likely)
> instead of testing each descriptor in turn.

It may not be needed since there is no better way to check whether all
packets have been sent out.

For the 100M interface, the performance is 94M, so there is almost no room
for improvement.
For the 1G interface, it is rather fast when checked with iperf by default;
only 1 buffer is in use, so each time only 1 buffer is freed.

However, the throughput improves a lot when increasing the thread count:
iperf -P 1, throughput: 420M
iperf -P 2, throughput: 740M
iperf -P 4, throughput: 930M


>
>> The same would be true for the rx descriptors.
>
> Actually it is reasonably feasible to put the rx descriptors
> in cacheable memory and to flush the cache lines after adding
> new entries.
> You just need to add the entries one cache line full at a time
> (and ensure that the rx processing code doesn't dirty the line).
>
> Without cache-coherent memory cached tx descriptors are much harder work.
>
> 	David
>
>
>
>
>
David Laight April 3, 2014, 3:42 p.m. UTC | #11
From: Russell King - ARM Linux
> DMA coherent memory is write combining, so multiple writes will be
> coalesced.  This also means that barriers may be required to ensure the
> descriptors are pushed out in a timely manner if something like writel()
> is not used in the transmit-triggering path.

You also have to ensure that the write that changes the 'owner'
bit is the one that happens last.

If (for example) a descriptor has two words, one containing the
buffer address and the other containing the length and flags,
then you have to absolutely ensure that the hardware will not
read the new 'flags' with the old 'buffer address'.
Any write to tell the hardware to look at the tx ring won't
help you - the hardware might be reading the descriptor anyway.

Even if the accesses are uncached, you need the appropriate
barrier (or volatiles) to stop gcc reordering the writes.
(I think accesses to volatiles can't be reordered - check.)
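A minimal sketch of that ordering with a hypothetical two-word layout (the
hip04 descriptors in this patch have no owner bit, so the names below are
illustrative only):

#define DESC_OWN	BIT(31)

struct demo_desc {
	__be32 addr;
	__be32 len_flags;	/* length plus the DESC_OWN bit */
};

static void demo_push_desc(struct demo_desc *d, dma_addr_t buf, u16 len,
			   void __iomem *doorbell)
{
	d->addr = cpu_to_be32(buf);

	/* make the buffer address visible before handing ownership to the
	 * hardware; also stops the compiler reordering the stores
	 */
	wmb();
	d->len_flags = cpu_to_be32(len | DESC_OWN);

	/* writel() orders the descriptor writes before the doorbell that
	 * tells the hardware to look at the ring
	 */
	writel(1, doorbell);
}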

	David



Russell King - ARM Linux April 3, 2014, 3:50 p.m. UTC | #12
On Thu, Apr 03, 2014 at 03:42:00PM +0000, David Laight wrote:
> From: Russell King - ARM Linux
> > DMA coherent memory is write combining, so multiple writes will be
> > coalesced.  This also means that barriers may be required to ensure the
> > descriptors are pushed out in a timely manner if something like writel()
> > is not used in the transmit-triggering path.
> 
> You also have to ensure that the write that changes the 'owner'
> bit is the one that happens last.
> 
> If (for example) a descriptor has two words, one containing the
> buffer address and the other containing the length and flags,
> then you have to absolutely ensure that the hardware will not
> read the new 'flags' with the old 'buffer address'.
> Any write to tell the hardware to look at the tx ring won't
> help you - the hardware might be reading the descriptor anyway.
> 
> Even if the accesses are uncached, you need the appropriate
> barrier (or volatiles) to stop gcc reordering the writes.
> (I think accesses to volatiles can't be reordered - check.)

Exactly... I wish Freescale were as thoughtful as you are on this point. :)
Arnd Bergmann April 3, 2014, 5:57 p.m. UTC | #13
On Thursday 03 April 2014 16:27:46 Russell King - ARM Linux wrote:
> On Wed, Apr 02, 2014 at 11:21:45AM +0200, Arnd Bergmann wrote:
> > - As David Laight pointed out earlier, you must also ensure that
> >   you don't have too much /data/ pending in the descriptor ring
> >   when you stop the queue. For a 10mbit connection, you have already
> >   tested (as we discussed on IRC) that 64 descriptors with 1500 byte
> >   frames gives you a 68ms round-trip ping time, which is too much.
> >   Conversely, on 1gbit, having only 64 descriptors actually seems
> >   a little low, and you may be able to get better throughput if
> >   you extend the ring to e.g. 512 descriptors.
> 
> You don't manage that by stopping the queue - there's separate interfaces
> where you report how many bytes you've queued (netdev_sent_queue()) and
> how many bytes/packets you've sent (netdev_tx_completed_queue()).  This
> allows the netdev schedulers to limit how much data is held in the queue,
> preserving interactivity while allowing the advantages of larger rings.

Ah, I didn't know about these.  However, reading through the dql code,
it seems that will not work if the tx reclaim is triggered by a timer,
since it expects to get feedback from the actual hardware behavior. :(

I guess this is (part of) what David Miller also meant by saying it won't
ever work properly. 

> > > +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
> > > +       if (dma_mapping_error(&ndev->dev, phys)) {
> > > +               dev_kfree_skb(skb);
> > > +               return NETDEV_TX_OK;
> > > +       }
> > > +
> > > +       priv->tx_skb[tx_head] = skb;
> > > +       priv->tx_phys[tx_head] = phys;
> > > +       desc->send_addr = cpu_to_be32(phys);
> > > +       desc->send_size = cpu_to_be16(skb->len);
> > > +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
> > > +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
> > > +       desc->wb_addr = cpu_to_be32(phys);
> > 
> > One detail: since you don't have cache-coherent DMA, "desc" will
> > reside in uncached memory, so you try to minimize the number of accesses.
> > It's probably faster if you build the descriptor on the stack and
> > then atomically copy it over, rather than assigning each member at
> > a time.
> 
> DMA coherent memory is write combining, so multiple writes will be
> coalesced.  This also means that barriers may be required to ensure the
> descriptors are pushed out in a timely manner if something like writel()
> is not used in the transmit-triggering path.

Right, makes sense. There is a writel() right after this, so no need
for extra barriers. We already concluded that the store operation on
uncached memory isn't actually a problem, and Zhangfei Gao did some
measurements to check the overhead of the one read from uncached
memory that is in the tx path, which was lost in the noise.

	Arnd
Zhangfei Gao April 4, 2014, 6:52 a.m. UTC | #14
Dear Russell

On Thu, Apr 3, 2014 at 11:27 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Wed, Apr 02, 2014 at 11:21:45AM +0200, Arnd Bergmann wrote:
>> - As David Laight pointed out earlier, you must also ensure that
>>   you don't have too much /data/ pending in the descriptor ring
>>   when you stop the queue. For a 10mbit connection, you have already
>>   tested (as we discussed on IRC) that 64 descriptors with 1500 byte
>>   frames gives you a 68ms round-trip ping time, which is too much.
>>   Conversely, on 1gbit, having only 64 descriptors actually seems
>>   a little low, and you may be able to get better throughput if
>>   you extend the ring to e.g. 512 descriptors.
>
> You don't manage that by stopping the queue - there's separate interfaces
> where you report how many bytes you've queued (netdev_sent_queue()) and
> how many bytes/packets you've sent (netdev_tx_completed_queue()).  This
> allows the netdev schedulers to limit how much data is held in the queue,
> preserving interactivity while allowing the advantages of larger rings.

My god, it's awesome.
The latency can be solved by adding netdev_sent_queue() in xmit and
netdev_completed_queue() in reclaim.
In the experiment,
iperf -P 3 can get 930M, and ping gets a response within 0.4 ms
in the meantime.

Does that mean the timer -> reclaim should be removed entirely?
The background is:
1. There is no xmit_complete interrupt.
2. Only having xmit reclaim the used buffers achieves the best throughput;
the timer was added in case there is no next xmit to trigger the reclaim.

>
>> > +       phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>> > +       if (dma_mapping_error(&ndev->dev, phys)) {
>> > +               dev_kfree_skb(skb);
>> > +               return NETDEV_TX_OK;
>> > +       }
>> > +
>> > +       priv->tx_skb[tx_head] = skb;
>> > +       priv->tx_phys[tx_head] = phys;
>> > +       desc->send_addr = cpu_to_be32(phys);
>> > +       desc->send_size = cpu_to_be16(skb->len);
>> > +       desc->cfg = cpu_to_be32(DESC_DEF_CFG);
>> > +       phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
>> > +       desc->wb_addr = cpu_to_be32(phys);
>>
>> One detail: since you don't have cache-coherent DMA, "desc" will
>> reside in uncached memory, so you try to minimize the number of accesses.
>> It's probably faster if you build the descriptor on the stack and
>> then atomically copy it over, rather than assigning each member at
>> a time.
>
> DMA coherent memory is write combining, so multiple writes will be
> coalesced.  This also means that barriers may be required to ensure the
> descriptors are pushed out in a timely manner if something like writel()
> is not used in the transmit-triggering path.
>
Currently writel is used in xmit.
And regmap_write -> writel is used in poll.

Thanks

Patch

diff --git a/drivers/net/ethernet/hisilicon/Makefile b/drivers/net/ethernet/hisilicon/Makefile
index 1d6eb6e..17dec03 100644
--- a/drivers/net/ethernet/hisilicon/Makefile
+++ b/drivers/net/ethernet/hisilicon/Makefile
@@ -2,4 +2,4 @@ 
 # Makefile for the HISILICON network device drivers.
 #
 
-obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o
+obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o hip04_eth.o
diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
new file mode 100644
index 0000000..db00337
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -0,0 +1,771 @@ 
+
+/* Copyright (c) 2014 Linaro Ltd.
+ * Copyright (c) 2014 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/of_address.h>
+#include <linux/phy.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
+
+#define PPE_CFG_RX_ADDR			0x100
+#define PPE_CFG_POOL_GRP		0x300
+#define PPE_CFG_RX_BUF_SIZE		0x400
+#define PPE_CFG_RX_FIFO_SIZE		0x500
+#define PPE_CURR_BUF_CNT		0xa200
+
+#define GE_DUPLEX_TYPE			0x8
+#define GE_MAX_FRM_SIZE_REG		0x3c
+#define GE_PORT_MODE			0x40
+#define GE_PORT_EN			0x44
+#define GE_SHORT_RUNTS_THR_REG		0x50
+#define GE_TX_LOCAL_PAGE_REG		0x5c
+#define GE_TRANSMIT_CONTROL_REG		0x60
+#define GE_CF_CRC_STRIP_REG		0x1b0
+#define GE_MODE_CHANGE_EN		0x1b4
+#define GE_RECV_CONTROL_REG		0x1e0
+#define GE_STATION_MAC_ADDRESS		0x210
+#define PPE_CFG_TX_PKT_BD_ADDR		0x420
+#define PPE_CFG_MAX_FRAME_LEN_REG	0x408
+#define PPE_CFG_BUS_CTRL_REG		0x424
+#define PPE_CFG_RX_CTRL_REG		0x428
+#define PPE_CFG_RX_PKT_MODE_REG		0x438
+#define PPE_CFG_QOS_VMID_GEN		0x500
+#define PPE_CFG_RX_PKT_INT		0x538
+#define PPE_INTEN			0x600
+#define PPE_INTSTS			0x608
+#define PPE_RINT			0x604
+#define PPE_CFG_STS_MODE		0x700
+#define PPE_HIS_RX_PKT_CNT		0x804
+
+/* REG_INTERRUPT */
+#define RCV_INT				BIT(10)
+#define RCV_NOBUF			BIT(8)
+#define DEF_INT_MASK			(RCV_INT | RCV_NOBUF)
+
+#define RX_DESC_NUM			64
+#define TX_DESC_NUM			64
+#define TX_NEXT(N)			(((N) + 1) & (TX_DESC_NUM-1))
+#define RX_NEXT(N)			(((N) + 1) & (RX_DESC_NUM-1))
+
+#define GMAC_PPE_RX_PKT_MAX_LEN		379
+#define GMAC_MAX_PKT_LEN		1516
+#define DESC_DEF_CFG			0x14
+#define RX_BUF_SIZE			1600
+#define RX_PKT_ERR			0x3
+#define TX_TIMEOUT			(6 * HZ)
+#define RECLAIM_PERIOD			HZ
+
+#define DRV_NAME			"hip04-ether"
+
+struct tx_desc {
+	u32 send_addr;
+	u16 reserved_16;
+	u16 send_size;
+	u32 reserved_32;
+	u32 cfg;
+	u32 wb_addr;
+} ____cacheline_aligned;
+
+struct rx_desc {
+	u16 reserved_16;
+	u16 pkt_len;
+	u32 reserve1[3];
+	u32 pkt_err;
+	u32 reserve2[4];
+};
+
+struct hip04_priv {
+	void __iomem *base;
+	int phy_mode;
+	int chan;
+	unsigned int port;
+	unsigned int speed;
+	unsigned int duplex;
+	unsigned int reg_inten;
+
+	struct napi_struct napi;
+	struct net_device *ndev;
+
+	struct tx_desc *tx_desc;
+	dma_addr_t tx_desc_dma;
+	struct sk_buff *tx_skb[TX_DESC_NUM];
+	dma_addr_t tx_phys[TX_DESC_NUM];
+	spinlock_t lock;
+	unsigned int tx_head;
+	unsigned int tx_tail;
+	unsigned int tx_count;
+
+	unsigned char *rx_buf[RX_DESC_NUM];
+	dma_addr_t rx_phys[RX_DESC_NUM];
+	unsigned int rx_head;
+	unsigned int rx_buf_size;
+
+	struct device_node *phy_node;
+	struct phy_device *phy;
+	struct regmap *map;
+	struct timer_list txtimer;
+};
+
+static void hip04_config_port(struct net_device *ndev, u32 speed, u32 duplex)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	priv->speed = speed;
+	priv->duplex = duplex;
+
+	switch (priv->phy_mode) {
+	case PHY_INTERFACE_MODE_SGMII:
+		if (speed == SPEED_1000)
+			val = 8;
+		else if (speed == SPEED_100)
+			val = 7;
+		else
+			val = 6;	/* SPEED_10 */
+		break;
+	case PHY_INTERFACE_MODE_MII:
+		if (speed == SPEED_100)
+			val = 1;
+		else
+			val = 0;	/* SPEED_10 */
+		break;
+	default:
+		netdev_warn(ndev, "not supported mode\n");
+		val = 0;
+		break;
+	}
+	writel_relaxed(val, priv->base + GE_PORT_MODE);
+
+	val = (duplex) ? BIT(0) : 0;
+	writel_relaxed(val, priv->base + GE_DUPLEX_TYPE);
+
+	val = BIT(0);
+	writel_relaxed(val, priv->base + GE_MODE_CHANGE_EN);
+}
+
+static void hip04_reset_ppe(struct hip04_priv *priv)
+{
+	u32 val, tmp;
+
+	do {
+		regmap_read(priv->map, priv->port * 4 + PPE_CURR_BUF_CNT, &val);
+		regmap_read(priv->map, priv->port * 4 + PPE_CFG_RX_ADDR, &tmp);
+	} while (val & 0xfff);
+}
+
+static void hip04_config_fifo(struct hip04_priv *priv)
+{
+	u32 val;
+
+	val = readl_relaxed(priv->base + PPE_CFG_STS_MODE);
+	val |= BIT(12);			/* PPE_HIS_RX_PKT_CNT read clear */
+	writel_relaxed(val, priv->base + PPE_CFG_STS_MODE);
+
+	val = BIT(priv->port);
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_POOL_GRP, val);
+
+	val = priv->port << 8;
+	val |= BIT(14);
+	writel_relaxed(val, priv->base + PPE_CFG_QOS_VMID_GEN);
+
+	val = RX_BUF_SIZE;
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_BUF_SIZE, val);
+
+	val = RX_DESC_NUM << 16;	/* depth */
+	val |= BIT(11);			/* seq: first set first use */
+	val |= RX_DESC_NUM * priv->chan;	/* start_addr */
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_FIFO_SIZE, val);
+
+	/* pkt store format */
+	val = NET_IP_ALIGN << 11;	/* align */
+	writel_relaxed(val, priv->base + PPE_CFG_RX_CTRL_REG);
+
+	/* following cfg required for 1000M */
+	/* pkt mode */
+	val = BIT(18);			/* align */
+	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_MODE_REG);
+
+	/* set bus ctrl */
+	val = BIT(14);			/* buffer locally release */
+	val |= BIT(0);			/* big endian */
+	writel_relaxed(val, priv->base + PPE_CFG_BUS_CTRL_REG);
+
+	/* set max pkt len, curtail if exceed */
+	val = GMAC_PPE_RX_PKT_MAX_LEN;	/* max buffer len */
+	writel_relaxed(val, priv->base + PPE_CFG_MAX_FRAME_LEN_REG);
+
+	/* set max len of each pkt */
+	val = GMAC_MAX_PKT_LEN;		/* max buffer len */
+	writel_relaxed(val, priv->base + GE_MAX_FRM_SIZE_REG);
+
+	/* set min len of each pkt */
+	val = 31;			/* min buffer len */
+	writel_relaxed(val, priv->base + GE_SHORT_RUNTS_THR_REG);
+
+	/* tx */
+	val = readl_relaxed(priv->base + GE_TRANSMIT_CONTROL_REG);
+	val |= BIT(5);			/* tx auto neg */
+	val |= BIT(6);			/* tx add crc */
+	val |= BIT(7);			/* tx short pad through */
+	writel_relaxed(val, priv->base + GE_TRANSMIT_CONTROL_REG);
+
+	/* rx crc */
+	val = BIT(0);			/* rx strip crc */
+	writel_relaxed(val, priv->base + GE_CF_CRC_STRIP_REG);
+
+	/* rx */
+	val = readl_relaxed(priv->base + GE_RECV_CONTROL_REG);
+	val |= BIT(3);			/* rx strip pad */
+	val |= BIT(4);			/* run pkt en */
+	writel_relaxed(val, priv->base + GE_RECV_CONTROL_REG);
+
+	/* auto neg control */
+	val = BIT(0);
+	writel_relaxed(val, priv->base + GE_TX_LOCAL_PAGE_REG);
+}
+
+static void hip04_mac_enable(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	/* enable tx & rx */
+	val = readl_relaxed(priv->base + GE_PORT_EN);
+	val |= BIT(1);		/* rx */
+	val |= BIT(2);		/* tx */
+	writel_relaxed(val, priv->base + GE_PORT_EN);
+
+	/* clear rx int */
+	val = RCV_INT;
+	writel_relaxed(val, priv->base + PPE_RINT);
+
+	/* config recv int */
+	val = BIT(6);		/* int threshold 1 package */
+	val |= 0x4;		/* recv timeout */
+	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_INT);
+
+	/* enable interrupt */
+	priv->reg_inten = DEF_INT_MASK;
+	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+}
+
+static void hip04_mac_disable(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 val;
+
+	/* disable int */
+	priv->reg_inten &= ~(RCV_INT | RCV_NOBUF);
+	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+
+	/* disable tx & rx */
+	val = readl_relaxed(priv->base + GE_PORT_EN);
+	val &= ~(BIT(1));	/* rx */
+	val &= ~(BIT(2));	/* tx */
+	writel_relaxed(val, priv->base + GE_PORT_EN);
+}
+
+static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
+{
+	writel(phys, priv->base + PPE_CFG_TX_PKT_BD_ADDR);
+}
+
+static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
+{
+	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_ADDR, phys);
+}
+
+static u32 hip04_recv_cnt(struct hip04_priv *priv)
+{
+	return readl(priv->base + PPE_HIS_RX_PKT_CNT);
+}
+
+static void hip04_update_mac_address(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+
+	writel_relaxed(((ndev->dev_addr[0] << 8) | (ndev->dev_addr[1])),
+			priv->base + GE_STATION_MAC_ADDRESS);
+	writel_relaxed(((ndev->dev_addr[2] << 24) | (ndev->dev_addr[3] << 16) |
+			(ndev->dev_addr[4] << 8) | (ndev->dev_addr[5])),
+			priv->base + GE_STATION_MAC_ADDRESS + 4);
+}
+
+static int hip04_set_mac_address(struct net_device *ndev, void *addr)
+{
+	eth_mac_addr(ndev, addr);
+	hip04_update_mac_address(ndev);
+	return 0;
+}
+
+static void hip04_tx_reclaim(struct net_device *ndev, bool force)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	unsigned tx_head = priv->tx_head;
+	unsigned tx_tail = priv->tx_tail;
+	struct tx_desc *desc;
+
+	spin_lock_bh(&priv->lock);
+	while ((tx_tail != tx_head) || (priv->tx_count == TX_DESC_NUM)) {
+		desc = &priv->tx_desc[priv->tx_tail];
+		if (desc->send_addr != 0) {
+			if (force)
+				desc->send_addr = 0;
+			else
+				break;
+		}
+		if (priv->tx_phys[tx_tail]) {
+			dma_unmap_single(&ndev->dev, priv->tx_phys[tx_tail],
+				priv->tx_skb[tx_tail]->len, DMA_TO_DEVICE);
+			priv->tx_phys[tx_tail] = 0;
+		}
+		dev_kfree_skb(priv->tx_skb[tx_tail]);
+		priv->tx_skb[tx_tail] = NULL;
+		tx_tail = TX_NEXT(tx_tail);
+		priv->tx_count--;
+	}
+	priv->tx_tail = tx_tail;
+	spin_unlock_bh(&priv->lock);
+
+	if (priv->tx_count)
+		mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
+
+	if (unlikely(netif_queue_stopped(ndev)) &&
+		(priv->tx_count < TX_DESC_NUM))
+		netif_wake_queue(ndev);
+}
+
+static void hip04_xmit_timer(unsigned long data)
+{
+	struct net_device *ndev = (void *) data;
+
+	hip04_tx_reclaim(ndev, false);
+}
+
+static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	unsigned int tx_head = priv->tx_head;
+	struct tx_desc *desc = &priv->tx_desc[tx_head];
+	dma_addr_t phys;
+
+	hip04_tx_reclaim(ndev, false);
+	mod_timer(&priv->txtimer, jiffies + RECLAIM_PERIOD);
+
+	if (priv->tx_count >= TX_DESC_NUM) {
+		netif_stop_queue(ndev);
+		return NETDEV_TX_BUSY;
+	}
+
+	phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&ndev->dev, phys)) {
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	priv->tx_skb[tx_head] = skb;
+	priv->tx_phys[tx_head] = phys;
+	desc->send_addr = cpu_to_be32(phys);
+	desc->send_size = cpu_to_be16(skb->len);
+	desc->cfg = cpu_to_be32(DESC_DEF_CFG);
+	phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
+	desc->wb_addr = cpu_to_be32(phys);
+	skb_tx_timestamp(skb);
+
+	/* Don't wait up for transmitted skbs to be freed. */
+	skb_orphan(skb);
+
+	hip04_set_xmit_desc(priv, phys);
+	priv->tx_head = TX_NEXT(tx_head);
+
+	stats->tx_bytes += skb->len;
+	stats->tx_packets++;
+	priv->tx_count++;
+
+	return NETDEV_TX_OK;
+}
+
+static int hip04_rx_poll(struct napi_struct *napi, int budget)
+{
+	struct hip04_priv *priv = container_of(napi, struct hip04_priv, napi);
+	struct net_device *ndev = priv->ndev;
+	struct net_device_stats *stats = &ndev->stats;
+	unsigned int cnt = hip04_recv_cnt(priv);
+	struct rx_desc *desc;
+	struct sk_buff *skb;
+	unsigned char *buf;
+	bool last = false;
+	dma_addr_t phys;
+	int rx = 0;
+	u16 len;
+	u32 err;
+
+	while (cnt && !last) {
+		buf = priv->rx_buf[priv->rx_head];
+		skb = build_skb(buf, priv->rx_buf_size);
+		if (unlikely(!skb))
+			net_dbg_ratelimited("build_skb failed\n");
+
+		dma_unmap_single(&ndev->dev, priv->rx_phys[priv->rx_head],
+				RX_BUF_SIZE, DMA_FROM_DEVICE);
+		priv->rx_phys[priv->rx_head] = 0;
+
+		desc = (struct rx_desc *)skb->data;
+		len = be16_to_cpu(desc->pkt_len);
+		err = be32_to_cpu(desc->pkt_err);
+
+		if (len > RX_BUF_SIZE)
+			len = RX_BUF_SIZE;
+
+		if (0 == len) {
+			dev_kfree_skb_any(skb);
+			last = true;
+		} else if (err & RX_PKT_ERR) {
+			dev_kfree_skb_any(skb);
+			stats->rx_dropped++;
+			stats->rx_errors++;
+		} else {
+			skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+			skb_put(skb, len);
+			skb->protocol = eth_type_trans(skb, ndev);
+			napi_gro_receive(&priv->napi, skb);
+			stats->rx_packets++;
+			stats->rx_bytes += len;
+			rx++;
+		}
+
+		buf = netdev_alloc_frag(priv->rx_buf_size);
+		if (!buf)
+			return -ENOMEM;
+		phys = dma_map_single(&ndev->dev, buf,
+				RX_BUF_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(&ndev->dev, phys))
+			return -EIO;
+		priv->rx_buf[priv->rx_head] = buf;
+		priv->rx_phys[priv->rx_head] = phys;
+		hip04_set_recv_desc(priv, phys);
+
+		priv->rx_head = RX_NEXT(priv->rx_head);
+		if (rx >= budget)
+			break;
+
+		if (--cnt == 0)
+			cnt = hip04_recv_cnt(priv);
+	}
+
+	if (rx < budget) {
+		napi_complete(napi);
+
+		/* enable rx interrupt */
+		priv->reg_inten |= RCV_INT | RCV_NOBUF;
+		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+	}
+
+	return rx;
+}
+
+static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
+{
+	struct net_device *ndev = (struct net_device *) dev_id;
+	struct hip04_priv *priv = netdev_priv(ndev);
+	u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
+
+	writel_relaxed(DEF_INT_MASK, priv->base + PPE_RINT);
+
+	if (ists & (RCV_INT | RCV_NOBUF)) {
+		/* disable rx interrupt */
+		priv->reg_inten &= ~(RCV_INT | RCV_NOBUF);
+		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
+		napi_schedule(&priv->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void hip04_adjust_link(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct phy_device *phy = priv->phy;
+
+	if ((priv->speed != phy->speed) || (priv->duplex != phy->duplex)) {
+		hip04_config_port(ndev, phy->speed, phy->duplex);
+		phy_print_status(phy);
+	}
+}
+
+static int hip04_mac_open(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	priv->rx_head = 0;
+	priv->tx_head = 0;
+	priv->tx_tail = 0;
+	priv->tx_count = 0;
+
+	hip04_reset_ppe(priv);
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		dma_addr_t phys;
+
+		phys = dma_map_single(&ndev->dev, priv->rx_buf[i],
+				RX_BUF_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(&ndev->dev, phys))
+			return -EIO;
+
+		priv->rx_phys[i] = phys;
+		hip04_set_recv_desc(priv, phys);
+	}
+
+	if (priv->phy)
+		phy_start(priv->phy);
+
+	netif_start_queue(ndev);
+	hip04_mac_enable(ndev);
+	napi_enable(&priv->napi);
+
+	return 0;
+}
+
+static int hip04_mac_stop(struct net_device *ndev)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	napi_disable(&priv->napi);
+	netif_stop_queue(ndev);
+	hip04_mac_disable(ndev);
+	hip04_tx_reclaim(ndev, true);
+	hip04_reset_ppe(priv);
+
+	if (priv->phy)
+		phy_stop(priv->phy);
+
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		if (priv->rx_phys[i]) {
+			dma_unmap_single(&ndev->dev, priv->rx_phys[i],
+					RX_BUF_SIZE, DMA_FROM_DEVICE);
+			priv->rx_phys[i] = 0;
+		}
+	}
+
+	return 0;
+}
+
+static void hip04_timeout(struct net_device *ndev)
+{
+	hip04_mac_stop(ndev);
+	hip04_mac_open(ndev);
+	return;
+}
+
+static struct net_device_ops hip04_netdev_ops = {
+	.ndo_open		= hip04_mac_open,
+	.ndo_stop		= hip04_mac_stop,
+	.ndo_start_xmit		= hip04_mac_start_xmit,
+	.ndo_set_mac_address	= hip04_set_mac_address,
+	.ndo_tx_timeout         = hip04_timeout,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+};
+
+static int hip04_alloc_ring(struct net_device *ndev, struct device *d)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	priv->tx_desc = dma_alloc_coherent(d,
+			TX_DESC_NUM * sizeof(struct tx_desc),
+			&priv->tx_desc_dma, GFP_KERNEL);
+	if (!priv->tx_desc)
+		return -ENOMEM;
+
+	priv->rx_buf_size = RX_BUF_SIZE +
+			    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	for (i = 0; i < RX_DESC_NUM; i++) {
+		priv->rx_buf[i] = netdev_alloc_frag(priv->rx_buf_size);
+		if (!priv->rx_buf[i])
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void hip04_free_ring(struct net_device *ndev, struct device *d)
+{
+	struct hip04_priv *priv = netdev_priv(ndev);
+	int i;
+
+	for (i = 0; i < RX_DESC_NUM; i++)
+		if (priv->rx_buf[i])
+			put_page(virt_to_head_page(priv->rx_buf[i]));
+
+	for (i = 0; i < TX_DESC_NUM; i++)
+		if (priv->tx_skb[i])
+			dev_kfree_skb_any(priv->tx_skb[i]);
+
+	dma_free_coherent(d, TX_DESC_NUM * sizeof(struct tx_desc),
+			priv->tx_desc, priv->tx_desc_dma);
+}
+
+static int hip04_mac_probe(struct platform_device *pdev)
+{
+	struct device *d = &pdev->dev;
+	struct device_node *node = d->of_node;
+	struct of_phandle_args arg;
+	struct net_device *ndev;
+	struct hip04_priv *priv;
+	struct resource *res;
+	unsigned int irq;
+	int ret;
+
+	ndev = alloc_etherdev(sizeof(struct hip04_priv));
+	if (!ndev)
+		return -ENOMEM;
+
+	priv = netdev_priv(ndev);
+	priv->ndev = ndev;
+	platform_set_drvdata(pdev, ndev);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	priv->base = devm_ioremap_resource(d, res);
+	if (IS_ERR(priv->base)) {
+		ret = PTR_ERR(priv->base);
+		goto init_fail;
+	}
+
+	ret = of_parse_phandle_with_fixed_args(node, "port-handle", 2, 0, &arg);
+	if (ret < 0) {
+		dev_warn(d, "no port-handle\n");
+		goto init_fail;
+	}
+
+	priv->port = arg.args[0];
+	priv->chan = arg.args[1];
+
+	priv->map = syscon_node_to_regmap(arg.np);
+	if (IS_ERR(priv->map)) {
+		dev_warn(d, "no syscon hisilicon,hip04-ppe\n");
+		ret = PTR_ERR(priv->map);
+		goto init_fail;
+	}
+
+	priv->phy_mode = of_get_phy_mode(node);
+	if (priv->phy_mode < 0) {
+		dev_warn(d, "not find phy-mode\n");
+		ret = -EINVAL;
+		goto init_fail;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq <= 0) {
+		ret = -EINVAL;
+		goto init_fail;
+	}
+
+	ret = devm_request_irq(d, irq, hip04_mac_interrupt,
+				0, pdev->name, ndev);
+	if (ret) {
+		netdev_err(ndev, "devm_request_irq failed\n");
+		goto init_fail;
+	}
+
+	priv->phy_node = of_parse_phandle(node, "phy-handle", 0);
+	if (priv->phy_node) {
+		priv->phy = of_phy_connect(ndev, priv->phy_node,
+			&hip04_adjust_link, 0, priv->phy_mode);
+		if (!priv->phy) {
+			ret = -EPROBE_DEFER;
+			goto init_fail;
+		}
+	}
+
+	ether_setup(ndev);
+	ndev->netdev_ops = &hip04_netdev_ops;
+	ndev->watchdog_timeo = TX_TIMEOUT;
+	ndev->priv_flags |= IFF_UNICAST_FLT;
+	ndev->irq = irq;
+	netif_napi_add(ndev, &priv->napi, hip04_rx_poll, RX_DESC_NUM);
+	SET_NETDEV_DEV(ndev, &pdev->dev);
+
+	hip04_reset_ppe(priv);
+	if (priv->phy_mode == PHY_INTERFACE_MODE_MII)
+		hip04_config_port(ndev, SPEED_100, DUPLEX_FULL);
+
+	hip04_config_fifo(priv);
+	random_ether_addr(ndev->dev_addr);
+	hip04_update_mac_address(ndev);
+
+	ret = hip04_alloc_ring(ndev, d);
+	if (ret) {
+		netdev_err(ndev, "alloc ring fail\n");
+		goto alloc_fail;
+	}
+
+	setup_timer(&priv->txtimer, hip04_xmit_timer, (unsigned long) ndev);
+	ret = register_netdev(ndev);
+	if (ret) {
+		free_netdev(ndev);
+		goto alloc_fail;
+	}
+
+	return 0;
+
+alloc_fail:
+	hip04_free_ring(ndev, d);
+init_fail:
+	of_node_put(priv->phy_node);
+	free_netdev(ndev);
+	return ret;
+}
+
+static int hip04_remove(struct platform_device *pdev)
+{
+	struct net_device *ndev = platform_get_drvdata(pdev);
+	struct hip04_priv *priv = netdev_priv(ndev);
+	struct device *d = &pdev->dev;
+
+	if (priv->phy)
+		phy_disconnect(priv->phy);
+
+	del_timer_sync(&priv->txtimer);
+	hip04_free_ring(ndev, d);
+	unregister_netdev(ndev);
+	free_irq(ndev->irq, ndev);
+	of_node_put(priv->phy_node);
+	free_netdev(ndev);
+
+	return 0;
+}
+
+static const struct of_device_id hip04_mac_match[] = {
+	{ .compatible = "hisilicon,hip04-mac" },
+	{ }
+};
+
+static struct platform_driver hip04_mac_driver = {
+	.probe	= hip04_mac_probe,
+	.remove	= hip04_remove,
+	.driver	= {
+		.name		= DRV_NAME,
+		.owner		= THIS_MODULE,
+		.of_match_table	= hip04_mac_match,
+	},
+};
+module_platform_driver(hip04_mac_driver);
+
+MODULE_DESCRIPTION("HISILICON P04 Ethernet driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:hip04-ether");