[net-next] e1000: add initial XDP support

Message ID 20160827071145.25345.84712.stgit@john-Precision-Tower-5810
State Awaiting Upstream, archived
Delegated to: David Miller

Commit Message

John Fastabend Aug. 27, 2016, 7:11 a.m. UTC
From: Alexei Starovoitov <ast@fb.com>

This patch adds initial support for XDP on the e1000 driver. Note the
e1000 driver does not support page recycling in general, which could be
added as a further improvement. However, for XDP_DROP and XDP_XMIT
the xdp code paths will recycle pages.

This patch includes the rcu_read_lock/rcu_read_unlock pair noted by
Brenden Blanco in another pending patch.

  net/mlx4_en: protect ring->xdp_prog with rcu_read_lock

CC: William Tu <u9012063@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |    1 
 drivers/net/ethernet/intel/e1000/e1000_main.c |  168 ++++++++++++++++++++++++-
 2 files changed, 165 insertions(+), 4 deletions(-)

Comments

Or Gerlitz Aug. 28, 2016, 5:55 a.m. UTC | #1
On Sat, Aug 27, 2016 at 10:11 AM, John Fastabend
<john.fastabend@gmail.com> wrote:
> From: Alexei Starovoitov <ast@fb.com>

> This patch adds initial support for XDP on the e1000 driver. Note the
> e1000 driver does not support page recycling in general, which could be
> added as a further improvement. However, for XDP_DROP and XDP_XMIT
> the xdp code paths will recycle pages.

> @@ -4188,15 +4305,57 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>                 prefetch(next_rxd);
>
>                 next_buffer = &rx_ring->buffer_info[i];
> -

nit, better to avoid random cleanups in a patch adding new (&& cool)
functionality

>                 cleaned = true;
>                 cleaned_count++;
> +               length = le16_to_cpu(rx_desc->length);
> +
> +               if (prog) {
> +                       struct page *p = buffer_info->rxbuf.page;
> +                       dma_addr_t dma = buffer_info->dma;
> +                       int act;
> +
> +                       if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
> +                               /* attached bpf disallows larger than page
> +                                * packets, so this is hw error or corruption
> +                                */
> +                               pr_info_once("%s buggy !eop\n", netdev->name);
> +                               break;
> +                       }
> +                       if (unlikely(rx_ring->rx_skb_top)) {
> +                               pr_info_once("%s ring resizing bug\n",
> +                                            netdev->name);
> +                               break;
> +                       }
> +                       dma_sync_single_for_cpu(&pdev->dev, dma,
> +                                               length, DMA_FROM_DEVICE);
> +                       act = e1000_call_bpf(prog, page_address(p), length);
> +                       switch (act) {
> +                       case XDP_PASS:
> +                               break;
> +                       case XDP_TX:
> +                               dma_sync_single_for_device(&pdev->dev,
> +                                                          dma,
> +                                                          length,
> +                                                          DMA_TO_DEVICE);
> +                               e1000_xmit_raw_frame(buffer_info, length,
> +                                                    netdev, adapter);
> +                       /* Fallthrough to re-use mappedg page after xmit */

Did you want to say "mapped"? I wasn't sure what the role of the "g" at the end is.

> +                       case XDP_DROP:
> +                       default:
> +                               /* re-use mapped page. keep buffer_info->dma
> +                                * as-is, so that e1000_alloc_jumbo_rx_buffers
> +                                * only needs to put it back into rx ring
> +                                */

if we're on the XDP_TX pass, don't we need to actually see that the frame
has been xmitted before re-using the page?

> +                               total_rx_bytes += length;
> +                               total_rx_packets++;
> +                               goto next_desc;
> +                       }
> +               }
> +
>                 dma_unmap_page(&pdev->dev, buffer_info->dma,
>                                adapter->rx_buffer_len, DMA_FROM_DEVICE);
>                 buffer_info->dma = 0;
Jamal Hadi Salim Aug. 28, 2016, 12:23 p.m. UTC | #2
On 16-08-27 03:11 AM, John Fastabend wrote:
> From: Alexei Starovoitov <ast@fb.com>
>
> This patch adds initial support for XDP on the e1000 driver. Note the
> e1000 driver does not support page recycling in general, which could be
> added as a further improvement. However, for XDP_DROP and XDP_XMIT
> the xdp code paths will recycle pages.
>
> This patch includes the rcu_read_lock/rcu_read_unlock pair noted by
> Brenden Blanco in another pending patch.
>
>   net/mlx4_en: protect ring->xdp_prog with rcu_read_lock

Do you have any perf numbers for drops with this vs tc drop at ingress?
Single or multiple CPUs?

cheers,
jamal
William Tu Aug. 28, 2016, 3:56 p.m. UTC | #3
Hi,

Reading through the patch, I found some minor typos below.

On Sat, Aug 27, 2016 at 12:11 AM, John Fastabend
<john.fastabend@gmail.com> wrote:
> From: Alexei Starovoitov <ast@fb.com>
>
> This patch adds initial support for XDP on the e1000 driver. Note the
> e1000 driver does not support page recycling in general, which could be
> added as a further improvement. However, for XDP_DROP and XDP_XMIT

I think you mean XDP_PASS instead of XDP_XMIT?

> the xdp code paths will recycle pages.
>
> This patch includes the rcu_read_lock/rcu_read_unlock pair noted by
> Brenden Blanco in another pending patch.
>
>   net/mlx4_en: protect ring->xdp_prog with rcu_read_lock
>
> CC: William Tu <u9012063@gmail.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |    1
>  drivers/net/ethernet/intel/e1000/e1000_main.c |  168 ++++++++++++++++++++++++-
>  2 files changed, 165 insertions(+), 4 deletions(-)
>
> +static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> +                                unsigned int len,
> +                                struct net_device *netdev,
> +                                struct e1000_adapter *adapter)
> +{
> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +       struct e1000_hw *hw = &adapter->hw;
> +       struct e1000_tx_ring *tx_ring;
> +
> +       if (len > E1000_MAX_DATA_PER_TXD)
> +               return;
> +
> +       /* e1000 only support a single txq at the moment so the queue is being
> +        * shared with stack. To support this requires locking to ensure the
> +        * stack and XPD are not running at the same time. Devices would
> +        * multiple queues should allocate a separate queue space.
> +        */

XPD --> XDP
Devices would --> with?

> +       HARD_TX_LOCK(netdev, txq, smp_processor_id());
> +
> +       tx_ring = adapter->tx_ring;
> +
> +       if (E1000_DESC_UNUSED(tx_ring) < 2)
> +               return;
> +
> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +
> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +
> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +       mmiowb();
> +
> +       HARD_TX_UNLOCK(netdev, txq);
> +}
> +
>  #define NUM_REGS 38 /* 1 based count */
>  static void e1000_regdump(struct e1000_adapter *adapter)
>  {
> @@ -4142,6 +4240,22 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
>         return skb;
>  }
>
> +static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
> +                                unsigned int length)
> +{
> +       struct xdp_buff xdp;
> +       int ret;
> +
> +       xdp.data = data;
> +       xdp.data_end = data + length;
> +
> +       rcu_read_lock();
> +       ret = BPF_PROG_RUN(prog, (void *)&xdp);
> +       rcu_read_unlock();
> +
> +       return ret;
> +}
> +
>  /**
>   * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy
>   * @adapter: board private structure
> @@ -4160,12 +4274,15 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>         struct pci_dev *pdev = adapter->pdev;
>         struct e1000_rx_desc *rx_desc, *next_rxd;
>         struct e1000_rx_buffer *buffer_info, *next_buffer;
> +       struct bpf_prog *prog;
>         u32 length;
>         unsigned int i;
>         int cleaned_count = 0;
>         bool cleaned = false;
>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
>
> +       rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
> +       prog = READ_ONCE(adapter->prog);

If we have rcu_read_lock() here, do we still need another one in e1000_call_bpf()?


>         i = rx_ring->next_to_clean;
>         rx_desc = E1000_RX_DESC(*rx_ring, i);
>         buffer_info = &rx_ring->buffer_info[i];
> @@ -4188,15 +4305,57 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>                 prefetch(next_rxd);
>
>                 next_buffer = &rx_ring->buffer_info[i];
> -
>                 cleaned = true;
>                 cleaned_count++;
> +               length = le16_to_cpu(rx_desc->length);
> +
> +               if (prog) {
> +                       struct page *p = buffer_info->rxbuf.page;
> +                       dma_addr_t dma = buffer_info->dma;
> +                       int act;
> +
> +                       if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
> +                               /* attached bpf disallows larger than page
> +                                * packets, so this is hw error or corruption
> +                                */
> +                               pr_info_once("%s buggy !eop\n", netdev->name);
> +                               break;
> +                       }
> +                       if (unlikely(rx_ring->rx_skb_top)) {
> +                               pr_info_once("%s ring resizing bug\n",
> +                                            netdev->name);
> +                               break;
>
John Fastabend Aug. 29, 2016, 5:33 a.m. UTC | #4
On 16-08-27 10:55 PM, Or Gerlitz wrote:
> On Sat, Aug 27, 2016 at 10:11 AM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> From: Alexei Starovoitov <ast@fb.com>
> 
>> This patch adds initial support for XDP on the e1000 driver. Note the
>> e1000 driver does not support page recycling in general, which could be
>> added as a further improvement. However, for XDP_DROP and XDP_XMIT
>> the xdp code paths will recycle pages.
> 
>> @@ -4188,15 +4305,57 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>>                 prefetch(next_rxd);
>>
>>                 next_buffer = &rx_ring->buffer_info[i];
>> -
> 
> nit, better to avoid random cleanups in a patch adding new (&& cool)
> functionality
> 

Yep thanks.

[...]

>> +                       case XDP_TX:
>> +                               dma_sync_single_for_device(&pdev->dev,
>> +                                                          dma,
>> +                                                          length,
>> +                                                          DMA_TO_DEVICE);
>> +                               e1000_xmit_raw_frame(buffer_info, length,
>> +                                                    netdev, adapter);
>> +                       /* Fallthrough to re-use mappedg page after xmit */
> 
> Did you want to say "mapped"? I wasn't sure what the role of the "g" at the end is.

Yep but see below...

> 
>> +                       case XDP_DROP:
>> +                       default:
>> +                               /* re-use mapped page. keep buffer_info->dma
>> +                                * as-is, so that e1000_alloc_jumbo_rx_buffers
>> +                                * only needs to put it back into rx ring
>> +                                */
> 
> if we're on the XDP_TX pass, don't we need to actually see that the frame
> has been xmitted before re-using the page?
> 

Agreed, this seems to be too ambitious in the XDP_TX case. Thanks for
the help. Unless Alexei has some reason why it works, I'll go ahead and
consume the buffer here.

I think setting

+                               bi->rxbuf.page = NULL;

at the end of the XDP_TX case should fix it, but I'll test it again.
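
Roughly, the end of the XDP_TX case would then look something like this
(untested sketch; it assumes e1000_alloc_jumbo_rx_buffers allocates and
maps a fresh page when rxbuf.page is NULL):

			case XDP_TX:
				dma_sync_single_for_device(&pdev->dev, dma,
							   length,
							   DMA_TO_DEVICE);
				e1000_xmit_raw_frame(buffer_info, length,
						     netdev, adapter);
				/* Page is now owned by the TX path; drop
				 * the RX reference so the refill path
				 * allocates a fresh page rather than
				 * recycling this one.
				 */
				buffer_info->rxbuf.page = NULL;
				total_rx_bytes += length;
				total_rx_packets++;
				goto next_desc;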

Thanks again; I guess this is what I get for trying to push patches out
on a Friday night.



>> +                               total_rx_bytes += length;
>> +                               total_rx_packets++;
>> +                               goto next_desc;
>> +                       }
>> +               }
>> +
>>                 dma_unmap_page(&pdev->dev, buffer_info->dma,
>>                                adapter->rx_buffer_len, DMA_FROM_DEVICE);
>>                 buffer_info->dma = 0;
John Fastabend Aug. 29, 2016, 5:36 a.m. UTC | #5
On 16-08-28 08:56 AM, William Tu wrote:
> Hi,
> 
> Reading through the patch, I found some minor typos below.
> 
> On Sat, Aug 27, 2016 at 12:11 AM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> From: Alexei Starovoitov <ast@fb.com>
>>
>> This patch adds initial support for XDP on the e1000 driver. Note the
>> e1000 driver does not support page recycling in general, which could be
>> added as a further improvement. However, for XDP_DROP and XDP_XMIT
> 
> I think you mean XDP_PASS instead of XDP_XMIT?
> 

I really meant XDP_TX, but see Or's note; the next revision will have
XDP_DROP only here.

>> the xdp code paths will recycle pages.
>>
>> This patch includes the rcu_read_lock/rcu_read_unlock pair noted by
>> Brenden Blanco in another pending patch.
>>
>>   net/mlx4_en: protect ring->xdp_prog with rcu_read_lock
>>
>> CC: William Tu <u9012063@gmail.com>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  drivers/net/ethernet/intel/e1000/e1000.h      |    1
>>  drivers/net/ethernet/intel/e1000/e1000_main.c |  168 ++++++++++++++++++++++++-
>>  2 files changed, 165 insertions(+), 4 deletions(-)
>>
>> +static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
>> +                                unsigned int len,
>> +                                struct net_device *netdev,
>> +                                struct e1000_adapter *adapter)
>> +{
>> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
>> +       struct e1000_hw *hw = &adapter->hw;
>> +       struct e1000_tx_ring *tx_ring;
>> +
>> +       if (len > E1000_MAX_DATA_PER_TXD)
>> +               return;
>> +
>> +       /* e1000 only support a single txq at the moment so the queue is being
>> +        * shared with stack. To support this requires locking to ensure the
>> +        * stack and XPD are not running at the same time. Devices would
>> +        * multiple queues should allocate a separate queue space.
>> +        */
> 
> XPD --> XDP
> Devices would --> with?

Yep typo.

> 
>> +       HARD_TX_LOCK(netdev, txq, smp_processor_id());
>> +
>> +       tx_ring = adapter->tx_ring;
>> +
>> +       if (E1000_DESC_UNUSED(tx_ring) < 2)
>> +               return;
>> +
>> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
>> +
>> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
>> +
>> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>> +       mmiowb();
>> +
>> +       HARD_TX_UNLOCK(netdev, txq);
>> +}
>> +
>>  #define NUM_REGS 38 /* 1 based count */
>>  static void e1000_regdump(struct e1000_adapter *adapter)
>>  {
>> @@ -4142,6 +4240,22 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
>>         return skb;
>>  }
>>
>> +static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
>> +                                unsigned int length)
>> +{
>> +       struct xdp_buff xdp;
>> +       int ret;
>> +
>> +       xdp.data = data;
>> +       xdp.data_end = data + length;
>> +
>> +       rcu_read_lock();
>> +       ret = BPF_PROG_RUN(prog, (void *)&xdp);
>> +       rcu_read_unlock();
>> +
>> +       return ret;
>> +}
>> +
>>  /**
>>   * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy
>>   * @adapter: board private structure
>> @@ -4160,12 +4274,15 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>>         struct pci_dev *pdev = adapter->pdev;
>>         struct e1000_rx_desc *rx_desc, *next_rxd;
>>         struct e1000_rx_buffer *buffer_info, *next_buffer;
>> +       struct bpf_prog *prog;
>>         u32 length;
>>         unsigned int i;
>>         int cleaned_count = 0;
>>         bool cleaned = false;
>>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
>>
>> +       rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>> +       prog = READ_ONCE(adapter->prog);
> 
> If we have rcu_read_lock() here, do we still need another one in e1000_call_bpf()?

Nope, good catch. Thanks for the review!
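
With the outer rcu_read_lock() covering the whole RX loop in
e1000_clean_jumbo_rx_irq(), e1000_call_bpf() can then shrink to
something like this (a sketch of what v2 might carry, not the code in
this posting):

static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
				 unsigned int length)
{
	struct xdp_buff xdp;

	xdp.data = data;
	xdp.data_end = data + length;

	/* Caller holds rcu_read_lock() for the duration of the RX
	 * loop, so no nested lock is needed around the program run.
	 */
	return BPF_PROG_RUN(prog, (void *)&xdp);
}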
Jesper Dangaard Brouer Aug. 29, 2016, 8:30 a.m. UTC | #6
On Sun, 28 Aug 2016 08:23:26 -0400
Jamal Hadi Salim <jhs@mojatatu.com> wrote:

> On 16-08-27 03:11 AM, John Fastabend wrote:
> > From: Alexei Starovoitov <ast@fb.com>
> >
> > This patch adds initial support for XDP on the e1000 driver. Note the
> > e1000 driver does not support page recycling in general, which could be
> > added as a further improvement. However, for XDP_DROP and XDP_XMIT
> > the xdp code paths will recycle pages.
> >
> > This patch includes the rcu_read_lock/rcu_read_unlock pair noted by
> > Brenden Blanco in another pending patch.
> >
> >   net/mlx4_en: protect ring->xdp_prog with rcu_read_lock  
> 
> Do you have any perf numbers for drops with this vs tc drop at ingress?

Hi Jamal,

Can you please provide a simple "tc" command that implements "tc drop"?

Then, I'll add this to the series of tests I'm using for (what I call)
"zoom-in" benchmarking.
Jamal Hadi Salim Aug. 29, 2016, 10:53 a.m. UTC | #7
On 16-08-29 04:30 AM, Jesper Dangaard Brouer wrote:

> Hi Jamal,
>
> Can you please provide a simple "tc" command that implements "tc drop"?
>
> Then, I'll add this to the series of tests I'm using for (what I call)
> "zoom-in" benchmarking.
>

Thanks Jesper.

Something simple, since this is done in ingress; let's say drop ICMP
packets:

export ETH=eth0
export TC=/sbin/tc
#delete existing ingress qdisc - flushes all filters/actions
sudo $TC qdisc del dev $ETH ingress
#re-add ingress
sudo $TC qdisc add dev $ETH ingress
#
#simple rule to drop all icmp
sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \
u32 match ip protocol 1 0xff flowid 1:1 \
action drop

# other type of filters if you want to compare instead of above
#
# a)drop all
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol ip \
u32 match u32 0 0 flowid 1:1 \
action drop
#b) drop if src is XXX
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol ip \
u32 match ip src 192.168.100.1 flowid 1:1 \
action drop

If you can, do one core vs many cores (XDP should probably do very well
in multi-core).

I think, given how ancient the e1000 is, we may see the driver itself
being a contributing overhead. I believe XDP, given its location, will
do well - but for this kind of driver my gut feeling is that it won't
be by a large margin.

cheers,
jamal
Jesper Dangaard Brouer Aug. 29, 2016, 1:39 p.m. UTC | #8
On Mon, 29 Aug 2016 06:53:53 -0400
Jamal Hadi Salim <jhs@mojatatu.com> wrote:

> On 16-08-29 04:30 AM, Jesper Dangaard Brouer wrote:
> 
> > Hi Jamal,
> >
> > Can you please provide a simple "tc" command that implements "tc drop"?
> >
> > Then, I'll add this to the series of tests I'm using for (what I call)
> > "zoom-in" benchmarking.
> >  
> 
> Thanks Jesper.

I've created a script called tc_ingress_drop.sh[1] which uses the
commands you provided below.  Now people can easily use this script to
perform the benchmark you were requesting ;-)

[1] https://github.com/netoptimizer/network-testing/blob/master/bin/tc_ingress_drop.sh

Example to enable dropping:

 $ ./tc_ingress_drop.sh --dev mlx5p2 --verbose
 # (Not root, running with sudo)
 # Flush existing ingress qdisc on device :mlx5p2
 tc qdisc del dev mlx5p2 ingress
 tc qdisc add dev mlx5p2 ingress
 # Simply drop all ingress packets on device: mlx5p2
 tc filter add dev mlx5p2 parent ffff: prio 2 protocol ip u32 match u32 0 0 flowid 1:1 action drop

Example to disable again:
 ./tc_ingress_drop.sh --dev mlx5p2 --flush

 
> Something simple, since this is done in ingress; let's say drop ICMP
> packets:
> 
> export ETH=eth0
> export TC=/sbin/tc
> #delete existing ingress qdisc - flushes all filters/actions
> sudo $TC qdisc del dev $ETH ingress
> #re-add ingress
> sudo $TC qdisc add dev $ETH ingress
> #
> #simple rule to drop all icmp
> sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \
> u32 match ip protocol 1 0xff flowid 1:1 \
> action drop
> 
> # other type of filters if you want to compare instead of above
> #
> # a)drop all
> sudo $TC filter add dev $ETH parent ffff: prio 2 protocol ip \
> u32 match u32 0 0 flowid 1:1 \
> action drop
> #b) drop if src is XXX
> sudo $TC filter add dev $ETH parent ffff: prio 2 protocol ip \
> u32 match ip src 192.168.100.1 flowid 1:1 \
> action drop
>
Jesper Dangaard Brouer Aug. 29, 2016, 3:55 p.m. UTC | #9
Hi Jamal,

I'm adding a "drop a specific UDP port" option to my script... but it
does not match/drop the packets. The command below is accepted, yet it
does not work in practice:

$ ./tc_ingress_drop.sh --verbose --dev mlx5p2 --port 9
tc qdisc del dev mlx5p2 ingress
tc qdisc add dev mlx5p2 ingress
tc filter add dev mlx5p2 parent ffff: prio 4 protocol ip u32 match ip protocol 17 0xff match udp dst 9 0xffff flowid 1:1 action drop

(Use-case is obviously to drop pktgen UDP packets.)

I also tried with:

 tc filter add dev mlx5p2 parent ffff: prio 4 protocol ip \
  u32 \
  match udp dst 9 0xffff \
  match ip protocol 17 0xff flowid 1:1 action drop

--Jesper
(top post)

On Mon, 29 Aug 2016 15:39:05 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Mon, 29 Aug 2016 06:53:53 -0400
> Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> 
> > On 16-08-29 04:30 AM, Jesper Dangaard Brouer wrote:
> >   
> > > Hi Jamal,
> > >
> > > Can you please provide a simple "tc" command that implements "tc drop"?
> > >
> > > Then, I'll add this to the series of tests I'm using for (what I call)
> > > "zoom-in" benchmarking.
> > >    
> > 
> > Thanks Jesper.  
> 
> I've created a script called tc_ingress_drop.sh[1] which uses the
> commands you provided below.  Now people can easily use this script to
> perform the benchmark you were requesting ;-)
> 
> [1] https://github.com/netoptimizer/network-testing/blob/master/bin/tc_ingress_drop.sh
> 
> Example to enable dropping:
> 
>  $ ./tc_ingress_drop.sh --dev mlx5p2 --verbose
>  # (Not root, running with sudo)
>  # Flush existing ingress qdisc on device :mlx5p2
>  tc qdisc del dev mlx5p2 ingress
>  tc qdisc add dev mlx5p2 ingress
>  # Simply drop all ingress packets on device: mlx5p2
>  tc filter add dev mlx5p2 parent ffff: prio 2 protocol ip u32 match u32 0 0 flowid 1:1 action drop
> 
> Example to disable again:
>  ./tc_ingress_drop.sh --dev mlx5p2 --flush
> 
>  
> > Something simple, since this is done in ingress; let's say drop ICMP
> > packets:
> > 
> > export ETH=eth0
> > export TC=/sbin/tc
> > #delete existing ingress qdisc - flushes all filters/actions
> > sudo $TC qdisc del dev $ETH ingress
> > #re-add ingress
> > sudo $TC qdisc add dev $ETH ingress
> > #
> > #simple rule to drop all icmp
> > sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \
> > u32 match ip protocol 1 0xff flowid 1:1 \
> > action drop
> > 
> > # other type of filters if you want to compare instead of above
> > #
> > # a)drop all
> > sudo $TC filter add dev $ETH parent ffff: prio 2 protocol ip \
> > u32 match u32 0 0 flowid 1:1 \
> > action drop
> > #b) drop if src is XXX
> > sudo $TC filter add dev $ETH parent ffff: prio 2 protocol ip \
> > u32 match ip src 192.168.100.1 flowid 1:1 \
> > action drop
> >
Jamal Hadi Salim Aug. 30, 2016, 12:13 p.m. UTC | #10
On 16-08-29 11:55 AM, Jesper Dangaard Brouer wrote:
> tc filter add dev mlx5p2 parent ffff: prio 4 protocol ip u32 match ip protocol 17 0xff match udp dst 9 0xffff flowid 1:1 action

Syntax is a little more convoluted  than that ;->. Try:

sudo tc filter add dev eth0 parent ffff: prio 4 protocol ip u32 \
match ip protocol 17 0xff \
match ip dport 1900 0xffff \
flowid 1:1 \
action drop

Note, this will be more cycles than drop all.

cheers,
jamal
Jesper Dangaard Brouer Aug. 30, 2016, 1:31 p.m. UTC | #11
On Tue, 30 Aug 2016 08:13:15 -0400 Jamal Hadi Salim <jhs@mojatatu.com> wrote:

> On 16-08-29 11:55 AM, Jesper Dangaard Brouer wrote:
> > tc filter add dev mlx5p2 parent ffff: prio 4 protocol ip u32 match ip protocol 17 0xff match udp dst 9 0xffff flowid 1:1 action  
> 
> Syntax is a little more convoluted  than that ;->. Try:
> 
> sudo tc filter add dev eth0 parent ffff: prio 4 protocol ip u32 \
> match ip protocol 17 0xff \
> match ip dport 1900 0xffff \
> flowid 1:1 \
> action drop

I think I figured out why match "udp dst" does not work.  It seems to
depend on "nexthdr+0", which is an implicit variable that for unknown
reasons is not set in my original rule (above).

Before your suggestion I managed to match the UDP port by manually
defining the offset, assuming an IP header is 20 bytes (no options),
like:

tc filter add dev $device parent ffff: prio 4 protocol ip \
	u32 \
	match ip protocol 17 0xff \
	match udp dst $udp_port 0xffff at 21\
	flowid 1:1 \
	action drop

Your solution with "ip dport" also works, but the tc-u32(8) man page
also has a warning about "ip dport" size assumptions...

Updated my script to use "u32 match ip port":
 https://github.com/netoptimizer/network-testing/commit/6449f6beb4d2

> Note, this will be more cycles than drop all.

Yes, that is the point ;-) XDP also does header parsing...
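
For comparison, the XDP-side equivalent of the u32 rule above (parse
Ethernet/IP/UDP, drop a given destination port) would look roughly like
the sketch below in restricted C; it makes the same no-IP-options
assumption as the fixed-offset u32 match:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/udp.h>

#define SEC(name) __attribute__((section(name), used))

SEC("xdp")
int xdp_drop_udp_dst9(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = (void *)(eth + 1);
	struct udphdr *udph = (void *)(iph + 1); /* assumes no IP options */

	/* One bounds check at the deepest offset covers all three
	 * headers for the verifier.
	 */
	if ((void *)(udph + 1) > data_end)
		return XDP_PASS;
	if (eth->h_proto != __constant_htons(ETH_P_IP))
		return XDP_PASS;
	if (iph->protocol != IPPROTO_UDP)
		return XDP_PASS;
	if (udph->dest == __constant_htons(9)) /* same port as above */
		return XDP_DROP;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";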
John Fastabend Sept. 1, 2016, 7:33 p.m. UTC | #12
[...]

> I think, given how ancient the e1000 is, we may see the driver itself
> being a contributing overhead. I believe XDP, given its location, will
> do well - but for this kind of driver my gut feeling is that it won't
> be by a large margin.
> 

Right, so I just ran the spread of baseline, XDP, and tc tests, and it's
all more or less in the noise; where you drop the packets doesn't matter
much. At least in my setup, where I'm running e1000 in a VM backed by a
tap device. I don't have a physical e1000 in my system at the moment to
test.

I still think this code is valuable though, because it lets me run
the same XDP program in an e1000 VM that I'm running on my 10/40Gbps
NIC. It also gives me a nice test platform to work on.

I'm going to resubmit the patch with a couple of the fixes pointed out
by others.
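
For anyone reproducing the drop comparison, a drop-everything program
is all the XDP side needs; a minimal sketch in restricted C (compiled
with clang -target bpf), mirroring the "drop all" u32 rule earlier in
the thread:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

/* Unconditionally drop every packet that reaches the hook. */
SEC("xdp")
int xdp_drop_all(struct xdp_md *ctx)
{
	return XDP_DROP;
}

char _license[] SEC("license") = "GPL";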

Thanks,
John
John Fastabend Sept. 1, 2016, 9:35 p.m. UTC | #13
On 16-08-30 06:31 AM, Jesper Dangaard Brouer wrote:
> On Tue, 30 Aug 2016 08:13:15 -0400 Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> 
>> On 16-08-29 11:55 AM, Jesper Dangaard Brouer wrote:
>>> tc filter add dev mlx5p2 parent ffff: prio 4 protocol ip u32 match ip protocol 17 0xff match udp dst 9 0xffff flowid 1:1 action  
>>
>> Syntax is a little more convoluted  than that ;->. Try:
>>
>> sudo tc filter add dev eth0 parent ffff: prio 4 protocol ip u32 \
>> match ip protocol 17 0xff \
>> match ip dport 1900 0xffff \
>> flowid 1:1 \
>> action drop
> 
> I think I figured out why match "udp dst" does not work.  It seems to
> depend on "nexthdr+0", which is an implicit variable that for unknown
> reasons is not set in my original rule (above).
> 
> Before your suggestion I managed to match the UDP port by manually
> defining the offset, assuming an IP header is 20 bytes (no options),
> like:
> 
> tc filter add dev $device parent ffff: prio 4 protocol ip \
> 	u32 \
> 	match ip protocol 17 0xff \
> 	match udp dst $udp_port 0xffff at 21\
> 	flowid 1:1 \
> 	action drop
> 
> Your solution with "ip dport" also works, but the tc-u32(8) man page
> also has a warning about "ip dport" size assumptions...
> 
> Updated my script to use "u32 match ip port":
>  https://github.com/netoptimizer/network-testing/commit/6449f6beb4d2
> 

FWIW the 'udp dst' notation is quite fragile in that it only reads an
offset into the packet where a UDP dst port might be. More robust
solutions require the use of links.

I have a wrapper tool around the 'link' creation part of u32 that we
can probably show off at netconf. :)


>> Note, this will be more cycles than drop all.
> 
> Yes, that is the point ;-) XDP also does header parsing...
>

Patch

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index d7bdea7..62e33fa 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -279,6 +279,7 @@  struct e1000_adapter {
 			     struct e1000_rx_ring *rx_ring,
 			     int cleaned_count);
 	struct e1000_rx_ring *rx_ring;      /* One per active queue */
+	struct bpf_prog *prog;
 	struct napi_struct napi;
 
 	int num_tx_queues;
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index f42129d..396b0e0 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -32,6 +32,7 @@ 
 #include <linux/prefetch.h>
 #include <linux/bitops.h>
 #include <linux/if_vlan.h>
+#include <linux/bpf.h>
 
 char e1000_driver_name[] = "e1000";
 static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
@@ -842,6 +843,44 @@  static int e1000_set_features(struct net_device *netdev,
 	return 0;
 }
 
+static int e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
+{
+	struct e1000_adapter *adapter = netdev_priv(netdev);
+	struct bpf_prog *old_prog;
+
+	old_prog = xchg(&adapter->prog, prog);
+	if (old_prog) {
+		synchronize_net();
+		bpf_prog_put(old_prog);
+	}
+
+	if (netif_running(netdev))
+		e1000_reinit_locked(adapter);
+	else
+		e1000_reset(adapter);
+	return 0;
+}
+
+static bool e1000_xdp_attached(struct net_device *dev)
+{
+	struct e1000_adapter *priv = netdev_priv(dev);
+
+	return !!priv->prog;
+}
+
+static int e1000_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return e1000_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = e1000_xdp_attached(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops e1000_netdev_ops = {
 	.ndo_open		= e1000_open,
 	.ndo_stop		= e1000_close,
@@ -860,6 +899,7 @@  static const struct net_device_ops e1000_netdev_ops = {
 #endif
 	.ndo_fix_features	= e1000_fix_features,
 	.ndo_set_features	= e1000_set_features,
+	.ndo_xdp		= e1000_xdp,
 };
 
 /**
@@ -1276,6 +1316,9 @@  static void e1000_remove(struct pci_dev *pdev)
 	e1000_down_and_stop(adapter);
 	e1000_release_manageability(adapter);
 
+	if (adapter->prog)
+		bpf_prog_put(adapter->prog);
+
 	unregister_netdev(netdev);
 
 	e1000_phy_hw_reset(hw);
@@ -1859,7 +1902,7 @@  static void e1000_configure_rx(struct e1000_adapter *adapter)
 	struct e1000_hw *hw = &adapter->hw;
 	u32 rdlen, rctl, rxcsum;
 
-	if (adapter->netdev->mtu > ETH_DATA_LEN) {
+	if (adapter->netdev->mtu > ETH_DATA_LEN || adapter->prog) {
 		rdlen = adapter->rx_ring[0].count *
 			sizeof(struct e1000_rx_desc);
 		adapter->clean_rx = e1000_clean_jumbo_rx_irq;
@@ -3298,6 +3341,61 @@  static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
+static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
+				struct e1000_rx_buffer *rx_buffer_info,
+				unsigned int len)
+{
+	struct e1000_tx_buffer *buffer_info;
+	unsigned int i = tx_ring->next_to_use;
+
+	buffer_info = &tx_ring->buffer_info[i];
+
+	buffer_info->length = len;
+	buffer_info->time_stamp = jiffies;
+	buffer_info->mapped_as_page = false;
+	buffer_info->dma = rx_buffer_info->dma;
+	buffer_info->next_to_watch = i;
+
+	tx_ring->buffer_info[i].skb = NULL;
+	tx_ring->buffer_info[i].segs = 1;
+	tx_ring->buffer_info[i].bytecount = len;
+	tx_ring->buffer_info[i].next_to_watch = i;
+}
+
+static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
+				 unsigned int len,
+				 struct net_device *netdev,
+				 struct e1000_adapter *adapter)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
+	struct e1000_hw *hw = &adapter->hw;
+	struct e1000_tx_ring *tx_ring;
+
+	if (len > E1000_MAX_DATA_PER_TXD)
+		return;
+
+	/* e1000 only support a single txq at the moment so the queue is being
+	 * shared with stack. To support this requires locking to ensure the
+	 * stack and XPD are not running at the same time. Devices would
+	 * multiple queues should allocate a separate queue space.
+	 */
+	HARD_TX_LOCK(netdev, txq, smp_processor_id());
+
+	tx_ring = adapter->tx_ring;
+
+	if (E1000_DESC_UNUSED(tx_ring) < 2)
+		return;
+
+	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
+
+	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
+
+	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
+	mmiowb();
+
+	HARD_TX_UNLOCK(netdev, txq);
+}
+
 #define NUM_REGS 38 /* 1 based count */
 static void e1000_regdump(struct e1000_adapter *adapter)
 {
@@ -4142,6 +4240,22 @@  static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
 	return skb;
 }
 
+static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
+				 unsigned int length)
+{
+	struct xdp_buff xdp;
+	int ret;
+
+	xdp.data = data;
+	xdp.data_end = data + length;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, (void *)&xdp);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /**
  * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy
  * @adapter: board private structure
@@ -4160,12 +4274,15 @@  static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 	struct pci_dev *pdev = adapter->pdev;
 	struct e1000_rx_desc *rx_desc, *next_rxd;
 	struct e1000_rx_buffer *buffer_info, *next_buffer;
+	struct bpf_prog *prog;
 	u32 length;
 	unsigned int i;
 	int cleaned_count = 0;
 	bool cleaned = false;
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 
+	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
+	prog = READ_ONCE(adapter->prog);
 	i = rx_ring->next_to_clean;
 	rx_desc = E1000_RX_DESC(*rx_ring, i);
 	buffer_info = &rx_ring->buffer_info[i];
@@ -4188,15 +4305,57 @@  static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 		prefetch(next_rxd);
 
 		next_buffer = &rx_ring->buffer_info[i];
-
 		cleaned = true;
 		cleaned_count++;
+		length = le16_to_cpu(rx_desc->length);
+
+		if (prog) {
+			struct page *p = buffer_info->rxbuf.page;
+			dma_addr_t dma = buffer_info->dma;
+			int act;
+
+			if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
+				/* attached bpf disallows larger than page
+				 * packets, so this is hw error or corruption
+				 */
+				pr_info_once("%s buggy !eop\n", netdev->name);
+				break;
+			}
+			if (unlikely(rx_ring->rx_skb_top)) {
+				pr_info_once("%s ring resizing bug\n",
+					     netdev->name);
+				break;
+			}
+			dma_sync_single_for_cpu(&pdev->dev, dma,
+						length, DMA_FROM_DEVICE);
+			act = e1000_call_bpf(prog, page_address(p), length);
+			switch (act) {
+			case XDP_PASS:
+				break;
+			case XDP_TX:
+				dma_sync_single_for_device(&pdev->dev,
+							   dma,
+							   length,
+							   DMA_TO_DEVICE);
+				e1000_xmit_raw_frame(buffer_info, length,
+						     netdev, adapter);
+			/* Fallthrough to re-use mappedg page after xmit */
+			case XDP_DROP:
+			default:
+				/* re-use mapped page. keep buffer_info->dma
+				 * as-is, so that e1000_alloc_jumbo_rx_buffers
+				 * only needs to put it back into rx ring
+				 */
+				total_rx_bytes += length;
+				total_rx_packets++;
+				goto next_desc;
+			}
+		}
+
 		dma_unmap_page(&pdev->dev, buffer_info->dma,
 			       adapter->rx_buffer_len, DMA_FROM_DEVICE);
 		buffer_info->dma = 0;
 
-		length = le16_to_cpu(rx_desc->length);
-
 		/* errors is only valid for DD + EOP descriptors */
 		if (unlikely((status & E1000_RXD_STAT_EOP) &&
 		    (rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK))) {
@@ -4330,6 +4489,7 @@  next_desc:
 		rx_desc = next_rxd;
 		buffer_info = next_buffer;
 	}
+	rcu_read_unlock();
 	rx_ring->next_to_clean = i;
 
 	cleaned_count = E1000_DESC_UNUSED(rx_ring);