
[net-next,v2,2/2] e1000: bundle xdp xmit routines

Message ID 20160909212938.4001.40540.stgit@john-Precision-Tower-5810
State Awaiting Upstream, archived
Delegated to: David Miller

Commit Message

John Fastabend Sept. 9, 2016, 9:29 p.m. UTC
e1000 supports a single TX queue, so it is shared with the stack
when XDP runs the XDP_TX action. This requires taking the xmit lock to
ensure we don't corrupt the tx ring. To avoid taking and dropping the
lock per packet, this patch adds a bundling implementation that submits
a bundle of packets to the xmit routine.

I tested this patch running e1000 in a VM using KVM over a tap
device using pktgen to generate traffic along with 'ping -f -l 100'.

Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |   10 +++
 drivers/net/ethernet/intel/e1000/e1000_main.c |   81 +++++++++++++++++++------
 2 files changed, 71 insertions(+), 20 deletions(-)

Comments

John Fastabend Sept. 9, 2016, 11:37 p.m. UTC | #1
On 16-09-09 02:29 PM, John Fastabend wrote:
> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
> 
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
> 
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---

This patch is a bit bogus in a few spots as well...


> -
> -	if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -		HARD_TX_UNLOCK(netdev, txq);
> -		return;
> +	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
> +		e1000_xmit_raw_frame(buffer_info[i].buffer,
> +				     buffer_info[i].length,
> +				     adapter, tx_ring);
> +		buffer_info[i].buffer->rxbuf.page = NULL;
> +		buffer_info[i].buffer = NULL;
> +		buffer_info[i].length = 0;
> +		i++;
              ^^^^^^^^
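
(A corrected version would presumably just drop the duplicate increment
and let the for statement advance i -- untested sketch:)

	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
		e1000_xmit_raw_frame(buffer_info[i].buffer,
				     buffer_info[i].length,
				     adapter, tx_ring);
		buffer_info[i].buffer->rxbuf.page = NULL;
		buffer_info[i].buffer = NULL;
		buffer_info[i].length = 0;
	}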
Tom Herbert Sept. 9, 2016, 11:44 p.m. UTC | #2
On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
>
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
>
Hi John,

How does this interact with BQL on e1000?

Tom

> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |   10 +++
>  drivers/net/ethernet/intel/e1000/e1000_main.c |   81 +++++++++++++++++++------
>  2 files changed, 71 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index 5cf8a0a..877b377 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -133,6 +133,8 @@ struct e1000_adapter;
>  #define E1000_TX_QUEUE_WAKE    16
>  /* How many Rx Buffers do we bundle into one write to the hardware ? */
>  #define E1000_RX_BUFFER_WRITE  16 /* Must be power of 2 */
> +/* How many XDP XMIT buffers to bundle into one xmit transaction */
> +#define E1000_XDP_XMIT_BUNDLE_MAX E1000_RX_BUFFER_WRITE
>
>  #define AUTO_ALL_MODES         0
>  #define E1000_EEPROM_82544_APM 0x0004
> @@ -168,6 +170,11 @@ struct e1000_rx_buffer {
>         dma_addr_t dma;
>  };
>
> +struct e1000_rx_buffer_bundle {
> +       struct e1000_rx_buffer *buffer;
> +       u32 length;
> +};
> +
>  struct e1000_tx_ring {
>         /* pointer to the descriptor ring memory */
>         void *desc;
> @@ -206,6 +213,9 @@ struct e1000_rx_ring {
>         struct e1000_rx_buffer *buffer_info;
>         struct sk_buff *rx_skb_top;
>
> +       /* array of XDP buffer information structs */
> +       struct e1000_rx_buffer_bundle *xdp_buffer;
> +
>         /* cpu for rx queue */
>         int cpu;
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 91d5c87..b985271 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>         struct pci_dev *pdev = adapter->pdev;
>         int size, desc_len;
>
> +       size = sizeof(struct e1000_rx_buffer_bundle) *
> +                       E1000_XDP_XMIT_BUNDLE_MAX;
> +       rxdr->xdp_buffer = vzalloc(size);
> +       if (!rxdr->xdp_buffer)
> +               return -ENOMEM;
> +
>         size = sizeof(struct e1000_rx_buffer) * rxdr->count;
>         rxdr->buffer_info = vzalloc(size);
> -       if (!rxdr->buffer_info)
> +       if (!rxdr->buffer_info) {
> +               vfree(rxdr->xdp_buffer);
>                 return -ENOMEM;
> +       }
>
>         desc_len = sizeof(struct e1000_rx_desc);
>
> @@ -1754,6 +1762,7 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>                                         GFP_KERNEL);
>         if (!rxdr->desc) {
>  setup_rx_desc_die:
> +               vfree(rxdr->xdp_buffer);
>                 vfree(rxdr->buffer_info);
>                 return -ENOMEM;
>         }
> @@ -2087,6 +2096,9 @@ static void e1000_free_rx_resources(struct e1000_adapter *adapter,
>
>         e1000_clean_rx_ring(adapter, rx_ring);
>
> +       vfree(rx_ring->xdp_buffer);
> +       rx_ring->xdp_buffer = NULL;
> +
>         vfree(rx_ring->buffer_info);
>         rx_ring->buffer_info = NULL;
>
> @@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
>  }
>
>  static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> -                                unsigned int len,
> -                                struct net_device *netdev,
> -                                struct e1000_adapter *adapter)
> +                                __u32 len,
> +                                struct e1000_adapter *adapter,
> +                                struct e1000_tx_ring *tx_ring)
>  {
> -       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> -       struct e1000_hw *hw = &adapter->hw;
> -       struct e1000_tx_ring *tx_ring;
> -
>         if (len > E1000_MAX_DATA_PER_TXD)
>                 return;
>
> +       if (E1000_DESC_UNUSED(tx_ring) < 2)
> +               return;
> +
> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +}
> +
> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
> +                                 struct net_device *netdev,
> +                                 struct e1000_adapter *adapter)
> +{
> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +       struct e1000_tx_ring *tx_ring = adapter->tx_ring;
> +       struct e1000_hw *hw = &adapter->hw;
> +       int i = 0;
> +
>         /* e1000 only support a single txq at the moment so the queue is being
>          * shared with stack. To support this requires locking to ensure the
>          * stack and XDP are not running at the same time. Devices with
>          * multiple queues should allocate a separate queue space.
> +        *
> +        * To amortize the locking cost e1000 bundles the xmits and sends as
> +        * many as possible until either running out of descriptors or failing.
>          */
>         HARD_TX_LOCK(netdev, txq, smp_processor_id());
>
> -       tx_ring = adapter->tx_ring;
> -
> -       if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -               HARD_TX_UNLOCK(netdev, txq);
> -               return;
> +       for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
> +               e1000_xmit_raw_frame(buffer_info[i].buffer,
> +                                    buffer_info[i].length,
> +                                    adapter, tx_ring);
> +               buffer_info[i].buffer->rxbuf.page = NULL;
> +               buffer_info[i].buffer = NULL;
> +               buffer_info[i].length = 0;
> +               i++;
>         }
>
> -       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> -       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +       /* kick hardware to send bundle and return control back to the stack */
> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +       mmiowb();
>
>         writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>         mmiowb();
> @@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>         struct bpf_prog *prog;
>         u32 length;
>         unsigned int i;
> -       int cleaned_count = 0;
> +       int cleaned_count = 0, xdp_xmit = 0;
>         bool cleaned = false;
>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> +       struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
>
>         rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>         prog = READ_ONCE(adapter->prog);
> @@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>                         case XDP_PASS:
>                                 break;
>                         case XDP_TX:
> +                               xdp_bundle[xdp_xmit].buffer = buffer_info;
> +                               xdp_bundle[xdp_xmit].length = length;
>                                 dma_sync_single_for_device(&pdev->dev,
>                                                            dma,
>                                                            length,
>                                                            DMA_TO_DEVICE);
> -                               e1000_xmit_raw_frame(buffer_info, length,
> -                                                    netdev, adapter);
> -                               buffer_info->rxbuf.page = NULL;
> +                               xdp_xmit++;
>                         case XDP_DROP:
>                         default:
>                                 /* re-use mapped page. keep buffer_info->dma
> @@ -4486,8 +4518,14 @@ next_desc:
>
>                 /* return some buffers to hardware, one at a time is too slow */
>                 if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) {
> +                       if (xdp_xmit)
> +                               e1000_xdp_xmit_bundle(xdp_bundle,
> +                                                     netdev,
> +                                                     adapter);
> +
>                         adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
>                         cleaned_count = 0;
> +                       xdp_xmit = 0;
>                 }
>
>                 /* use prefetched values */
> @@ -4498,8 +4536,11 @@ next_desc:
>         rx_ring->next_to_clean = i;
>
>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
> -       if (cleaned_count)
> +       if (cleaned_count) {
> +               if (xdp_xmit)
> +                       e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
>                 adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
> +       }
>
>         adapter->total_rx_packets += total_rx_packets;
>         adapter->total_rx_bytes += total_rx_bytes;
>
John Fastabend Sept. 10, 2016, 12:01 a.m. UTC | #3
On 16-09-09 04:44 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> e1000 supports a single TX queue so it is being shared with the stack
>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> lock per packet this patch adds a bundling implementation to submit
>> a bundle of packets to the xmit routine.
>>
>> I tested this patch running e1000 in a VM using KVM over a tap
>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>
> Hi John,
> 
> How does this interact with BQL on e1000?
> 
> Tom
> 

Let me check if I have the API correct. When we enqueue a packet to
be sent we must issue a netdev_sent_queue() call and then on actual
transmission issue a netdev_completed_queue().

The patch attached here missed a few things though.

But it looks like I just need to call netdev_sent_queue() from the
e1000_xmit_raw_frame() routine and then let the tx completion logic
kick in, which will call netdev_completed_queue() correctly.

I'll need to add a check for the queue state as well. So if I do these
three things,

	check __QUEUE_STATE_XOFF before sending
	netdev_sent_queue() -> on XDP_TX
	netdev_completed_queue()

It should work, agree? Now, should we do this even when XDP owns the
queue? Or is this purely an issue with sharing the queue between
XDP and the stack?
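
Roughly, what I have in mind for the XDP_TX side (untested sketch; it
assumes netdev/txq are still passed down to the xmit helper, and that the
existing e1000_clean_tx_irq() completion path handles the
netdev_completed_queue() accounting):

	/* sketch: account the XDP_TX frame with BQL and respect a stopped
	 * queue; completion accounting stays in e1000_clean_tx_irq()
	 */
	if (netif_tx_queue_stopped(txq))
		return;			/* no qdisc to push back on: drop */

	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
	netdev_sent_queue(netdev, len);
	e1000_tx_queue(adapter, tx_ring, 0 /*tx_flags*/, 1);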

.John
Tom Herbert Sept. 10, 2016, 1:04 a.m. UTC | #4
On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 16-09-09 04:44 PM, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>> e1000 supports a single TX queue so it is being shared with the stack
>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>> lock per packet this patch adds a bundling implementation to submit
>>> a bundle of packets to the xmit routine.
>>>
>>> I tested this patch running e1000 in a VM using KVM over a tap
>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>
>> Hi John,
>>
>> How does this interact with BQL on e1000?
>>
>> Tom
>>
>
> Let me check if I have the API correct. When we enqueue a packet to
> be sent we must issue a netdev_sent_queue() call and then on actual
> transmission issue a netdev_completed_queue().
>
> The patch attached here missed a few things though.
>
> But it looks like I just need to call netdev_sent_queue() from the
> e1000_xmit_raw_frame() routine and then let the tx completion logic
> kick in which will call netdev_completed_queue() correctly.
>
> I'll need to add a check for the queue state as well. So if I do these
> three things,
>
>         check __QUEUE_STATE_XOFF before sending
>         netdev_sent_queue() -> on XDP_TX
>         netdev_completed_queue()
>
> It should work agree? Now should we do this even when XDP owns the
> queue? Or is this purely an issue with sharing the queue between
> XDP and stack.
>
But what is the action for XDP_TX if the queue is stopped? There is no
qdisc to apply back pressure in the XDP path. Would we just start dropping
packets then?

Tom

> .John
>
John Fastabend Sept. 10, 2016, 1:12 a.m. UTC | #5
On 16-09-09 06:04 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>> lock per packet this patch adds a bundling implementation to submit
>>>> a bundle of packets to the xmit routine.
>>>>
>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>
>>> Hi John,
>>>
>>> How does this interact with BQL on e1000?
>>>
>>> Tom
>>>
>>
>> Let me check if I have the API correct. When we enqueue a packet to
>> be sent we must issue a netdev_sent_queue() call and then on actual
>> transmission issue a netdev_completed_queue().
>>
>> The patch attached here missed a few things though.
>>
>> But it looks like I just need to call netdev_sent_queue() from the
>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>> kick in which will call netdev_completed_queue() correctly.
>>
>> I'll need to add a check for the queue state as well. So if I do these
>> three things,
>>
>>         check __QUEUE_STATE_XOFF before sending
>>         netdev_sent_queue() -> on XDP_TX
>>         netdev_completed_queue()
>>
>> It should work agree? Now should we do this even when XDP owns the
>> queue? Or is this purely an issue with sharing the queue between
>> XDP and stack.
>>
> But what is the action for XDP_TX if the queue is stopped? There is no
> qdisc to back pressure in the XDP path. Would we just start dropping
> packets then?

Yep, that is what the patch does: if there is any sort of error, packets
get dropped on the floor. I don't think there is anything else that
can be done.

> 
> Tom
> 
>> .John
>>
Tom Herbert Sept. 10, 2016, 1:19 a.m. UTC | #6
On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 16-09-09 06:04 PM, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>> a bundle of packets to the xmit routine.
>>>>>
>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>
>>>> Hi John,
>>>>
>>>> How does this interact with BQL on e1000?
>>>>
>>>> Tom
>>>>
>>>
>>> Let me check if I have the API correct. When we enqueue a packet to
>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>> transmission issue a netdev_completed_queue().
>>>
>>> The patch attached here missed a few things though.
>>>
>>> But it looks like I just need to call netdev_sent_queue() from the
>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>> kick in which will call netdev_completed_queue() correctly.
>>>
>>> I'll need to add a check for the queue state as well. So if I do these
>>> three things,
>>>
>>>         check __QUEUE_STATE_XOFF before sending
>>>         netdev_sent_queue() -> on XDP_TX
>>>         netdev_completed_queue()
>>>
>>> It should work agree? Now should we do this even when XDP owns the
>>> queue? Or is this purely an issue with sharing the queue between
>>> XDP and stack.
>>>
>> But what is the action for XDP_TX if the queue is stopped? There is no
>> qdisc to back pressure in the XDP path. Would we just start dropping
>> packets then?
>
> Yep that is what the patch does if there is any sort of error packets
> get dropped on the floor. I don't think there is anything else that
> can be done.
>
That probably means that the stack will always win out under load.
Trying to use the same queue where half of the packets are well
managed by a qdisc and half aren't is going to leave someone unhappy.
Maybe in this case where we have to share the qdisc we can
allocate the skb on returning XDP_TX and send it through the normal
qdisc for the device.
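
Roughly something like the following -- a hand-wavy sketch that copies
the frame into an skb rather than touching the tx ring directly (the
helper name and placement are illustrative, not tied to the e1000
internals):

	static void e1000_xdp_tx_via_stack(struct net_device *netdev,
					   struct e1000_rx_buffer *buffer_info,
					   u32 length)
	{
		struct sk_buff *skb = netdev_alloc_skb(netdev, length);

		if (!skb)
			return;				/* out of memory: drop */

		/* copy for simplicity; zero-copy would need page refcounting */
		memcpy(skb_put(skb, length),
		       page_address(buffer_info->rxbuf.page), length);
		skb->protocol = eth_type_trans(skb, netdev); /* also sets skb->dev */
		skb_push(skb, ETH_HLEN);	/* restore mac header for xmit */
		dev_queue_xmit(skb);		/* goes through the device qdisc */
	}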

Tom

>>
>> Tom
>>
>>> .John
>>>
>
Alexei Starovoitov Sept. 10, 2016, 1:40 a.m. UTC | #7
On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> > On 16-09-09 06:04 PM, Tom Herbert wrote:
> >> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> >>> On 16-09-09 04:44 PM, Tom Herbert wrote:
> >>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> >>>>> e1000 supports a single TX queue so it is being shared with the stack
> >>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
> >>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> >>>>> lock per packet this patch adds a bundling implementation to submit
> >>>>> a bundle of packets to the xmit routine.
> >>>>>
> >>>>> I tested this patch running e1000 in a VM using KVM over a tap
> >>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
> >>>>>
> >>>> Hi John,
> >>>>
> >>>> How does this interact with BQL on e1000?
> >>>>
> >>>> Tom
> >>>>
> >>>
> >>> Let me check if I have the API correct. When we enqueue a packet to
> >>> be sent we must issue a netdev_sent_queue() call and then on actual
> >>> transmission issue a netdev_completed_queue().
> >>>
> >>> The patch attached here missed a few things though.
> >>>
> >>> But it looks like I just need to call netdev_sent_queue() from the
> >>> e1000_xmit_raw_frame() routine and then let the tx completion logic
> >>> kick in which will call netdev_completed_queue() correctly.
> >>>
> >>> I'll need to add a check for the queue state as well. So if I do these
> >>> three things,
> >>>
> >>>         check __QUEUE_STATE_XOFF before sending
> >>>         netdev_sent_queue() -> on XDP_TX
> >>>         netdev_completed_queue()
> >>>
> >>> It should work agree? Now should we do this even when XDP owns the
> >>> queue? Or is this purely an issue with sharing the queue between
> >>> XDP and stack.
> >>>
> >> But what is the action for XDP_TX if the queue is stopped? There is no
> >> qdisc to back pressure in the XDP path. Would we just start dropping
> >> packets then?
> >
> > Yep that is what the patch does if there is any sort of error packets
> > get dropped on the floor. I don't think there is anything else that
> > can be done.
> >
> That probably means that the stack will always win out under load.
> Trying to used the same queue where half of the packets are well
> managed by a qdisc and half aren't is going to leave someone unhappy.
> Maybe in the this case where we have to share the qdisc we can
> allocate the skb on on returning XDP_TX and send through the normal
> qdisc for the device.

I wouldn't go to such extremes for e1k.
The only reason to have xdp in e1k is to use it for testing
xdp programs. Nothing else. e1k is, best case, a 1Gbps adapter.
The existing stack with skb is perfectly fine as it is.
No need to do recycling, batching or any other complex things.
xdp for e1k cannot be used as an example for other drivers either,
since there is only one tx ring and any high-performance adapter
has more, which makes the driver support quite different.
Tom Herbert Sept. 10, 2016, 3:12 a.m. UTC | #8
On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> > On 16-09-09 06:04 PM, Tom Herbert wrote:
>> >> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> >>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>> >>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> >>>>> e1000 supports a single TX queue so it is being shared with the stack
>> >>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> >>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> >>>>> lock per packet this patch adds a bundling implementation to submit
>> >>>>> a bundle of packets to the xmit routine.
>> >>>>>
>> >>>>> I tested this patch running e1000 in a VM using KVM over a tap
>> >>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>> >>>>>
>> >>>> Hi John,
>> >>>>
>> >>>> How does this interact with BQL on e1000?
>> >>>>
>> >>>> Tom
>> >>>>
>> >>>
>> >>> Let me check if I have the API correct. When we enqueue a packet to
>> >>> be sent we must issue a netdev_sent_queue() call and then on actual
>> >>> transmission issue a netdev_completed_queue().
>> >>>
>> >>> The patch attached here missed a few things though.
>> >>>
>> >>> But it looks like I just need to call netdev_sent_queue() from the
>> >>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>> >>> kick in which will call netdev_completed_queue() correctly.
>> >>>
>> >>> I'll need to add a check for the queue state as well. So if I do these
>> >>> three things,
>> >>>
>> >>>         check __QUEUE_STATE_XOFF before sending
>> >>>         netdev_sent_queue() -> on XDP_TX
>> >>>         netdev_completed_queue()
>> >>>
>> >>> It should work agree? Now should we do this even when XDP owns the
>> >>> queue? Or is this purely an issue with sharing the queue between
>> >>> XDP and stack.
>> >>>
>> >> But what is the action for XDP_TX if the queue is stopped? There is no
>> >> qdisc to back pressure in the XDP path. Would we just start dropping
>> >> packets then?
>> >
>> > Yep that is what the patch does if there is any sort of error packets
>> > get dropped on the floor. I don't think there is anything else that
>> > can be done.
>> >
>> That probably means that the stack will always win out under load.
>> Trying to used the same queue where half of the packets are well
>> managed by a qdisc and half aren't is going to leave someone unhappy.
>> Maybe in the this case where we have to share the qdisc we can
>> allocate the skb on on returning XDP_TX and send through the normal
>> qdisc for the device.
>
> I wouldn't go to such extremes for e1k.
> The only reason to have xdp in e1k is to use it for testing
> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.

I imagine someone may want this for the non-forwarding use cases like
early drop for DOS mitigation. Regardless of the use case, I don't
think we can break the fundamental assumptions made for qdiscs or the
rest of the transmit path. If XDP must transmit on a queue shared with
the stack we need to abide by the stack's rules for transmitting on
the queue-- which would mean alloc skbuff and go through qdisc (which
really shouldn't be difficult to implement). Emulating various
functions of the stack in the XDP TX path, like this patch seems to be
doing for XMIT_MORE, potentially gets us into a whack-a-mole situation
trying to keep things coherent.

> Existing stack with skb is perfectly fine as it is.
> No need to do recycling, batching or any other complex things.
> xdp for e1k cannot be used as an example for other drivers either,
> since there is only one tx ring and any high performance adapter
> has more which makes the driver support quite different.
>
John Fastabend Sept. 10, 2016, 3:26 a.m. UTC | #9
On 16-09-09 08:12 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>> On 16-09-09 06:04 PM, Tom Herbert wrote:
>>>>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>>>>> a bundle of packets to the xmit routine.
>>>>>>>>
>>>>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>>>>
>>>>>>> Hi John,
>>>>>>>
>>>>>>> How does this interact with BQL on e1000?
>>>>>>>
>>>>>>> Tom
>>>>>>>
>>>>>>
>>>>>> Let me check if I have the API correct. When we enqueue a packet to
>>>>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>>>>> transmission issue a netdev_completed_queue().
>>>>>>
>>>>>> The patch attached here missed a few things though.
>>>>>>
>>>>>> But it looks like I just need to call netdev_sent_queue() from the
>>>>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>>>>> kick in which will call netdev_completed_queue() correctly.
>>>>>>
>>>>>> I'll need to add a check for the queue state as well. So if I do these
>>>>>> three things,
>>>>>>
>>>>>>         check __QUEUE_STATE_XOFF before sending
>>>>>>         netdev_sent_queue() -> on XDP_TX
>>>>>>         netdev_completed_queue()
>>>>>>
>>>>>> It should work agree? Now should we do this even when XDP owns the
>>>>>> queue? Or is this purely an issue with sharing the queue between
>>>>>> XDP and stack.
>>>>>>
>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
>>>>> packets then?
>>>>
>>>> Yep that is what the patch does if there is any sort of error packets
>>>> get dropped on the floor. I don't think there is anything else that
>>>> can be done.
>>>>
>>> That probably means that the stack will always win out under load.
>>> Trying to used the same queue where half of the packets are well
>>> managed by a qdisc and half aren't is going to leave someone unhappy.
>>> Maybe in the this case where we have to share the qdisc we can
>>> allocate the skb on on returning XDP_TX and send through the normal
>>> qdisc for the device.
>>
>> I wouldn't go to such extremes for e1k.
>> The only reason to have xdp in e1k is to use it for testing
>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
> 
> I imagine someone may want this for the non-forwarding use cases like
> early drop for DOS mitigation. Regardless of the use case, I don't
> think we can break the fundamental assumptions made for qdiscs or the
> rest of the transmit path. If XDP must transmit on a queue shared with
> the stack we need to abide by the stack's rules for transmitting on
> the queue-- which would mean alloc skbuff and go through qdisc (which

If we require XDP_TX to go up to the qdisc layer it's best not to implement
it at all and just handle it in the normal ingress path. That said, I think
users have to expect that XDP will interfere with qdisc schemes. Even
with its own tx queue it's going to interfere at the hardware level with
bandwidth as the hardware round-robins through the queues or uses
whatever hardware strategy it is configured to use. Additionally it
will bypass things like BQL, etc.

> really shouldn't be difficult to implement). Emulating various
> functions of the stack in the XDP TX path, like this patch seems to be
> doing for XMIT_MORE, potentially gets us into a wack-a-mole situation
> trying to keep things coherent.

I think bundling tx xmits is fair game as an internal optimization and
doesn't need to be exposed at the XDP layer. Drivers already do this
type of optimization for allocating buffers. It likely doesn't matter
much at the e1k level, but my gut feeling is that doing a tail update on
every pkt with the 40gbps drivers will be noticeable.


> 
>> Existing stack with skb is perfectly fine as it is.
>> No need to do recycling, batching or any other complex things.
>> xdp for e1k cannot be used as an example for other drivers either,
>> since there is only one tx ring and any high performance adapter
>> has more which makes the driver support quite different.
>>
Alexei Starovoitov Sept. 10, 2016, 3:56 a.m. UTC | #10
On Fri, Sep 09, 2016 at 08:12:52PM -0700, Tom Herbert wrote:
> >> That probably means that the stack will always win out under load.
> >> Trying to used the same queue where half of the packets are well
> >> managed by a qdisc and half aren't is going to leave someone unhappy.
> >> Maybe in the this case where we have to share the qdisc we can
> >> allocate the skb on on returning XDP_TX and send through the normal
> >> qdisc for the device.
> >
> > I wouldn't go to such extremes for e1k.
> > The only reason to have xdp in e1k is to use it for testing
> > of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
> 
> I imagine someone may want this for the non-forwarding use cases like
> early drop for DOS mitigation.

Sure and they will be doing it on the NICs that they have in their servers.
e1k is not that nic. xdp e1k is for debugging xdp programs in KVM only.
Performance of such xdp programs on e1k is irrelevant.
There is absolutely no need to complicate the driver and the patches.
All other drivers are a different story.
Tom Herbert Sept. 10, 2016, 4:13 a.m. UTC | #11
On Fri, Sep 9, 2016 at 8:26 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 16-09-09 08:12 PM, Tom Herbert wrote:
>> On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>>> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>>>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>> On 16-09-09 06:04 PM, Tom Herbert wrote:
>>>>>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>>>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>>>>>> a bundle of packets to the xmit routine.
>>>>>>>>>
>>>>>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>>>>>
>>>>>>>> Hi John,
>>>>>>>>
>>>>>>>> How does this interact with BQL on e1000?
>>>>>>>>
>>>>>>>> Tom
>>>>>>>>
>>>>>>>
>>>>>>> Let me check if I have the API correct. When we enqueue a packet to
>>>>>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>>>>>> transmission issue a netdev_completed_queue().
>>>>>>>
>>>>>>> The patch attached here missed a few things though.
>>>>>>>
>>>>>>> But it looks like I just need to call netdev_sent_queue() from the
>>>>>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>>>>>> kick in which will call netdev_completed_queue() correctly.
>>>>>>>
>>>>>>> I'll need to add a check for the queue state as well. So if I do these
>>>>>>> three things,
>>>>>>>
>>>>>>>         check __QUEUE_STATE_XOFF before sending
>>>>>>>         netdev_sent_queue() -> on XDP_TX
>>>>>>>         netdev_completed_queue()
>>>>>>>
>>>>>>> It should work agree? Now should we do this even when XDP owns the
>>>>>>> queue? Or is this purely an issue with sharing the queue between
>>>>>>> XDP and stack.
>>>>>>>
>>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
>>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
>>>>>> packets then?
>>>>>
>>>>> Yep that is what the patch does if there is any sort of error packets
>>>>> get dropped on the floor. I don't think there is anything else that
>>>>> can be done.
>>>>>
>>>> That probably means that the stack will always win out under load.
>>>> Trying to used the same queue where half of the packets are well
>>>> managed by a qdisc and half aren't is going to leave someone unhappy.
>>>> Maybe in the this case where we have to share the qdisc we can
>>>> allocate the skb on on returning XDP_TX and send through the normal
>>>> qdisc for the device.
>>>
>>> I wouldn't go to such extremes for e1k.
>>> The only reason to have xdp in e1k is to use it for testing
>>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
>>
>> I imagine someone may want this for the non-forwarding use cases like
>> early drop for DOS mitigation. Regardless of the use case, I don't
>> think we can break the fundamental assumptions made for qdiscs or the
>> rest of the transmit path. If XDP must transmit on a queue shared with
>> the stack we need to abide by the stack's rules for transmitting on
>> the queue-- which would mean alloc skbuff and go through qdisc (which
>
> If we require XDP_TX to go up to qdisc layer its best not to implement
> it at all and just handle it in normal ingress path. That said I think
> users have to expect that XDP will interfere with qdisc schemes. Even
> with its own tx queue its going to interfere at the hardware level with
> bandwidth as the hardware round robins through the queues or uses
> whatever hardware strategy it is configured to use. Additionally it
> will bypass things like BQL, etc.
>
Right, but not all use cases involve XDP_TX (like DOS mitigation as I
pointed out). Since you've already done 95% of the work, can you take
a look at creating the skbuff and injecting into the stack for XDP_TX
so we can evaluate the performance and impact of that :-)

With separate TX queues it's explicit which queues are managed by the
stack. This is no different than what kernel bypass gives us; we are
relying on HW to do something reasonable in scheduling MQ.

>> really shouldn't be difficult to implement). Emulating various
>> functions of the stack in the XDP TX path, like this patch seems to be
>> doing for XMIT_MORE, potentially gets us into a wack-a-mole situation
>> trying to keep things coherent.
>
> I think bundling tx xmits is fair game as an internal optimization and
> doesn't need to be exposed at the XDP layer. Drivers already do this
> type of optimizations for allocating buffers. It likely doesn't matter
> much at the e1k level but doing a tail update on every pkt with the
> 40gbps drivers likely will be noticeable is my gut feeling.
>
How is this different than what XMIT_MORE gives us?

Tom

>
>>
>>> Existing stack with skb is perfectly fine as it is.
>>> No need to do recycling, batching or any other complex things.
>>> xdp for e1k cannot be used as an example for other drivers either,
>>> since there is only one tx ring and any high performance adapter
>>> has more which makes the driver support quite different.
>>>
>
Tom Herbert Sept. 10, 2016, 3:36 p.m. UTC | #12
On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
>
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
>
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |   10 +++
>  drivers/net/ethernet/intel/e1000/e1000_main.c |   81 +++++++++++++++++++------
>  2 files changed, 71 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index 5cf8a0a..877b377 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -133,6 +133,8 @@ struct e1000_adapter;
>  #define E1000_TX_QUEUE_WAKE    16
>  /* How many Rx Buffers do we bundle into one write to the hardware ? */
>  #define E1000_RX_BUFFER_WRITE  16 /* Must be power of 2 */
> +/* How many XDP XMIT buffers to bundle into one xmit transaction */
> +#define E1000_XDP_XMIT_BUNDLE_MAX E1000_RX_BUFFER_WRITE
>
>  #define AUTO_ALL_MODES         0
>  #define E1000_EEPROM_82544_APM 0x0004
> @@ -168,6 +170,11 @@ struct e1000_rx_buffer {
>         dma_addr_t dma;
>  };
>
> +struct e1000_rx_buffer_bundle {
> +       struct e1000_rx_buffer *buffer;
> +       u32 length;
> +};
> +
>  struct e1000_tx_ring {
>         /* pointer to the descriptor ring memory */
>         void *desc;
> @@ -206,6 +213,9 @@ struct e1000_rx_ring {
>         struct e1000_rx_buffer *buffer_info;
>         struct sk_buff *rx_skb_top;
>
> +       /* array of XDP buffer information structs */
> +       struct e1000_rx_buffer_bundle *xdp_buffer;
> +
>         /* cpu for rx queue */
>         int cpu;
>
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 91d5c87..b985271 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>         struct pci_dev *pdev = adapter->pdev;
>         int size, desc_len;
>
> +       size = sizeof(struct e1000_rx_buffer_bundle) *
> +                       E1000_XDP_XMIT_BUNDLE_MAX;
> +       rxdr->xdp_buffer = vzalloc(size);
> +       if (!rxdr->xdp_buffer)
> +               return -ENOMEM;
> +
>         size = sizeof(struct e1000_rx_buffer) * rxdr->count;
>         rxdr->buffer_info = vzalloc(size);
> -       if (!rxdr->buffer_info)
> +       if (!rxdr->buffer_info) {
> +               vfree(rxdr->xdp_buffer);

This could be deferred until an XDP program is added.

>                 return -ENOMEM;
> +       }
>
>         desc_len = sizeof(struct e1000_rx_desc);
>
> @@ -1754,6 +1762,7 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>                                         GFP_KERNEL);
>         if (!rxdr->desc) {
>  setup_rx_desc_die:
> +               vfree(rxdr->xdp_buffer);
>                 vfree(rxdr->buffer_info);
>                 return -ENOMEM;
>         }
> @@ -2087,6 +2096,9 @@ static void e1000_free_rx_resources(struct e1000_adapter *adapter,
>
>         e1000_clean_rx_ring(adapter, rx_ring);
>
> +       vfree(rx_ring->xdp_buffer);
> +       rx_ring->xdp_buffer = NULL;
> +
>         vfree(rx_ring->buffer_info);
>         rx_ring->buffer_info = NULL;
>
> @@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
>  }
>
>  static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> -                                unsigned int len,
> -                                struct net_device *netdev,
> -                                struct e1000_adapter *adapter)
> +                                __u32 len,
> +                                struct e1000_adapter *adapter,
> +                                struct e1000_tx_ring *tx_ring)
>  {
> -       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> -       struct e1000_hw *hw = &adapter->hw;
> -       struct e1000_tx_ring *tx_ring;
> -
>         if (len > E1000_MAX_DATA_PER_TXD)
>                 return;
>
> +       if (E1000_DESC_UNUSED(tx_ring) < 2)
> +               return;
> +
> +       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +}
> +
> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
> +                                 struct net_device *netdev,
> +                                 struct e1000_adapter *adapter)
> +{
> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +       struct e1000_tx_ring *tx_ring = adapter->tx_ring;
> +       struct e1000_hw *hw = &adapter->hw;
> +       int i = 0;
> +
>         /* e1000 only support a single txq at the moment so the queue is being
>          * shared with stack. To support this requires locking to ensure the
>          * stack and XDP are not running at the same time. Devices with
>          * multiple queues should allocate a separate queue space.
> +        *
> +        * To amortize the locking cost e1000 bundles the xmits and sends as
> +        * many as possible until either running out of descriptors or failing.

Up to E1000_XDP_XMIT_BUNDLE_MAX  at least...

>          */
>         HARD_TX_LOCK(netdev, txq, smp_processor_id());
>
> -       tx_ring = adapter->tx_ring;
> -
> -       if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -               HARD_TX_UNLOCK(netdev, txq);
> -               return;
> +       for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
> +               e1000_xmit_raw_frame(buffer_info[i].buffer,
> +                                    buffer_info[i].length,
> +                                    adapter, tx_ring);
> +               buffer_info[i].buffer->rxbuf.page = NULL;
> +               buffer_info[i].buffer = NULL;
> +               buffer_info[i].length = 0;
> +               i++;
>         }
>
> -       e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> -       e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +       /* kick hardware to send bundle and return control back to the stack */
> +       writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +       mmiowb();
>
>         writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>         mmiowb();
> @@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>         struct bpf_prog *prog;
>         u32 length;
>         unsigned int i;
> -       int cleaned_count = 0;
> +       int cleaned_count = 0, xdp_xmit = 0;
>         bool cleaned = false;
>         unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> +       struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
>
>         rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>         prog = READ_ONCE(adapter->prog);
> @@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>                         case XDP_PASS:
>                                 break;
>                         case XDP_TX:
> +                               xdp_bundle[xdp_xmit].buffer = buffer_info;
> +                               xdp_bundle[xdp_xmit].length = length;
>                                 dma_sync_single_for_device(&pdev->dev,
>                                                            dma,
>                                                            length,
>                                                            DMA_TO_DEVICE);
> -                               e1000_xmit_raw_frame(buffer_info, length,
> -                                                    netdev, adapter);
> -                               buffer_info->rxbuf.page = NULL;
> +                               xdp_xmit++;
>                         case XDP_DROP:
>                         default:
>                                 /* re-use mapped page. keep buffer_info->dma
> @@ -4486,8 +4518,14 @@ next_desc:
>
>                 /* return some buffers to hardware, one at a time is too slow */
>                 if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) {
> +                       if (xdp_xmit)
> +                               e1000_xdp_xmit_bundle(xdp_bundle,
> +                                                     netdev,
> +                                                     adapter);
> +
>                         adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
>                         cleaned_count = 0;
> +                       xdp_xmit = 0;
>                 }
>
>                 /* use prefetched values */
> @@ -4498,8 +4536,11 @@ next_desc:
>         rx_ring->next_to_clean = i;
>
>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
> -       if (cleaned_count)
> +       if (cleaned_count) {
> +               if (xdp_xmit)
> +                       e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
>                 adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
> +       }

Looks good for XDP path. Is this something we can abstract out into a
library for use by other drivers?


>
>         adapter->total_rx_packets += total_rx_packets;
>         adapter->total_rx_bytes += total_rx_bytes;
>
John Fastabend Sept. 12, 2016, 3:07 a.m. UTC | #13
On 16-09-10 08:36 AM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> e1000 supports a single TX queue so it is being shared with the stack
>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> lock per packet this patch adds a bundling implementation to submit
>> a bundle of packets to the xmit routine.
>>
>> I tested this patch running e1000 in a VM using KVM over a tap
>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>
>> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---

[...]

>> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
>> index 91d5c87..b985271 100644
>> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
>> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
>> @@ -1738,10 +1738,18 @@ static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
>>         struct pci_dev *pdev = adapter->pdev;
>>         int size, desc_len;
>>
>> +       size = sizeof(struct e1000_rx_buffer_bundle) *
>> +                       E1000_XDP_XMIT_BUNDLE_MAX;
>> +       rxdr->xdp_buffer = vzalloc(size);
>> +       if (!rxdr->xdp_buffer)
>> +               return -ENOMEM;
>> +
>>         size = sizeof(struct e1000_rx_buffer) * rxdr->count;
>>         rxdr->buffer_info = vzalloc(size);
>> -       if (!rxdr->buffer_info)
>> +       if (!rxdr->buffer_info) {
>> +               vfree(rxdr->xdp_buffer);
> 
> This could be deferred until an XDP program is added.

Yep, that would be best to avoid overhead in the normal non-XDP case.
Also, I'll move the xdp prog pointer into the rx ring per Jesper's comment
that I missed in this rev.
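
Something like the following in the prog attach path instead of
unconditionally in e1000_setup_rx_resources() -- sketch only, and the
attach hook name here is illustrative rather than what patch 1/2 uses:

	static int e1000_xdp_setup(struct net_device *netdev, struct bpf_prog *prog)
	{
		struct e1000_adapter *adapter = netdev_priv(netdev);
		struct e1000_rx_ring *rxdr = adapter->rx_ring;

		/* allocate the bundle array only once a program is attached */
		if (prog && !rxdr->xdp_buffer) {
			rxdr->xdp_buffer =
				vzalloc(sizeof(struct e1000_rx_buffer_bundle) *
					E1000_XDP_XMIT_BUNDLE_MAX);
			if (!rxdr->xdp_buffer)
				return -ENOMEM;
		}
		/* ... swap in the prog as before ... */
		return 0;
	}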

[...]

>> +
>> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
>> +                                 struct net_device *netdev,
>> +                                 struct e1000_adapter *adapter)
>> +{
>> +       struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
>> +       struct e1000_tx_ring *tx_ring = adapter->tx_ring;
>> +       struct e1000_hw *hw = &adapter->hw;
>> +       int i = 0;
>> +
>>         /* e1000 only support a single txq at the moment so the queue is being
>>          * shared with stack. To support this requires locking to ensure the
>>          * stack and XDP are not running at the same time. Devices with
>>          * multiple queues should allocate a separate queue space.
>> +        *
>> +        * To amortize the locking cost e1000 bundles the xmits and sends as
>> +        * many as possible until either running out of descriptors or failing.
> 
> Up to E1000_XDP_XMIT_BUNDLE_MAX  at least...

Yep will fix comment.

[...]

>>
>>                 /* use prefetched values */
>> @@ -4498,8 +4536,11 @@ next_desc:
>>         rx_ring->next_to_clean = i;
>>
>>         cleaned_count = E1000_DESC_UNUSED(rx_ring);
>> -       if (cleaned_count)
>> +       if (cleaned_count) {
>> +               if (xdp_xmit)
>> +                       e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
>>                 adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
>> +       }
> 
> Looks good for XDP path. Is this something we can abstract out into a
> library for use by other drivers?
> 

I'm not really sure it can be abstracted much; it's a bit intertwined with
the normal rx receive path. But it should probably be a pattern that
gets copied so we avoid unnecessary tx work.

> 
>>
>>         adapter->total_rx_packets += total_rx_packets;
>>         adapter->total_rx_bytes += total_rx_bytes;
>>
John Fastabend Sept. 12, 2016, 3:15 a.m. UTC | #14
On 16-09-09 09:13 PM, Tom Herbert wrote:
> On Fri, Sep 9, 2016 at 8:26 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>> On 16-09-09 08:12 PM, Tom Herbert wrote:
>>> On Fri, Sep 9, 2016 at 6:40 PM, Alexei Starovoitov
>>> <alexei.starovoitov@gmail.com> wrote:
>>>> On Fri, Sep 09, 2016 at 06:19:56PM -0700, Tom Herbert wrote:
>>>>> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>> On 16-09-09 06:04 PM, Tom Herbert wrote:
>>>>>>> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>> On 16-09-09 04:44 PM, Tom Herbert wrote:
>>>>>>>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:
>>>>>>>>>> e1000 supports a single TX queue so it is being shared with the stack
>>>>>>>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>>>>>>>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>>>>>>>>>> lock per packet this patch adds a bundling implementation to submit
>>>>>>>>>> a bundle of packets to the xmit routine.
>>>>>>>>>>
>>>>>>>>>> I tested this patch running e1000 in a VM using KVM over a tap
>>>>>>>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>>>>>>>>>
>>>>>>>>> Hi John,
>>>>>>>>>
>>>>>>>>> How does this interact with BQL on e1000?
>>>>>>>>>
>>>>>>>>> Tom
>>>>>>>>>
>>>>>>>>
>>>>>>>> Let me check if I have the API correct. When we enqueue a packet to
>>>>>>>> be sent we must issue a netdev_sent_queue() call and then on actual
>>>>>>>> transmission issue a netdev_completed_queue().
>>>>>>>>
>>>>>>>> The patch attached here missed a few things though.
>>>>>>>>
>>>>>>>> But it looks like I just need to call netdev_sent_queue() from the
>>>>>>>> e1000_xmit_raw_frame() routine and then let the tx completion logic
>>>>>>>> kick in which will call netdev_completed_queue() correctly.
>>>>>>>>
>>>>>>>> I'll need to add a check for the queue state as well. So if I do these
>>>>>>>> three things,
>>>>>>>>
>>>>>>>>         check __QUEUE_STATE_XOFF before sending
>>>>>>>>         netdev_sent_queue() -> on XDP_TX
>>>>>>>>         netdev_completed_queue()
>>>>>>>>
>>>>>>>> It should work agree? Now should we do this even when XDP owns the
>>>>>>>> queue? Or is this purely an issue with sharing the queue between
>>>>>>>> XDP and stack.
>>>>>>>>
>>>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
>>>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
>>>>>>> packets then?
>>>>>>
>>>>>> Yep that is what the patch does if there is any sort of error packets
>>>>>> get dropped on the floor. I don't think there is anything else that
>>>>>> can be done.
>>>>>>
>>>>> That probably means that the stack will always win out under load.
>>>>> Trying to used the same queue where half of the packets are well
>>>>> managed by a qdisc and half aren't is going to leave someone unhappy.
>>>>> Maybe in the this case where we have to share the qdisc we can
>>>>> allocate the skb on on returning XDP_TX and send through the normal
>>>>> qdisc for the device.
>>>>
>>>> I wouldn't go to such extremes for e1k.
>>>> The only reason to have xdp in e1k is to use it for testing
>>>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
>>>
>>> I imagine someone may want this for the non-forwarding use cases like
>>> early drop for DOS mitigation. Regardless of the use case, I don't
>>> think we can break the fundamental assumptions made for qdiscs or the
>>> rest of the transmit path. If XDP must transmit on a queue shared with
>>> the stack we need to abide by the stack's rules for transmitting on
>>> the queue-- which would mean alloc skbuff and go through qdisc (which
>>
>> If we require XDP_TX to go up to qdisc layer its best not to implement
>> it at all and just handle it in normal ingress path. That said I think
>> users have to expect that XDP will interfere with qdisc schemes. Even
>> with its own tx queue its going to interfere at the hardware level with
>> bandwidth as the hardware round robins through the queues or uses
>> whatever hardware strategy it is configured to use. Additionally it
>> will bypass things like BQL, etc.
>>
> Right, but not all use cases involve XDP_TX (like DOS mitigation as I
> pointed out). Since you've already done 95% of the work, can you take
> a look at creating the skbuff and injecting into the stack for XDP_TX
> so we can evaluate the performance and impact of that :-)
> 
> With separate TX queues it's explicit which queues are managed by the
> stack. This is no different than what kernel bypass gives use, we are
> relying on HW to do something reasonable in scheduling MQ.
> 

How about, instead of dropping packets on xdp errors, we make the default
behavior to send the packet to the stack? Then the stack can
decide what to do with it. This is easier from the driver's perspective
and avoids creating a qdisc inject path for XDP. We could set the mark
field if the stack wants to handle XDP exceptions somehow differently.

If we really want XDP to have an inject path I think we should add
another action, XDP_QDISC_INJECT, and add some way for XDP to run
programs on exceptions, perhaps via an exception map.

In this flow, when an exception occurs in some path, the exception map
is consulted and the exception handler is run. I think it's better to
be very explicit when falling back to the stack vs. doing it
transparently.

Notice that even in the dedicated queue case exceptions can still
happen, for example when descriptors are exhausted or other transient
errors occur.
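
To make that concrete, a rough sketch of such an exception path, assuming the
frame still sits in its rx page starting at offset 0; the helper name and the
mark value are made up for illustration, not proposed API:

#include <linux/etherdevice.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define XDP_EXCEPTION_MARK 0x1  /* hypothetical mark for XDP exceptions */

/* On an XDP_TX error, hand the frame to the normal receive path instead
 * of silently dropping it, and mark it so the stack can tell it apart.
 */
static void xdp_tx_fallback_to_stack(struct net_device *netdev,
                                     struct page *page, unsigned int len)
{
        struct sk_buff *skb;

        skb = build_skb(page_address(page), PAGE_SIZE);
        if (!skb)
                return;                 /* truly out of resources: drop */

        skb_put(skb, len);
        skb->protocol = eth_type_trans(skb, netdev);
        skb->mark = XDP_EXCEPTION_MARK;

        netif_receive_skb(skb);
}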

>>> really shouldn't be difficult to implement). Emulating various
>>> functions of the stack in the XDP TX path, like this patch seems to be
>>> doing for XMIT_MORE, potentially gets us into a whack-a-mole situation
>>> trying to keep things coherent.
>>
>> I think bundling tx xmits is fair game as an internal optimization and
>> doesn't need to be exposed at the XDP layer. Drivers already do this
>> type of optimization for allocating buffers. It likely doesn't matter
>> much at the e1k level, but my gut feeling is that doing a tail update on
>> every packet with the 40Gbps drivers will be noticeable.
>>
> How is this different than what XMIT_MORE gives us?
> 

It's not, really, except there is no explicit signaling; the code path
just bundles up as many packets as it can and throws them onto the xmit
routine.
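
Distilled from the patch below (error handling and buffer cleanup elided),
the bundled xmit has roughly this shape; the point is that the lock and the
tail-register doorbell are paid once per bundle rather than once per packet:

        HARD_TX_LOCK(netdev, txq, smp_processor_id());
        for (i = 0; i < E1000_XDP_XMIT_BUNDLE_MAX && bundle[i].buffer; i++)
                e1000_xmit_raw_frame(bundle[i].buffer, bundle[i].length,
                                     adapter, tx_ring);
        /* one doorbell for the whole bundle */
        writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
        mmiowb();
        HARD_TX_UNLOCK(netdev, txq);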

> Tom
> 
>>
>>>
>>>> Existing stack with skb is perfectly fine as it is.
>>>> No need to do recycling, batching or any other complex things.
>>>> xdp for e1k cannot be used as an example for other drivers either,
>>>> since there is only one tx ring and any high performance adapter
>>>> has more which makes the driver support quite different.
>>>>
>>
Alexei Starovoitov Sept. 12, 2016, 4:12 a.m. UTC | #15
On Sun, Sep 11, 2016 at 08:15:28PM -0700, John Fastabend wrote:
> >>>>>>>>
> >>>>>>> But what is the action for XDP_TX if the queue is stopped? There is no
> >>>>>>> qdisc to back pressure in the XDP path. Would we just start dropping
> >>>>>>> packets then?
> >>>>>>
> >>>>>> Yep, that is what the patch does: if there is any sort of error, packets
> >>>>>> get dropped on the floor. I don't think there is anything else that
> >>>>>> can be done.
> >>>>>>
> >>>>> That probably means that the stack will always win out under load.
> >>>>> Trying to use the same queue where half of the packets are well
> >>>>> managed by a qdisc and half aren't is going to leave someone unhappy.
> >>>>> Maybe in this case where we have to share the queue we can
> >>>>> allocate the skb on returning XDP_TX and send through the normal
> >>>>> qdisc for the device.
> >>>>
> >>>> I wouldn't go to such extremes for e1k.
> >>>> The only reason to have xdp in e1k is to use it for testing
> >>>> of xdp programs. Nothing else. e1k is, best case, 1Gbps adapter.
> >>>
> >>> I imagine someone may want this for the non-forwarding use cases like
> >>> early drop for DOS mitigation. Regardless of the use case, I don't
> >>> think we can break the fundamental assumptions made for qdiscs or the
> >>> rest of the transmit path. If XDP must transmit on a queue shared with
> >>> the stack we need to abide by the stack's rules for transmitting on
> >>> the queue-- which would mean alloc skbuff and go through qdisc (which
> >>
> >> If we require XDP_TX to go up to the qdisc layer it's best not to implement
> >> it at all and just handle it in the normal ingress path. That said I think
> >> users have to expect that XDP will interfere with qdisc schemes. Even
> >> with its own tx queue it's going to interfere at the hardware level with
> >> bandwidth as the hardware round robins through the queues or uses
> >> whatever hardware strategy it is configured to use. Additionally it
> >> will bypass things like BQL, etc.
> >>
> > Right, but not all use cases involve XDP_TX (like DOS mitigation as I
> > pointed out). Since you've already done 95% of the work, can you take
> > a look at creating the skbuff and injecting into the stack for XDP_TX
> > so we can evaluate the performance and impact of that :-)
> > 
> > With separate TX queues it's explicit which queues are managed by the
> > stack. This is no different than what kernel bypass gives us; we are
> > relying on HW to do something reasonable in scheduling MQ.
> > 
> 
> How about, instead of dropping packets on XDP errors, we make the
> default behavior be to send the packet to the stack. Then the stack can
> decide what to do with it. This is easier from the driver's perspective
> and avoids creating a qdisc inject path for XDP. We could set the mark
> field if the stack wants to handle XDP exceptions differently.
> 
> If we really want XDP to have an inject path I think we should add
> another action, XDP_QDISC_INJECT, and add some way for XDP to run
> programs on exceptions, perhaps via an exception map.

Nack for any new features just for e1k.
I don't like where this discussion is going.
I've been hacking xdp support for e1k only to be able to debug
xdp programs in kvm instead of messing with physical hosts where
every bpf program mistake kills the ssh connection.
Please stop this overdesign. I'd rather not have xdp for e1k at all
than go down this path of crazy new action codes and random
punts to the stack. If there is a conflict between stack and xdp, just
drop the packet. e1k is _not_ an example for any other drivers.
Only when a high performance NIC has such tx ring sharing issues
would we need to come up with a solution. Currently that's not
the case, so there is no need to come up with anything but
the simplest approach.
Jesper Dangaard Brouer Sept. 12, 2016, 11:56 a.m. UTC | #16
On Fri, 9 Sep 2016 18:19:56 -0700 Tom Herbert <tom@herbertland.com> wrote:
> On Fri, Sep 9, 2016 at 6:12 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> > On 16-09-09 06:04 PM, Tom Herbert wrote:  
> >> On Fri, Sep 9, 2016 at 5:01 PM, John Fastabend <john.fastabend@gmail.com> wrote:  
> >>> On 16-09-09 04:44 PM, Tom Herbert wrote:  
> >>>> On Fri, Sep 9, 2016 at 2:29 PM, John Fastabend <john.fastabend@gmail.com> wrote:  
> >>>>> e1000 supports a single TX queue so it is being shared with the stack
> >>>>> when XDP runs XDP_TX action. This requires taking the xmit lock to
> >>>>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> >>>>> lock per packet this patch adds a bundling implementation to submit
> >>>>> a bundle of packets to the xmit routine.
> >>>>>
> >>>>> I tested this patch running e1000 in a VM using KVM over a tap
> >>>>> device using pktgen to generate traffic along with 'ping -f -l 100'.
> >>>>>  
> >>>> Hi John,
> >>>>
> >>>> How does this interact with BQL on e1000?
> >>>>
> >>>> Tom
> >>>>  
> >>>
> >>> Let me check if I have the API correct. When we enqueue a packet to
> >>> be sent we must issue a netdev_sent_queue() call and then on actual
> >>> transmission issue a netdev_completed_queue().
> >>>
> >>> The patch attached here missed a few things though.
> >>>
> >>> But it looks like I just need to call netdev_sent_queue() from the
> >>> e1000_xmit_raw_frame() routine and then let the tx completion logic
> >>> kick in which will call netdev_completed_queue() correctly.
> >>>
> >>> I'll need to add a check for the queue state as well. So if I do these
> >>> three things,
> >>>
> >>>         check __QUEUE_STATE_XOFF before sending
> >>>         netdev_sent_queue() -> on XDP_TX
> >>>         netdev_completed_queue()
> >>>
> >>> It should work, agree? Now should we do this even when XDP owns the
> >>> queue? Or is this purely an issue with sharing the queue between
> >>> XDP and stack.
> >>>  
> >> But what is the action for XDP_TX if the queue is stopped? There is no
> >> qdisc to back pressure in the XDP path. Would we just start dropping
> >> packets then?  
> >
> > Yep, that is what the patch does: if there is any sort of error, packets
> > get dropped on the floor. I don't think there is anything else that
> > can be done.

I agree, the only option is to drop the packet. For a DDoS use-case
this is good, because it "switches" XDP into a more efficient mode
(directly recycling pages).

> >  
> That probably means that the stack will always win out under load.

Why would the stack win? Wouldn't XDP_TX win?

> Trying to use the same queue where half of the packets are well
> managed by a qdisc and half aren't is going to leave someone unhappy.
> Maybe in this case where we have to share the queue we can
> allocate the skb on returning XDP_TX and send through the normal
> qdisc for the device.

Hmmm. I'm not sure I like the approach of allocating an SKB and
injecting it into the qdisc.  Most of the performance gain goes out the
window unless we (1) bulk-alloc SKBs, (2) can avoid initializing
the entire SKB, and (3) bulk-enqueue into the qdisc.  It would be an
interesting "tool" for a zoom-in benchmark, which would allow us to
determine the cost/overhead of the network stack between RX and
qdisc-enqueue.
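
For comparison, the naive non-bulked version of that idea might look
something like the sketch below (the helper name and the assumption that
the frame starts at page offset 0 are illustrative); every frame pays for
an skb allocation plus a per-packet qdisc enqueue and lock:

#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Wrap the rx page in an skb and push it through the device's qdisc. */
static int xdp_tx_via_qdisc(struct net_device *netdev,
                            struct page *page, unsigned int len)
{
        struct sk_buff *skb;

        skb = build_skb(page_address(page), PAGE_SIZE);
        if (!skb)
                return -ENOMEM;

        skb_put(skb, len);
        skb->dev = netdev;

        return dev_queue_xmit(skb);     /* per-packet qdisc enqueue */
}
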
Jesper Dangaard Brouer Sept. 12, 2016, 12:17 p.m. UTC | #17
On Fri, 09 Sep 2016 14:29:38 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> e1000 supports a single TX queue so it is being shared with the stack
> when XDP runs XDP_TX action. This requires taking the xmit lock to
> ensure we don't corrupt the tx ring. To avoid taking and dropping the
> lock per packet this patch adds a bundling implementation to submit
> a bundle of packets to the xmit routine.
> 
> I tested this patch running e1000 in a VM using KVM over a tap
> device using pktgen to generate traffic along with 'ping -f -l 100'.
> 
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>

Thank you for actually implementing this! :-)

> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
[...]
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index 91d5c87..b985271 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
>  
[...]
> @@ -3369,33 +3381,52 @@ static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
>  }
>  
>  static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> -				 unsigned int len,
> -				 struct net_device *netdev,
> -				 struct e1000_adapter *adapter)
> +				 __u32 len,
> +				 struct e1000_adapter *adapter,
> +				 struct e1000_tx_ring *tx_ring)
>  {
> -	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> -	struct e1000_hw *hw = &adapter->hw;
> -	struct e1000_tx_ring *tx_ring;
> -
>  	if (len > E1000_MAX_DATA_PER_TXD)
>  		return;
>  
> +	if (E1000_DESC_UNUSED(tx_ring) < 2)
> +		return;
> +
> +	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +}
> +
> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
> +				  struct net_device *netdev,
> +				  struct e1000_adapter *adapter)
> +{
> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +	struct e1000_tx_ring *tx_ring = adapter->tx_ring;
> +	struct e1000_hw *hw = &adapter->hw;
> +	int i = 0;
> +
>  	/* e1000 only support a single txq at the moment so the queue is being
>  	 * shared with stack. To support this requires locking to ensure the
>  	 * stack and XDP are not running at the same time. Devices with
>  	 * multiple queues should allocate a separate queue space.
> +	 *
> +	 * To amortize the locking cost e1000 bundles the xmits and sends as
> +	 * many as possible until either running out of descriptors or failing.
>  	 */
>  	HARD_TX_LOCK(netdev, txq, smp_processor_id());
>  
> -	tx_ring = adapter->tx_ring;
> -
> -	if (E1000_DESC_UNUSED(tx_ring) < 2) {
> -		HARD_TX_UNLOCK(netdev, txq);
> -		return;
> +	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
                                                                       ^^^
> +		e1000_xmit_raw_frame(buffer_info[i].buffer,
> +				     buffer_info[i].length,
> +				     adapter, tx_ring);
> +		buffer_info[i].buffer->rxbuf.page = NULL;
> +		buffer_info[i].buffer = NULL;
> +		buffer_info[i].length = 0;
> +		i++;
                ^^^
Looks like "i" is incremented twice, is that correct?

>  	}
>  
> -	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> -	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +	/* kick hardware to send bundle and return control back to the stack */
> +	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +	mmiowb();
>  
>  	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
>  	mmiowb();
> @@ -4281,9 +4312,10 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>  	struct bpf_prog *prog;
>  	u32 length;
>  	unsigned int i;
> -	int cleaned_count = 0;
> +	int cleaned_count = 0, xdp_xmit = 0;
>  	bool cleaned = false;
>  	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> +	struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
>  
>  	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
>  	prog = READ_ONCE(adapter->prog);
> @@ -4338,13 +4370,13 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>  			case XDP_PASS:
>  				break;
>  			case XDP_TX:
> +				xdp_bundle[xdp_xmit].buffer = buffer_info;
> +				xdp_bundle[xdp_xmit].length = length;
>  				dma_sync_single_for_device(&pdev->dev,
>  							   dma,
>  							   length,
>  							   DMA_TO_DEVICE);
> -				e1000_xmit_raw_frame(buffer_info, length,
> -						     netdev, adapter);
> -				buffer_info->rxbuf.page = NULL;
> +				xdp_xmit++;
>  			case XDP_DROP:
>  			default:
>  				/* re-use mapped page. keep buffer_info->dma
John Fastabend Sept. 12, 2016, 6:11 p.m. UTC | #18
On 16-09-12 05:17 AM, Jesper Dangaard Brouer wrote:
> On Fri, 09 Sep 2016 14:29:38 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
> 
>> e1000 supports a single TX queue so it is being shared with the stack
>> when XDP runs XDP_TX action. This requires taking the xmit lock to
>> ensure we don't corrupt the tx ring. To avoid taking and dropping the
>> lock per packet this patch adds a bundling implementation to submit
>> a bundle of packets to the xmit routine.
>>
>> I tested this patch running e1000 in a VM using KVM over a tap
>> device using pktgen to generate traffic along with 'ping -f -l 100'.
>>
>> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> Thank you for actually implementing this! :-)
> 

Yep, no problem. The effects are minimal on e1000 but should be
noticeable on 10/40/100Gbps NICs.

>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
> [...]


[...]

>> +static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
>> +				  struct net_device *netdev,
>> +				  struct e1000_adapter *adapter)
>> +{
>> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
>> +	struct e1000_tx_ring *tx_ring = adapter->tx_ring;
>> +	struct e1000_hw *hw = &adapter->hw;
>> +	int i = 0;
>> +
>>  	/* e1000 only support a single txq at the moment so the queue is being
>>  	 * shared with stack. To support this requires locking to ensure the
>>  	 * stack and XDP are not running at the same time. Devices with
>>  	 * multiple queues should allocate a separate queue space.
>> +	 *
>> +	 * To amortize the locking cost e1000 bundles the xmits and sends as
>> +	 * many as possible until either running out of descriptors or failing.
>>  	 */
>>  	HARD_TX_LOCK(netdev, txq, smp_processor_id());
>>  
>> -	tx_ring = adapter->tx_ring;
>> -
>> -	if (E1000_DESC_UNUSED(tx_ring) < 2) {
>> -		HARD_TX_UNLOCK(netdev, txq);
>> -		return;
>> +	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
>                                                                        ^^^
>> +		e1000_xmit_raw_frame(buffer_info[i].buffer,
>> +				     buffer_info[i].length,
>> +				     adapter, tx_ring);
>> +		buffer_info[i].buffer->rxbuf.page = NULL;
>> +		buffer_info[i].buffer = NULL;
>> +		buffer_info[i].length = 0;
>> +		i++;
>                 ^^^
> Looks like "i" is incremented twice, is that correct?
> 
>>  	}

Yep, this and a couple of other issues are resolved in v3, which I'll
send out in a moment.
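
Presumably the v3 loop just drops the duplicate increment so that every
bundle slot is actually sent, i.e. something like:

	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
		e1000_xmit_raw_frame(buffer_info[i].buffer,
				     buffer_info[i].length,
				     adapter, tx_ring);
		buffer_info[i].buffer->rxbuf.page = NULL;
		buffer_info[i].buffer = NULL;
		buffer_info[i].length = 0;
	}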

Also in v3 I kept the program in the adapter structure. Moving it into
the ring structure made the code a bit uglier IMO. I agree with the
logic but practically only one program can exist for e1000.
diff mbox

Patch

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index 5cf8a0a..877b377 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -133,6 +133,8 @@  struct e1000_adapter;
 #define E1000_TX_QUEUE_WAKE	16
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
 #define E1000_RX_BUFFER_WRITE	16 /* Must be power of 2 */
+/* How many XDP XMIT buffers to bundle into one xmit transaction */
+#define E1000_XDP_XMIT_BUNDLE_MAX E1000_RX_BUFFER_WRITE
 
 #define AUTO_ALL_MODES		0
 #define E1000_EEPROM_82544_APM	0x0004
@@ -168,6 +170,11 @@  struct e1000_rx_buffer {
 	dma_addr_t dma;
 };
 
+struct e1000_rx_buffer_bundle {
+	struct e1000_rx_buffer *buffer;
+	u32 length;
+};
+
 struct e1000_tx_ring {
 	/* pointer to the descriptor ring memory */
 	void *desc;
@@ -206,6 +213,9 @@  struct e1000_rx_ring {
 	struct e1000_rx_buffer *buffer_info;
 	struct sk_buff *rx_skb_top;
 
+	/* array of XDP buffer information structs */
+	struct e1000_rx_buffer_bundle *xdp_buffer;
+
 	/* cpu for rx queue */
 	int cpu;
 
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 91d5c87..b985271 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -1738,10 +1738,18 @@  static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
 	struct pci_dev *pdev = adapter->pdev;
 	int size, desc_len;
 
+	size = sizeof(struct e1000_rx_buffer_bundle) *
+			E1000_XDP_XMIT_BUNDLE_MAX;
+	rxdr->xdp_buffer = vzalloc(size);
+	if (!rxdr->xdp_buffer)
+		return -ENOMEM;
+
 	size = sizeof(struct e1000_rx_buffer) * rxdr->count;
 	rxdr->buffer_info = vzalloc(size);
-	if (!rxdr->buffer_info)
+	if (!rxdr->buffer_info) {
+		vfree(rxdr->xdp_buffer);
 		return -ENOMEM;
+	}
 
 	desc_len = sizeof(struct e1000_rx_desc);
 
@@ -1754,6 +1762,7 @@  static int e1000_setup_rx_resources(struct e1000_adapter *adapter,
 					GFP_KERNEL);
 	if (!rxdr->desc) {
 setup_rx_desc_die:
+		vfree(rxdr->xdp_buffer);
 		vfree(rxdr->buffer_info);
 		return -ENOMEM;
 	}
@@ -2087,6 +2096,9 @@  static void e1000_free_rx_resources(struct e1000_adapter *adapter,
 
 	e1000_clean_rx_ring(adapter, rx_ring);
 
+	vfree(rx_ring->xdp_buffer);
+	rx_ring->xdp_buffer = NULL;
+
 	vfree(rx_ring->buffer_info);
 	rx_ring->buffer_info = NULL;
 
@@ -3369,33 +3381,52 @@  static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
 }
 
 static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
-				 unsigned int len,
-				 struct net_device *netdev,
-				 struct e1000_adapter *adapter)
+				 __u32 len,
+				 struct e1000_adapter *adapter,
+				 struct e1000_tx_ring *tx_ring)
 {
-	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
-	struct e1000_hw *hw = &adapter->hw;
-	struct e1000_tx_ring *tx_ring;
-
 	if (len > E1000_MAX_DATA_PER_TXD)
 		return;
 
+	if (E1000_DESC_UNUSED(tx_ring) < 2)
+		return;
+
+	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
+	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
+}
+
+static void e1000_xdp_xmit_bundle(struct e1000_rx_buffer_bundle *buffer_info,
+				  struct net_device *netdev,
+				  struct e1000_adapter *adapter)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
+	struct e1000_tx_ring *tx_ring = adapter->tx_ring;
+	struct e1000_hw *hw = &adapter->hw;
+	int i = 0;
+
 	/* e1000 only support a single txq at the moment so the queue is being
 	 * shared with stack. To support this requires locking to ensure the
 	 * stack and XDP are not running at the same time. Devices with
 	 * multiple queues should allocate a separate queue space.
+	 *
+	 * To amortize the locking cost e1000 bundles the xmits and sends as
+	 * many as possible until either running out of descriptors or failing.
 	 */
 	HARD_TX_LOCK(netdev, txq, smp_processor_id());
 
-	tx_ring = adapter->tx_ring;
-
-	if (E1000_DESC_UNUSED(tx_ring) < 2) {
-		HARD_TX_UNLOCK(netdev, txq);
-		return;
+	for (; i < E1000_XDP_XMIT_BUNDLE_MAX && buffer_info[i].buffer; i++) {
+		e1000_xmit_raw_frame(buffer_info[i].buffer,
+				     buffer_info[i].length,
+				     adapter, tx_ring);
+		buffer_info[i].buffer->rxbuf.page = NULL;
+		buffer_info[i].buffer = NULL;
+		buffer_info[i].length = 0;
+		i++;
 	}
 
-	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
-	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
+	/* kick hardware to send bundle and return control back to the stack */
+	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
+	mmiowb();
 
 	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
 	mmiowb();
@@ -4281,9 +4312,10 @@  static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 	struct bpf_prog *prog;
 	u32 length;
 	unsigned int i;
-	int cleaned_count = 0;
+	int cleaned_count = 0, xdp_xmit = 0;
 	bool cleaned = false;
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
+	struct e1000_rx_buffer_bundle *xdp_bundle = rx_ring->xdp_buffer;
 
 	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
 	prog = READ_ONCE(adapter->prog);
@@ -4338,13 +4370,13 @@  static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
 			case XDP_PASS:
 				break;
 			case XDP_TX:
+				xdp_bundle[xdp_xmit].buffer = buffer_info;
+				xdp_bundle[xdp_xmit].length = length;
 				dma_sync_single_for_device(&pdev->dev,
 							   dma,
 							   length,
 							   DMA_TO_DEVICE);
-				e1000_xmit_raw_frame(buffer_info, length,
-						     netdev, adapter);
-				buffer_info->rxbuf.page = NULL;
+				xdp_xmit++;
 			case XDP_DROP:
 			default:
 				/* re-use mapped page. keep buffer_info->dma
@@ -4486,8 +4518,14 @@  next_desc:
 
 		/* return some buffers to hardware, one at a time is too slow */
 		if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) {
+			if (xdp_xmit)
+				e1000_xdp_xmit_bundle(xdp_bundle,
+						      netdev,
+						      adapter);
+
 			adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
 			cleaned_count = 0;
+			xdp_xmit = 0;
 		}
 
 		/* use prefetched values */
@@ -4498,8 +4536,11 @@  next_desc:
 	rx_ring->next_to_clean = i;
 
 	cleaned_count = E1000_DESC_UNUSED(rx_ring);
-	if (cleaned_count)
+	if (cleaned_count) {
+		if (xdp_xmit)
+			e1000_xdp_xmit_bundle(xdp_bundle, netdev, adapter);
 		adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count);
+	}
 
 	adapter->total_rx_packets += total_rx_packets;
 	adapter->total_rx_bytes += total_rx_bytes;