Message ID | 20200902203222.185141-1-anthony.l.nguyen@intel.com |
---|---|
State | Accepted |
Delegated to: | Anthony Nguyen |
Series | [next-queue,v7] igb: add XDP support |
On Wed, Sep 02, 2020 at 01:32:22PM -0700, Tony Nguyen wrote:
> From: Sven Auhagen <Sven.Auhagen@voleatech.de>
>
> Add XDP support to the IGB driver.
> The implementation follows the IXGBE XDP implementation
> closely, and I used the following patches as a basis:
>
> 1. commit 924708081629 ("ixgbe: add XDP support for pass and drop actions")
> 2. commit 33fdc82f0883 ("ixgbe: add support for XDP_TX action")
> 3. commit ed93a3987128 ("ixgbe: tweak page counting for XDP_REDIRECT")
>
> Due to the hardware constraints of the devices using the
> IGB driver, we must share the TX queues with XDP, which
> means locking the TX queue for XDP.
>
> I ran tests on an older device to get better numbers.
> Test machine:
>
> Intel(R) Atom(TM) CPU C2338 @ 1.74GHz (2 Cores)
> 2x Intel I211
>
> Routing, original driver network stack: 382 Kpps
>
> Routing, XDP redirect (xdp_fwd_kern): 1.48 Mpps
> XDP drop: 1.48 Mpps
>
> Using XDP we can achieve line-rate forwarding even on
> an older Intel Atom CPU.
>
> Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

Hello Tony,

thanks for the patch update.
What is the process to get this accepted?

Best
Sven

> ---
> v7:
> * Fix issue with applying to dev-queue branch; utilize net_prefetch()
> * Fix build issue; remove XDP_QUERY_PROG
> * Replace fallthrough comment with fallthrough macro to resolve
>   checkpatch warning
> * Fix reverse Christmas tree
>
> v6:
> * igb_xdp_ring_update_tail changed to static
> * bump to 5.8
>
> v5: resubmission with function names in patch
>
> v4:
> * use HARD_TX_LOCK in XDP xmit
> * do not pass adapter to igb_setup_rx_resources
> * account for timestamp in frame size
>
> v3: igb_xdp_ring_update_tail should be static
>
> v2: original did not apply to my dev-queue branch, so fixed the
>   conflicts in the patch
>
>  drivers/net/ethernet/intel/igb/igb.h         |  81 +++-
>  drivers/net/ethernet/intel/igb/igb_ethtool.c |   4 +
>  drivers/net/ethernet/intel/igb/igb_main.c    | 433 +++++++++++++++++--
>  3 files changed, 482 insertions(+), 36 deletions(-)
>
> [snip: quoted diff trimmed; the full patch is shown below]
> --
> 2.26.2
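For readers who want to exercise the new paths: nothing igb-specific is needed on the BPF side, any XDP object works once this patch is in. A minimal sketch (not part of this patch) that drives the drop/IGB_XDP_CONSUMED path:

/* xdp_drop.c - minimal XDP program; illustrative only, not part of
 * this series. Returns XDP_DROP for every frame.
 * Build: clang -O2 -target bpf -c xdp_drop.c -o xdp_drop.o
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_drop_all(struct xdp_md *ctx)
{
    return XDP_DROP;
}

char _license[] SEC("license") = "GPL";

Attach with, e.g., "ip link set dev eth0 xdp obj xdp_drop.o sec xdp" (the interface name is a placeholder).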
On Mon, 2020-09-21 at 11:58 +0200, Sven Auhagen wrote:
> On Wed, Sep 02, 2020 at 01:32:22PM -0700, Tony Nguyen wrote:
> > From: Sven Auhagen <Sven.Auhagen@voleatech.de>
> >
> > Add XDP support to the IGB driver.
> > [snip]
> >
> > Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
>
> Hello Tony,
>
> thanks for the patch update.
> What is the process to get this accepted?

Hi Sven,

I've been waiting for validation to test this. When I get word that
everything is OK, I'll send the patch to net-next along with other
1Gb changes.

Thanks,
Tony
From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of Sven Auhagen
Sent: Monday, September 21, 2020 3:28 PM
To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>
Cc: intel-wired-lan@lists.osuosl.org
Subject: Re: [Intel-wired-lan] [next-queue v7] igb: add XDP support

On Wed, Sep 02, 2020 at 01:32:22PM -0700, Tony Nguyen wrote:
> From: Sven Auhagen <Sven.Auhagen@voleatech.de>
>
> Add XDP support to the IGB driver.
> The implementation follows the IXGBE XDP implementation closely, and I
> used the following patches as a basis:
>
> [snip]
>
> Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

Hello Tony,

thanks for the patch update.
What is the process to get this accepted?

Best
Sven

> ---
> [snip: changelog trimmed; see the original posting above]
>
>  drivers/net/ethernet/intel/igb/igb.h         |  81 +++-
>  drivers/net/ethernet/intel/igb/igb_ethtool.c |   4 +
>  drivers/net/ethernet/intel/igb/igb_main.c    | 433 +++++++++++++++++--
>  3 files changed, 482 insertions(+), 36 deletions(-)

Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com>
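The redirect numbers quoted in the commit message come from the xdp_fwd_kern kernel sample, which routes via bpf_fib_lookup(). A much-reduced sketch in the same spirit is below (illustrative only): it redirects every frame to a single egress port held in slot 0 of a DEVMAP, which userspace must populate with the egress ifindex; a real forwarder would consult the FIB instead.

/* xdp_redirect_map.c - simplified stand-in for xdp_fwd_kern.
 * Build: clang -O2 -target bpf -c xdp_redirect_map.c -o xdp_redirect_map.o
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_DEVMAP);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, __u32);
} tx_port SEC(".maps");

SEC("xdp")
int xdp_redirect_slot0(struct xdp_md *ctx)
{
    /* XDP_REDIRECT verdict; in this series igb_run_xdp() turns it
     * into xdp_do_redirect(), and igb_clean_rx_irq() flushes with
     * xdp_do_flush_map() at the end of the poll loop.
     */
    return bpf_redirect_map(&tx_port, 0, 0);
}

char _license[] SEC("license") = "GPL";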
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 2f015b60a995..fa6ff1a64fc0 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -19,6 +19,8 @@
 #include <linux/pci.h>
 #include <linux/mdio.h>

+#include <net/xdp.h>
+
 struct igb_adapter;

 #define E1000_PCS_CFG_IGN_SD	1
@@ -79,6 +81,12 @@ struct igb_adapter;
 #define IGB_I210_RX_LATENCY_100		2213
 #define IGB_I210_RX_LATENCY_1000	448

+/* XDP */
+#define IGB_XDP_PASS		0
+#define IGB_XDP_CONSUMED	BIT(0)
+#define IGB_XDP_TX		BIT(1)
+#define IGB_XDP_REDIR		BIT(2)
+
 struct vf_data_storage {
 	unsigned char vf_mac_addresses[ETH_ALEN];
 	u16 vf_mc_hashes[IGB_MAX_VF_MC_ENTRIES];
@@ -132,17 +140,63 @@ struct vf_mac_filter {

 /* Supported Rx Buffer Sizes */
 #define IGB_RXBUFFER_256	256
+#define IGB_RXBUFFER_1536	1536
 #define IGB_RXBUFFER_2048	2048
 #define IGB_RXBUFFER_3072	3072
 #define IGB_RX_HDR_LEN		IGB_RXBUFFER_256
 #define IGB_TS_HDR_LEN		16

-#define IGB_SKB_PAD		(NET_SKB_PAD + NET_IP_ALIGN)
+/* Attempt to maximize the headroom available for incoming frames. We
+ * use a 2K buffer for receives and need 1536/1534 to store the data for
+ * the frame. This leaves us with 512 bytes of room. From that we need
+ * to deduct the space needed for the shared info and the padding needed
+ * to IP align the frame.
+ *
+ * Note: For cache line sizes 256 or larger this value is going to end
+ * up negative. In these cases we should fall back to the 3K
+ * buffers.
+ */
 #if (PAGE_SIZE < 8192)
-#define IGB_MAX_FRAME_BUILD_SKB \
-	(SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048) - IGB_SKB_PAD - IGB_TS_HDR_LEN)
+#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_1536 - NET_IP_ALIGN)
+#define IGB_2K_TOO_SMALL_WITH_PADDING \
+((NET_SKB_PAD + IGB_TS_HDR_LEN + IGB_RXBUFFER_1536) > \
+SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048))
+
+static inline int igb_compute_pad(int rx_buf_len)
+{
+	int page_size, pad_size;
+
+	page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2);
+	pad_size = SKB_WITH_OVERHEAD(page_size) - rx_buf_len;
+
+	return pad_size;
+}
+
+static inline int igb_skb_pad(void)
+{
+	int rx_buf_len;
+
+	/* If a 2K buffer cannot handle a standard Ethernet frame then
+	 * optimize padding for a 3K buffer instead of a 1.5K buffer.
+	 *
+	 * For a 3K buffer we need to add enough padding to allow for
+	 * tailroom due to NET_IP_ALIGN possibly shifting us out of
+	 * cache-line alignment.
+	 */
+	if (IGB_2K_TOO_SMALL_WITH_PADDING)
+		rx_buf_len = IGB_RXBUFFER_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN);
+	else
+		rx_buf_len = IGB_RXBUFFER_1536;
+
+	/* if needed make room for NET_IP_ALIGN */
+	rx_buf_len -= NET_IP_ALIGN;
+
+	return igb_compute_pad(rx_buf_len);
+}
+
+#define IGB_SKB_PAD	igb_skb_pad()
 #else
-#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_2048 - IGB_TS_HDR_LEN)
+#define IGB_SKB_PAD	(NET_SKB_PAD + NET_IP_ALIGN)
 #endif
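The headroom arithmetic above is easier to check with concrete numbers. The user-space sketch below mirrors igb_compute_pad(); the constants are assumptions for a common x86_64 configuration (PAGE_SIZE 4096, 64-byte cache lines, skb_shared_info overhead rounding to 320 bytes), not values taken from this patch:

#include <stdio.h>

/* Assumed constants for a typical x86_64 config; illustrative only. */
#define PAGE_SIZE        4096
#define SHINFO_OVERHEAD  320  /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */

#define ALIGN(x, a)           (((x) + (a) - 1) & ~((a) - 1))
#define SKB_WITH_OVERHEAD(x)  ((x) - SHINFO_OVERHEAD)

/* Mirrors igb_compute_pad(): headroom left in a half-page buffer once
 * the frame data and the shared info have been accounted for. */
static int igb_compute_pad(int rx_buf_len)
{
    int page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2);

    return SKB_WITH_OVERHEAD(page_size) - rx_buf_len;
}

int main(void)
{
    /* IGB_RXBUFFER_1536 - NET_IP_ALIGN(2) = 1534, as in igb_skb_pad() */
    printf("pad for 1534-byte buffer: %d\n", igb_compute_pad(1534));
    return 0;
}

Under those assumptions igb_compute_pad(1534) yields 194 bytes of headroom in a 2K buffer.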

 /* How many Rx Buffers do we bundle into one write to the hardware ? */
@@ -194,13 +248,22 @@ enum igb_tx_flags {
 #define IGB_SFF_ADDRESSING_MODE	0x4
 #define IGB_SFF_8472_UNSUP		0x00

+enum igb_tx_buf_type {
+	IGB_TYPE_SKB = 0,
+	IGB_TYPE_XDP,
+};
+
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer
  */
 struct igb_tx_buffer {
 	union e1000_adv_tx_desc *next_to_watch;
 	unsigned long time_stamp;
-	struct sk_buff *skb;
+	enum igb_tx_buf_type type;
+	union {
+		struct sk_buff *skb;
+		struct xdp_frame *xdpf;
+	};
 	unsigned int bytecount;
 	u16 gso_segs;
 	__be16 protocol;
@@ -248,6 +311,7 @@ struct igb_ring_container {
 struct igb_ring {
 	struct igb_q_vector *q_vector;	/* backlink to q_vector */
 	struct net_device *netdev;	/* back pointer to net_device */
+	struct bpf_prog *xdp_prog;
 	struct device *dev;		/* device pointer for dma mapping */
 	union {				/* array of buffer info structs */
 		struct igb_tx_buffer *tx_buffer_info;
@@ -288,6 +352,7 @@ struct igb_ring {
 			struct u64_stats_sync rx_syncp;
 		};
 	};
+	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_internodealigned_in_smp;

 struct igb_q_vector {
@@ -339,7 +404,7 @@ static inline unsigned int igb_rx_bufsz(struct igb_ring *ring)
 		return IGB_RXBUFFER_3072;

 	if (ring_uses_build_skb(ring))
-		return IGB_MAX_FRAME_BUILD_SKB + IGB_TS_HDR_LEN;
+		return IGB_MAX_FRAME_BUILD_SKB;
 #endif
 	return IGB_RXBUFFER_2048;
 }
@@ -467,6 +532,7 @@ struct igb_adapter {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];

 	struct net_device *netdev;
+	struct bpf_prog *xdp_prog;

 	unsigned long state;
 	unsigned int flags;
@@ -643,6 +709,9 @@ enum igb_boards {

 extern char igb_driver_name[];

+int igb_xmit_xdp_ring(struct igb_adapter *adapter,
+		      struct igb_ring *ring,
+		      struct xdp_frame *xdpf);
 int igb_open(struct net_device *netdev);
 int igb_close(struct net_device *netdev);
 int igb_up(struct igb_adapter *);
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 6e8231c1ddf0..28baf203459a 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -961,6 +961,10 @@ static int igb_set_ringparam(struct net_device *netdev,
 			memcpy(&temp_ring[i], adapter->rx_ring[i],
 			       sizeof(struct igb_ring));

+			/* Clear copied XDP RX-queue info */
+			memset(&temp_ring[i].xdp_rxq, 0,
+			       sizeof(temp_ring[i].xdp_rxq));
+
 			temp_ring[i].count = new_rx_count;
 			err = igb_setup_rx_resources(&temp_ring[i]);
 			if (err) {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 698bb6a4b088..73635a012f4a 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -30,6 +30,8 @@
 #include <linux/if_ether.h>
 #include <linux/aer.h>
 #include <linux/prefetch.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/pm_runtime.h>
 #include <linux/etherdevice.h>
 #ifdef CONFIG_IGB_DCA
@@ -2825,6 +2827,147 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	}
 }

+static int igb_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
+{
+	int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+	struct igb_adapter *adapter = netdev_priv(dev);
+	bool running = netif_running(dev);
+	struct bpf_prog *old_prog;
+	bool need_reset;
+
+	/* verify igb ring attributes are sufficient for XDP */
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		struct igb_ring *ring = adapter->rx_ring[i];
+
+		if (frame_size > igb_rx_bufsz(ring))
+			return -EINVAL;
+	}
+
+	old_prog = xchg(&adapter->xdp_prog, prog);
+	need_reset = (!!prog != !!old_prog);
+
+	/* device is up and bpf is added/removed, must setup the RX queues */
+	if (need_reset && running) {
+		igb_close(dev);
+	} else {
+		for (i = 0; i < adapter->num_rx_queues; i++)
+			(void)xchg(&adapter->rx_ring[i]->xdp_prog,
+				   adapter->xdp_prog);
+	}
+
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	/* bpf is just replaced, RXQ and MTU are already setup */
+	if (!need_reset)
+		return 0;
+
+	if (running)
+		igb_open(dev);
+
+	return 0;
+}
+
+static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return igb_xdp_setup(dev, xdp->prog);
+	default:
+		return -EINVAL;
+	}
+}
+
+static void igb_xdp_ring_update_tail(struct igb_ring *ring)
+{
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
+static inline struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter)
+{
+	unsigned int r_idx = smp_processor_id();
+
+	if (r_idx >= adapter->num_tx_queues)
+		r_idx = r_idx % adapter->num_tx_queues;
+
+	return adapter->tx_ring[r_idx];
+}
+
+static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp)
+{
+	struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
+	int cpu = smp_processor_id();
+	struct igb_ring *tx_ring;
+	struct netdev_queue *nq;
+	u32 ret;
+
+	if (unlikely(!xdpf))
+		return IGB_XDP_CONSUMED;
+
+	/* During program transitions it's possible adapter->xdp_prog is assigned
+	 * but ring has not been configured yet. In this case simply abort xmit.
+	 */
+	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
+	if (unlikely(!tx_ring))
+		return -ENXIO;
+
+	nq = txring_txq(tx_ring);
+	__netif_tx_lock(nq, cpu);
+	ret = igb_xmit_xdp_ring(adapter, tx_ring, xdpf);
+	__netif_tx_unlock(nq);
+
+	return ret;
+}
+
+static int igb_xdp_xmit(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags)
+{
+	struct igb_adapter *adapter = netdev_priv(dev);
+	int cpu = smp_processor_id();
+	struct igb_ring *tx_ring;
+	struct netdev_queue *nq;
+	int drops = 0;
+	int i;
+
+	if (unlikely(test_bit(__IGB_DOWN, &adapter->state)))
+		return -ENETDOWN;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	/* During program transitions it's possible adapter->xdp_prog is assigned
+	 * but ring has not been configured yet. In this case simply abort xmit.
+	 */
+	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
+	if (unlikely(!tx_ring))
+		return -ENXIO;
+
+	nq = txring_txq(tx_ring);
+	__netif_tx_lock(nq, cpu);
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
+
+		err = igb_xmit_xdp_ring(adapter, tx_ring, xdpf);
+		if (err != IGB_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	__netif_tx_unlock(nq);
+
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		igb_xdp_ring_update_tail(tx_ring);
+
+	return n - drops;
+}
+
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
@@ -2849,6 +2992,8 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_fdb_add		= igb_ndo_fdb_add,
 	.ndo_features_check	= igb_features_check,
 	.ndo_setup_tc		= igb_setup_tc,
+	.ndo_bpf		= igb_xdp,
+	.ndo_xdp_xmit		= igb_xdp_xmit,
 };

 /**
@@ -4179,6 +4324,7 @@ static void igb_configure_tx(struct igb_adapter *adapter)
 **/
 int igb_setup_rx_resources(struct igb_ring *rx_ring)
 {
+	struct igb_adapter *adapter = netdev_priv(rx_ring->netdev);
 	struct device *dev = rx_ring->dev;
 	int size;

@@ -4201,6 +4347,13 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;

+	rx_ring->xdp_prog = adapter->xdp_prog;
+
+	/* XDP RX-queue info */
+	if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+			     rx_ring->queue_index) < 0)
+		goto err;
+
 	return 0;

 err:
@@ -4505,6 +4658,10 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
 	int reg_idx = ring->reg_idx;
 	u32 rxdctl = 0;

+	xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
+	WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+					   MEM_TYPE_PAGE_SHARED, NULL));
+
 	/* disable the queue */
 	wr32(E1000_RXDCTL(reg_idx), 0);

@@ -4709,6 +4866,8 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
 {
 	igb_clean_rx_ring(rx_ring);

+	rx_ring->xdp_prog = NULL;
+	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	vfree(rx_ring->rx_buffer_info);
 	rx_ring->rx_buffer_info = NULL;

@@ -6078,6 +6237,80 @@ static int igb_tx_map(struct igb_ring *tx_ring,
 	return -1;
 }

+int igb_xmit_xdp_ring(struct igb_adapter *adapter,
+		      struct igb_ring *tx_ring,
+		      struct xdp_frame *xdpf)
+{
+	union e1000_adv_tx_desc *tx_desc;
+	u32 len, cmd_type, olinfo_status;
+	struct igb_tx_buffer *tx_buffer;
+	dma_addr_t dma;
+	u16 i;
+
+	len = xdpf->len;
+
+	if (unlikely(!igb_desc_unused(tx_ring)))
+		return IGB_XDP_CONSUMED;
+
+	dma = dma_map_single(tx_ring->dev, xdpf->data, len, DMA_TO_DEVICE);
+	if (dma_mapping_error(tx_ring->dev, dma))
+		return IGB_XDP_CONSUMED;
+
+	/* record the location of the first descriptor for this packet */
+	tx_buffer = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+	tx_buffer->bytecount = len;
+	tx_buffer->gso_segs = 1;
+	tx_buffer->protocol = 0;
+
+	i = tx_ring->next_to_use;
+	tx_desc = IGB_TX_DESC(tx_ring, i);
+
+	dma_unmap_len_set(tx_buffer, len, len);
+	dma_unmap_addr_set(tx_buffer, dma, dma);
+	tx_buffer->type = IGB_TYPE_XDP;
+	tx_buffer->xdpf = xdpf;
+
+	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
+	/* put descriptor type bits */
+	cmd_type = E1000_ADVTXD_DTYP_DATA |
+		   E1000_ADVTXD_DCMD_DEXT |
+		   E1000_ADVTXD_DCMD_IFCS;
+	cmd_type |= len | IGB_TXD_DCMD;
+	tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
+
+	olinfo_status = cpu_to_le32(len << E1000_ADVTXD_PAYLEN_SHIFT);
+	/* 82575 requires a unique index per ring */
+	if (test_bit(IGB_RING_FLAG_TX_CTX_IDX, &tx_ring->flags))
+		olinfo_status |= tx_ring->reg_idx << 4;
+
+	tx_desc->read.olinfo_status = olinfo_status;
+
+	netdev_tx_sent_queue(txring_txq(tx_ring), tx_buffer->bytecount);
+
+	/* set the timestamp */
+	tx_buffer->time_stamp = jiffies;
+
+	/* Avoid any potential race with xdp_xmit and cleanup */
+	smp_wmb();
+
+	/* set next_to_watch value indicating a packet is present */
+	i++;
+	if (i == tx_ring->count)
+		i = 0;
+
+	tx_buffer->next_to_watch = tx_desc;
+	tx_ring->next_to_use = i;
+
+	/* Make sure there is space in the ring for the next send. */
+	igb_maybe_stop_tx(tx_ring, DESC_NEEDED);
+
+	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more())
+		writel(i, tx_ring->tail);
+
+	return IGB_XDP_TX;
+}
+
 netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 				struct igb_ring *tx_ring)
 {
@@ -6106,6 +6339,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,

 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+	first->type = IGB_TYPE_SKB;
 	first->skb = skb;
 	first->bytecount = skb->len;
 	first->gso_segs = 1;
@@ -6257,6 +6491,19 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;

+	if (adapter->xdp_prog) {
+		int i;
+
+		for (i = 0; i < adapter->num_rx_queues; i++) {
+			struct igb_ring *ring = adapter->rx_ring[i];
+
+			if (max_frame > igb_rx_bufsz(ring)) {
+				netdev_warn(adapter->netdev, "Requested MTU size is not supported with XDP\n");
+				return -EINVAL;
+			}
+		}
+	}
+
 	/* adjust max frame to be at least the size of a standard frame */
 	if (max_frame < (ETH_FRAME_LEN + ETH_FCS_LEN))
 		max_frame = ETH_FRAME_LEN + ETH_FCS_LEN;
@@ -7810,7 +8057,10 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 		total_packets += tx_buffer->gso_segs;

 		/* free the skb */
-		napi_consume_skb(tx_buffer->skb, napi_budget);
+		if (tx_buffer->type == IGB_TYPE_SKB)
+			napi_consume_skb(tx_buffer->skb, napi_budget);
+		else
+			xdp_return_frame(tx_buffer->xdpf);

 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -7994,8 +8244,8 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
 	 * the pagecnt_bias and page count so that we fully restock the
 	 * number of references the driver holds.
 	 */
-	if (unlikely(!pagecnt_bias)) {
-		page_ref_add(page, USHRT_MAX);
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX - 1);
 		rx_buffer->pagecnt_bias = USHRT_MAX;
 	}

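Together with the igb_alloc_mapped_page() hunk at the end of this patch (bias initialized to USHRT_MAX after page_ref_add(page, USHRT_MAX - 1)), the hunk above batches page references so the hot path never touches the atomic page refcount, and it now restocks while one driver-owned reference still remains, which appears to be the safer condition once frames can leave the driver via XDP_TX/XDP_REDIRECT, following the ixgbe commit ed93a3987128 cited in the commit message. A user-space model of the bookkeeping, with plain integers in place of struct page and the stack's later put_page() calls omitted (illustrative only):

#include <assert.h>
#include <limits.h>

/* page_refs models the real (atomic, expensive) refcount; pagecnt_bias
 * counts how many of those references the driver still owns. */
static long page_refs;
static unsigned short pagecnt_bias;

static void alloc_mapped_page(void)
{
    page_refs = 1;               /* reference from page allocation */
    page_refs += USHRT_MAX - 1;  /* page_ref_add() batch, done once */
    pagecnt_bias = USHRT_MAX;
}

static void give_buffer_to_stack_or_xdp(void)
{
    pagecnt_bias--;              /* hands off one owned ref, no atomic op */
}

static void reuse_rx_page(void)
{
    if (pagecnt_bias == 1) {     /* batch nearly exhausted: restock */
        page_refs += USHRT_MAX - 1;
        pagecnt_bias = USHRT_MAX;
    }
}

int main(void)
{
    alloc_mapped_page();
    for (int i = 0; i < 200000; i++) {
        give_buffer_to_stack_or_xdp();
        reuse_rx_page();
    }
    /* the driver's owned references never reach zero */
    assert(pagecnt_bias >= 1 && page_refs > 0);
    return 0;
}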
@@ -8034,20 +8284,21 @@ static void igb_add_rx_frag(struct igb_ring *rx_ring,

 static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
 					 struct igb_rx_buffer *rx_buffer,
-					 union e1000_adv_rx_desc *rx_desc,
-					 unsigned int size)
+					 struct xdp_buff *xdp,
+					 union e1000_adv_rx_desc *rx_desc)
 {
-	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = SKB_DATA_ALIGN(size);
+	unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end -
+					       xdp->data_hard_start);
 #endif
+	unsigned int size = xdp->data_end - xdp->data;
 	unsigned int headlen;
 	struct sk_buff *skb;

 	/* prefetch first cache line of first page */
-	net_prefetch(va);
+	net_prefetch(xdp->data);

 	/* allocate a skb to store the frags */
 	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IGB_RX_HDR_LEN);
@@ -8055,24 +8306,24 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
 		return NULL;

 	if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
-		igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb);
-		va += IGB_TS_HDR_LEN;
+		igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
+		xdp->data += IGB_TS_HDR_LEN;
 		size -= IGB_TS_HDR_LEN;
 	}

 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IGB_RX_HDR_LEN)
-		headlen = eth_get_headlen(skb->dev, va, IGB_RX_HDR_LEN);
+		headlen = eth_get_headlen(skb->dev, xdp->data, IGB_RX_HDR_LEN);

 	/* align pull length to size of long to optimize memcpy performance */
-	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
+	memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, sizeof(long)));

 	/* update all of the pointers */
 	size -= headlen;
 	if (size) {
 		skb_add_rx_frag(skb, 0, rx_buffer->page,
-				(va + headlen) - page_address(rx_buffer->page),
+				(xdp->data + headlen) - page_address(rx_buffer->page),
 				size, truesize);
 #if (PAGE_SIZE < 8192)
 		rx_buffer->page_offset ^= truesize;
@@ -8088,29 +8339,29 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,

 static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
 				     struct igb_rx_buffer *rx_buffer,
-				     union e1000_adv_rx_desc *rx_desc,
-				     unsigned int size)
+				     struct xdp_buff *xdp,
+				     union e1000_adv_rx_desc *rx_desc)
 {
-	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-				SKB_DATA_ALIGN(IGB_SKB_PAD + size);
+				SKB_DATA_ALIGN(xdp->data_end -
+					       xdp->data_hard_start);
 #endif
 	struct sk_buff *skb;

 	/* prefetch first cache line of first page */
-	net_prefetch(va);
+	net_prefetch(xdp->data_meta);

 	/* build an skb around the page buffer */
-	skb = build_skb(va - IGB_SKB_PAD, truesize);
+	skb = build_skb(xdp->data_hard_start, truesize);
 	if (unlikely(!skb))
 		return NULL;

 	/* update pointers within the skb to store the data */
-	skb_reserve(skb, IGB_SKB_PAD);
-	__skb_put(skb, size);
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	__skb_put(skb, xdp->data_end - xdp->data);

 	/* pull timestamp out of packet data */
 	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
@@ -8128,6 +8379,79 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
 	return skb;
 }

+static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter,
+				   struct igb_ring *rx_ring,
+				   struct xdp_buff *xdp)
+{
+	int err, result = IGB_XDP_PASS;
+	struct bpf_prog *xdp_prog;
+	u32 act;
+
+	rcu_read_lock();
+	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+	if (!xdp_prog)
+		goto xdp_out;
+
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
+	act = bpf_prog_run_xdp(xdp_prog, xdp);
+	switch (act) {
+	case XDP_PASS:
+		break;
+	case XDP_TX:
+		result = igb_xdp_xmit_back(adapter, xdp);
+		break;
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
+		if (!err)
+			result = IGB_XDP_REDIR;
+		else
+			result = IGB_XDP_CONSUMED;
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+		fallthrough;
+	case XDP_DROP:
+		result = IGB_XDP_CONSUMED;
+		break;
+	}
+xdp_out:
+	rcu_read_unlock();
+	return ERR_PTR(-result);
+}
+
+static unsigned int igb_rx_frame_truesize(struct igb_ring *rx_ring,
+					  unsigned int size)
+{
+	unsigned int truesize;
+
+#if (PAGE_SIZE < 8192)
+	truesize = igb_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
+#else
+	truesize = ring_uses_build_skb(rx_ring) ?
+		SKB_DATA_ALIGN(IGB_SKB_PAD + size) +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+		SKB_DATA_ALIGN(size);
+#endif
+	return truesize;
+}
+
+static void igb_rx_buffer_flip(struct igb_ring *rx_ring,
+			       struct igb_rx_buffer *rx_buffer,
+			       unsigned int size)
+{
+	unsigned int truesize = igb_rx_frame_truesize(rx_ring, size);
+#if (PAGE_SIZE < 8192)
+	rx_buffer->page_offset ^= truesize;
+#else
+	rx_buffer->page_offset += truesize;
+#endif
+}
+
 static inline void igb_rx_checksum(struct igb_ring *ring,
 				   union e1000_adv_rx_desc *rx_desc,
 				   struct sk_buff *skb)
@@ -8224,6 +8548,10 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring,
 				union e1000_adv_rx_desc *rx_desc,
 				struct sk_buff *skb)
 {
+	/* XDP packets use error pointer so abort at this point */
+	if (IS_ERR(skb))
+		return true;
+
 	if (unlikely((igb_test_staterr(rx_desc,
 				       E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) {
 		struct net_device *netdev = rx_ring->netdev;
@@ -8282,6 +8610,11 @@ static void igb_process_skb_fields(struct igb_ring *rx_ring,
 	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
 }

+static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring)
+{
+	return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0;
+}
+
 static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
 					       const unsigned int size)
 {
@@ -8325,10 +8658,20 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring,

 static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 {
+	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *rx_ring = q_vector->rx.ring;
 	struct sk_buff *skb = rx_ring->skb;
 	unsigned int total_bytes = 0, total_packets = 0;
 	u16 cleaned_count = igb_desc_unused(rx_ring);
+	unsigned int xdp_xmit = 0;
+	struct xdp_buff xdp;
+
+	xdp.rxq = &rx_ring->xdp_rxq;
+
+	/* Frame size depends on rx_ring setup when PAGE_SIZE=4K */
+#if (PAGE_SIZE < 8192)
+	xdp.frame_sz = igb_rx_frame_truesize(rx_ring, 0);
+#endif

 	while (likely(total_packets < budget)) {
 		union e1000_adv_rx_desc *rx_desc;
@@ -8355,13 +8698,38 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 		rx_buffer = igb_get_rx_buffer(rx_ring, size);

 		/* retrieve a buffer from the ring */
-		if (skb)
+		if (!skb) {
+			xdp.data = page_address(rx_buffer->page) +
+				   rx_buffer->page_offset;
+			xdp.data_meta = xdp.data;
+			xdp.data_hard_start = xdp.data -
+					      igb_rx_offset(rx_ring);
+			xdp.data_end = xdp.data + size;
+#if (PAGE_SIZE > 4096)
+			/* At larger PAGE_SIZE, frame_sz depends on frame size */
+			xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size);
+#endif
+			skb = igb_run_xdp(adapter, rx_ring, &xdp);
+		}
+
+		if (IS_ERR(skb)) {
+			unsigned int xdp_res = -PTR_ERR(skb);
+
+			if (xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR)) {
+				xdp_xmit |= xdp_res;
+				igb_rx_buffer_flip(rx_ring, rx_buffer, size);
+			} else {
+				rx_buffer->pagecnt_bias++;
+			}
+			total_packets++;
+			total_bytes += size;
+		} else if (skb)
 			igb_add_rx_frag(rx_ring, rx_buffer, skb, size);
 		else if (ring_uses_build_skb(rx_ring))
-			skb = igb_build_skb(rx_ring, rx_buffer, rx_desc, size);
+			skb = igb_build_skb(rx_ring, rx_buffer, &xdp, rx_desc);
 		else
 			skb = igb_construct_skb(rx_ring, rx_buffer,
-						rx_desc, size);
+						&xdp, rx_desc);

 		/* exit if we failed to retrieve a buffer */
 		if (!skb) {
@@ -8401,6 +8769,15 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	/* place incomplete frames back on ring for completion */
 	rx_ring->skb = skb;

+	if (xdp_xmit & IGB_XDP_REDIR)
+		xdp_do_flush_map();
+
+	if (xdp_xmit & IGB_XDP_TX) {
+		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
+
+		igb_xdp_ring_update_tail(tx_ring);
+	}
+
 	u64_stats_update_begin(&rx_ring->rx_syncp);
 	rx_ring->rx_stats.packets += total_packets;
 	rx_ring->rx_stats.bytes += total_bytes;
@@ -8414,11 +8791,6 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	return total_packets;
 }

-static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring)
-{
-	return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0;
-}
-
 static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 				  struct igb_rx_buffer *bi)
 {
@@ -8455,7 +8827,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 	bi->dma = dma;
 	bi->page = page;
 	bi->page_offset = igb_rx_offset(rx_ring);
-	bi->pagecnt_bias = 1;
+	page_ref_add(page, USHRT_MAX - 1);
+	bi->pagecnt_bias = USHRT_MAX;

 	return true;
 }
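To try the series end to end, an XDP object like the earlier sketches can be attached with iproute2 ("ip link set dev eth0 xdp obj xdp_drop.o sec xdp") or programmatically. A minimal loader sketch using libbpf follows; it assumes libbpf 0.8 or later for bpf_xdp_attach(), and the interface name and object path are placeholders:

/* load_xdp.c - minimal libbpf loader; illustrative only.
 * Build: cc load_xdp.c -o load_xdp -lbpf
 */
#include <stdio.h>
#include <net/if.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h>

int main(int argc, char **argv)
{
    const char *ifname = argc > 1 ? argv[1] : "eth0"; /* placeholder */
    struct bpf_object *obj;
    struct bpf_program *prog;
    int ifindex;

    ifindex = if_nametoindex(ifname);
    if (!ifindex) {
        perror("if_nametoindex");
        return 1;
    }

    obj = bpf_object__open_file("xdp_drop.o", NULL);
    if (!obj || bpf_object__load(obj)) {
        fprintf(stderr, "failed to open/load xdp_drop.o\n");
        return 1;
    }

    prog = bpf_object__next_program(obj, NULL);
    if (!prog)
        return 1;

    /* XDP_FLAGS_DRV_MODE requests native driver XDP (the mode this
     * igb patch implements) rather than generic skb-mode XDP.
     */
    if (bpf_xdp_attach(ifindex, bpf_program__fd(prog),
                       XDP_FLAGS_DRV_MODE, NULL)) {
        fprintf(stderr, "attach failed\n");
        return 1;
    }

    printf("attached to %s\n", ifname);
    return 0;
}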