diff mbox series

[bpf-next,v5,5/6] net: pass net argument to the eth_get_headlen

Message ID 20190415173801.257254-6-sdf@google.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series net: flow_dissector: trigger BPF hook when called from eth_get_headlen | expand

Commit Message

Stanislav Fomichev April 15, 2019, 5:38 p.m. UTC
Update all users eth_get_headlen to pass network namespace
and pass it down to the flow dissector. This commit is a noop
until administrator inserts BPF flow dissector program.

Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: intel-wired-lan@lists.osuosl.org
Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
Cc: Salil Mehta <salil.mehta@huawei.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Cc: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c  | 3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c     | 3 ++-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c   | 3 ++-
 drivers/net/ethernet/intel/fm10k/fm10k_main.c     | 2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c       | 3 ++-
 drivers/net/ethernet/intel/iavf/iavf_txrx.c       | 3 ++-
 drivers/net/ethernet/intel/ice/ice_txrx.c         | 3 ++-
 drivers/net/ethernet/intel/igb/igb_main.c         | 3 ++-
 drivers/net/ethernet/intel/igc/igc_main.c         | 3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 2 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 3 ++-
 drivers/net/tun.c                                 | 3 ++-
 include/linux/etherdevice.h                       | 2 +-
 net/ethernet/eth.c                                | 5 +++--
 16 files changed, 29 insertions(+), 17 deletions(-)

Comments

Alexei Starovoitov April 19, 2019, 12:28 a.m. UTC | #1
On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> Update all users eth_get_headlen to pass network namespace
> and pass it down to the flow dissector. This commit is a noop
> until administrator inserts BPF flow dissector program.
> 
> Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> Cc: Saeed Mahameed <saeedm@mellanox.com>
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Cc: intel-wired-lan@lists.osuosl.org
> Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> Cc: Salil Mehta <salil.mehta@huawei.com>
> Cc: Michael Chan <michael.chan@broadcom.com>
> Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  drivers/net/ethernet/aquantia/atlantic/aq_ring.c  | 3 ++-
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 2 +-
>  drivers/net/ethernet/hisilicon/hns/hns_enet.c     | 3 ++-
>  drivers/net/ethernet/hisilicon/hns3/hns3_enet.c   | 3 ++-
>  drivers/net/ethernet/intel/fm10k/fm10k_main.c     | 2 +-
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c       | 3 ++-
>  drivers/net/ethernet/intel/iavf/iavf_txrx.c       | 3 ++-
>  drivers/net/ethernet/intel/ice/ice_txrx.c         | 3 ++-
>  drivers/net/ethernet/intel/igb/igb_main.c         | 3 ++-
>  drivers/net/ethernet/intel/igc/igc_main.c         | 3 ++-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 2 +-
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 ++-
>  drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 3 ++-
>  drivers/net/tun.c                                 | 3 ++-
>  include/linux/etherdevice.h                       | 2 +-
>  net/ethernet/eth.c                                | 5 +++--
>  16 files changed, 29 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> index c64e2fb5a4f1..1b3181f757b7 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> @@ -354,7 +354,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
>  
>  			hdr_len = buff->len;
>  			if (hdr_len > AQ_CFG_RX_HDR_SIZE)
> -				hdr_len = eth_get_headlen(aq_buf_vaddr(&buff->rxdata),
> +				hdr_len = eth_get_headlen(dev_net(skb->dev),
> +							  aq_buf_vaddr(&buff->rxdata),
>  							  AQ_CFG_RX_HDR_SIZE);
>  
>  			memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 6528a597367b..8bb5f708ccc6 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -899,7 +899,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp,
>  			     DMA_ATTR_WEAK_ORDERING);
>  
>  	if (unlikely(!payload))
> -		payload = eth_get_headlen(data_ptr, len);
> +		payload = eth_get_headlen(dev_net(bp->dev), data_ptr, len);
>  
>  	skb = napi_alloc_skb(&rxr->bnapi->napi, payload);
>  	if (!skb) {
> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
> index 297b95c1b3c1..f1ecc78d2323 100644
> --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
> +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
> @@ -598,7 +598,8 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
>  	} else {
>  		ring->stats.seg_pkt_cnt++;
>  
> -		pull_len = eth_get_headlen(va, HNS_RX_HEAD_SIZE);
> +		pull_len = eth_get_headlen(dev_net(ndev), va,
> +					   HNS_RX_HEAD_SIZE);
>  		memcpy(__skb_put(skb, pull_len), va,
>  		       ALIGN(pull_len, sizeof(long)));
>  
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> index b53b0911ec24..423d9ce0f6f8 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> @@ -2457,7 +2457,8 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, int length,
>  	ring->stats.seg_pkt_cnt++;
>  	u64_stats_update_end(&ring->syncp);
>  
> -	ring->pull_len = eth_get_headlen(va, HNS3_RX_HEAD_SIZE);
> +	ring->pull_len = eth_get_headlen(dev_net(netdev), va,
> +					 HNS3_RX_HEAD_SIZE);
>  	__skb_put(skb, ring->pull_len);
>  	hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len,
>  			    desc_cb);
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> index 2325cee76211..e2bee187d652 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> @@ -280,7 +280,7 @@ static bool fm10k_add_rx_frag(struct fm10k_rx_buffer *rx_buffer,
>  	/* we need the header to contain the greater of either ETH_HLEN or
>  	 * 60 bytes if the skb->len is less than 60 for skb_pad.
>  	 */
> -	pull_len = eth_get_headlen(va, FM10K_RX_HDR_LEN);
> +	pull_len = eth_get_headlen(dev_net(skb->dev), va, FM10K_RX_HDR_LEN);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 1a95223c9f99..85c5b503e0a0 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -2035,7 +2035,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > I40E_RX_HDR_SIZE)
> -		headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE);
> +		headlen = eth_get_headlen(dev_net(skb->dev), xdp->data,
> +					  I40E_RX_HDR_SIZE);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, headlen), xdp->data,
> diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> index b64187753ad6..23a62d7d0f9f 100644
> --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> @@ -1315,7 +1315,8 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > IAVF_RX_HDR_SIZE)
> -		headlen = eth_get_headlen(va, IAVF_RX_HDR_SIZE);
> +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> +					  IAVF_RX_HDR_SIZE);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> index a6f7b7feaf3c..2692b9333055 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> @@ -698,7 +698,8 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > ICE_RX_HDR_SIZE)
> -		headlen = eth_get_headlen(va, ICE_RX_HDR_SIZE);
> +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> +					  ICE_RX_HDR_SIZE);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index acbb5b4f333d..2023e1800c8d 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -8051,7 +8051,8 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > IGB_RX_HDR_LEN)
> -		headlen = eth_get_headlen(va, IGB_RX_HDR_LEN);
> +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> +					  IGB_RX_HDR_LEN);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index f79728381e8a..265a9d8a8421 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -1199,7 +1199,8 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring,
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > IGC_RX_HDR_LEN)
> -		headlen = eth_get_headlen(va, IGC_RX_HDR_LEN);
> +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> +					  IGC_RX_HDR_LEN);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 60cec3540dd7..5e5294567ca1 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -1800,7 +1800,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring,
>  	 * we need the header to contain the greater of either ETH_HLEN or
>  	 * 60 bytes if the skb->len is less than 60 for skb_pad.
>  	 */
> -	pull_len = eth_get_headlen(va, IXGBE_RX_HDR_SIZE);
> +	pull_len = eth_get_headlen(dev_net(skb->dev), va, IXGBE_RX_HDR_SIZE);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index 49e23afa05a2..252fe0de6b56 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -895,7 +895,8 @@ struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > IXGBEVF_RX_HDR_SIZE)
> -		headlen = eth_get_headlen(xdp->data, IXGBEVF_RX_HDR_SIZE);
> +		headlen = eth_get_headlen(dev_net(skb->dev), xdp->data,
> +					  IXGBEVF_RX_HDR_SIZE);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
>  	memcpy(__skb_put(skb, headlen), xdp->data,
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> index 40f3f98aa279..efcc27756c7e 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> @@ -163,7 +163,8 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
>  	case MLX5_INLINE_MODE_NONE:
>  		return 0;
>  	case MLX5_INLINE_MODE_TCP_UDP:
> -		hlen = eth_get_headlen(skb->data, skb_headlen(skb));
> +		hlen = eth_get_headlen(dev_net(skb->dev), skb->data,
> +				       skb_headlen(skb));
>  		if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
>  			hlen += VLAN_HLEN;
>  		break;
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 24d0220b9ba0..6d5c8ecfea1e 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1965,7 +1965,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  
>  	if (frags) {
>  		/* Exercise flow dissector code path. */
> -		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
> +		u32 headlen = eth_get_headlen(dev_net(tun->dev), skb->data,
> +					      skb_headlen(skb));
>  
>  		if (unlikely(headlen > skb_headlen(skb))) {
>  			this_cpu_inc(tun->pcpu_stats->rx_dropped);
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index e2f3b21cd72a..71a441ffab3f 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -33,7 +33,7 @@ struct device;
>  int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
>  unsigned char *arch_get_platform_mac_address(void);
>  int nvmem_get_mac_address(struct device *dev, void *addrbuf);
> -u32 eth_get_headlen(void *data, unsigned int max_len);
> +u32 eth_get_headlen(const struct net *net, void *data, unsigned int max_len);
>  __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
>  extern const struct header_ops eth_header_ops;
>  
> diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
> index 1e439549c419..0202e72e20a4 100644
> --- a/net/ethernet/eth.c
> +++ b/net/ethernet/eth.c
> @@ -119,13 +119,14 @@ EXPORT_SYMBOL(eth_header);
>  
>  /**
>   * eth_get_headlen - determine the length of header for an ethernet frame
> + * @net: pointer to device network namespace
>   * @data: pointer to start of frame
>   * @len: total length of frame
>   *
>   * Make a best effort attempt to pull the length for all of the headers for
>   * a given frame in a linear buffer.
>   */
> -u32 eth_get_headlen(void *data, unsigned int len)
> +u32 eth_get_headlen(const struct net *net, void *data, unsigned int len)

would it make sense to future proof it a little bit and pass 'dev'
into eth_get_headlen() instead of 'net' ?
May be tomorrow we'd want different flow_dissectors per-device
in addition to per-net ?

Also please add C based test for skb-less flow_dissector.
Current test_flow_dissector.sh doesn't seem to cover it.
Stanislav Fomichev April 19, 2019, 12:43 a.m. UTC | #2
On 04/18, Alexei Starovoitov wrote:
> On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> > Update all users eth_get_headlen to pass network namespace
> > and pass it down to the flow dissector. This commit is a noop
> > until administrator inserts BPF flow dissector program.
> > 
> > Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> > Cc: Saeed Mahameed <saeedm@mellanox.com>
> > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > Cc: intel-wired-lan@lists.osuosl.org
> > Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> > Cc: Salil Mehta <salil.mehta@huawei.com>
> > Cc: Michael Chan <michael.chan@broadcom.com>
> > Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  drivers/net/ethernet/aquantia/atlantic/aq_ring.c  | 3 ++-
> >  drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 2 +-
> >  drivers/net/ethernet/hisilicon/hns/hns_enet.c     | 3 ++-
> >  drivers/net/ethernet/hisilicon/hns3/hns3_enet.c   | 3 ++-
> >  drivers/net/ethernet/intel/fm10k/fm10k_main.c     | 2 +-
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.c       | 3 ++-
> >  drivers/net/ethernet/intel/iavf/iavf_txrx.c       | 3 ++-
> >  drivers/net/ethernet/intel/ice/ice_txrx.c         | 3 ++-
> >  drivers/net/ethernet/intel/igb/igb_main.c         | 3 ++-
> >  drivers/net/ethernet/intel/igc/igc_main.c         | 3 ++-
> >  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 2 +-
> >  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 ++-
> >  drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 3 ++-
> >  drivers/net/tun.c                                 | 3 ++-
> >  include/linux/etherdevice.h                       | 2 +-
> >  net/ethernet/eth.c                                | 5 +++--
> >  16 files changed, 29 insertions(+), 17 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> > index c64e2fb5a4f1..1b3181f757b7 100644
> > --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> > +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> > @@ -354,7 +354,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
> >  
> >  			hdr_len = buff->len;
> >  			if (hdr_len > AQ_CFG_RX_HDR_SIZE)
> > -				hdr_len = eth_get_headlen(aq_buf_vaddr(&buff->rxdata),
> > +				hdr_len = eth_get_headlen(dev_net(skb->dev),
> > +							  aq_buf_vaddr(&buff->rxdata),
> >  							  AQ_CFG_RX_HDR_SIZE);
> >  
> >  			memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > index 6528a597367b..8bb5f708ccc6 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > @@ -899,7 +899,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp,
> >  			     DMA_ATTR_WEAK_ORDERING);
> >  
> >  	if (unlikely(!payload))
> > -		payload = eth_get_headlen(data_ptr, len);
> > +		payload = eth_get_headlen(dev_net(bp->dev), data_ptr, len);
> >  
> >  	skb = napi_alloc_skb(&rxr->bnapi->napi, payload);
> >  	if (!skb) {
> > diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
> > index 297b95c1b3c1..f1ecc78d2323 100644
> > --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
> > +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
> > @@ -598,7 +598,8 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
> >  	} else {
> >  		ring->stats.seg_pkt_cnt++;
> >  
> > -		pull_len = eth_get_headlen(va, HNS_RX_HEAD_SIZE);
> > +		pull_len = eth_get_headlen(dev_net(ndev), va,
> > +					   HNS_RX_HEAD_SIZE);
> >  		memcpy(__skb_put(skb, pull_len), va,
> >  		       ALIGN(pull_len, sizeof(long)));
> >  
> > diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> > index b53b0911ec24..423d9ce0f6f8 100644
> > --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> > +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> > @@ -2457,7 +2457,8 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, int length,
> >  	ring->stats.seg_pkt_cnt++;
> >  	u64_stats_update_end(&ring->syncp);
> >  
> > -	ring->pull_len = eth_get_headlen(va, HNS3_RX_HEAD_SIZE);
> > +	ring->pull_len = eth_get_headlen(dev_net(netdev), va,
> > +					 HNS3_RX_HEAD_SIZE);
> >  	__skb_put(skb, ring->pull_len);
> >  	hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len,
> >  			    desc_cb);
> > diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> > index 2325cee76211..e2bee187d652 100644
> > --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> > +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> > @@ -280,7 +280,7 @@ static bool fm10k_add_rx_frag(struct fm10k_rx_buffer *rx_buffer,
> >  	/* we need the header to contain the greater of either ETH_HLEN or
> >  	 * 60 bytes if the skb->len is less than 60 for skb_pad.
> >  	 */
> > -	pull_len = eth_get_headlen(va, FM10K_RX_HDR_LEN);
> > +	pull_len = eth_get_headlen(dev_net(skb->dev), va, FM10K_RX_HDR_LEN);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > index 1a95223c9f99..85c5b503e0a0 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > @@ -2035,7 +2035,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
> >  	/* Determine available headroom for copy */
> >  	headlen = size;
> >  	if (headlen > I40E_RX_HDR_SIZE)
> > -		headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE);
> > +		headlen = eth_get_headlen(dev_net(skb->dev), xdp->data,
> > +					  I40E_RX_HDR_SIZE);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, headlen), xdp->data,
> > diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> > index b64187753ad6..23a62d7d0f9f 100644
> > --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> > +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> > @@ -1315,7 +1315,8 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
> >  	/* Determine available headroom for copy */
> >  	headlen = size;
> >  	if (headlen > IAVF_RX_HDR_SIZE)
> > -		headlen = eth_get_headlen(va, IAVF_RX_HDR_SIZE);
> > +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> > +					  IAVF_RX_HDR_SIZE);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> > diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > index a6f7b7feaf3c..2692b9333055 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > @@ -698,7 +698,8 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
> >  	/* Determine available headroom for copy */
> >  	headlen = size;
> >  	if (headlen > ICE_RX_HDR_SIZE)
> > -		headlen = eth_get_headlen(va, ICE_RX_HDR_SIZE);
> > +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> > +					  ICE_RX_HDR_SIZE);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> > index acbb5b4f333d..2023e1800c8d 100644
> > --- a/drivers/net/ethernet/intel/igb/igb_main.c
> > +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> > @@ -8051,7 +8051,8 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
> >  	/* Determine available headroom for copy */
> >  	headlen = size;
> >  	if (headlen > IGB_RX_HDR_LEN)
> > -		headlen = eth_get_headlen(va, IGB_RX_HDR_LEN);
> > +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> > +					  IGB_RX_HDR_LEN);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> > diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> > index f79728381e8a..265a9d8a8421 100644
> > --- a/drivers/net/ethernet/intel/igc/igc_main.c
> > +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> > @@ -1199,7 +1199,8 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring,
> >  	/* Determine available headroom for copy */
> >  	headlen = size;
> >  	if (headlen > IGC_RX_HDR_LEN)
> > -		headlen = eth_get_headlen(va, IGC_RX_HDR_LEN);
> > +		headlen = eth_get_headlen(dev_net(skb->dev), va,
> > +					  IGC_RX_HDR_LEN);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > index 60cec3540dd7..5e5294567ca1 100644
> > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > @@ -1800,7 +1800,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring,
> >  	 * we need the header to contain the greater of either ETH_HLEN or
> >  	 * 60 bytes if the skb->len is less than 60 for skb_pad.
> >  	 */
> > -	pull_len = eth_get_headlen(va, IXGBE_RX_HDR_SIZE);
> > +	pull_len = eth_get_headlen(dev_net(skb->dev), va, IXGBE_RX_HDR_SIZE);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
> > diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > index 49e23afa05a2..252fe0de6b56 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > @@ -895,7 +895,8 @@ struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
> >  	/* Determine available headroom for copy */
> >  	headlen = size;
> >  	if (headlen > IXGBEVF_RX_HDR_SIZE)
> > -		headlen = eth_get_headlen(xdp->data, IXGBEVF_RX_HDR_SIZE);
> > +		headlen = eth_get_headlen(dev_net(skb->dev), xdp->data,
> > +					  IXGBEVF_RX_HDR_SIZE);
> >  
> >  	/* align pull length to size of long to optimize memcpy performance */
> >  	memcpy(__skb_put(skb, headlen), xdp->data,
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> > index 40f3f98aa279..efcc27756c7e 100644
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> > @@ -163,7 +163,8 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
> >  	case MLX5_INLINE_MODE_NONE:
> >  		return 0;
> >  	case MLX5_INLINE_MODE_TCP_UDP:
> > -		hlen = eth_get_headlen(skb->data, skb_headlen(skb));
> > +		hlen = eth_get_headlen(dev_net(skb->dev), skb->data,
> > +				       skb_headlen(skb));
> >  		if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
> >  			hlen += VLAN_HLEN;
> >  		break;
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index 24d0220b9ba0..6d5c8ecfea1e 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -1965,7 +1965,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
> >  
> >  	if (frags) {
> >  		/* Exercise flow dissector code path. */
> > -		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
> > +		u32 headlen = eth_get_headlen(dev_net(tun->dev), skb->data,
> > +					      skb_headlen(skb));
> >  
> >  		if (unlikely(headlen > skb_headlen(skb))) {
> >  			this_cpu_inc(tun->pcpu_stats->rx_dropped);
> > diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> > index e2f3b21cd72a..71a441ffab3f 100644
> > --- a/include/linux/etherdevice.h
> > +++ b/include/linux/etherdevice.h
> > @@ -33,7 +33,7 @@ struct device;
> >  int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
> >  unsigned char *arch_get_platform_mac_address(void);
> >  int nvmem_get_mac_address(struct device *dev, void *addrbuf);
> > -u32 eth_get_headlen(void *data, unsigned int max_len);
> > +u32 eth_get_headlen(const struct net *net, void *data, unsigned int max_len);
> >  __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
> >  extern const struct header_ops eth_header_ops;
> >  
> > diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
> > index 1e439549c419..0202e72e20a4 100644
> > --- a/net/ethernet/eth.c
> > +++ b/net/ethernet/eth.c
> > @@ -119,13 +119,14 @@ EXPORT_SYMBOL(eth_header);
> >  
> >  /**
> >   * eth_get_headlen - determine the length of header for an ethernet frame
> > + * @net: pointer to device network namespace
> >   * @data: pointer to start of frame
> >   * @len: total length of frame
> >   *
> >   * Make a best effort attempt to pull the length for all of the headers for
> >   * a given frame in a linear buffer.
> >   */
> > -u32 eth_get_headlen(void *data, unsigned int len)
> > +u32 eth_get_headlen(const struct net *net, void *data, unsigned int len)
> 
> would it make sense to future proof it a little bit and pass 'dev'
> into eth_get_headlen() instead of 'net' ?
> May be tomorrow we'd want different flow_dissectors per-device
> in addition to per-net ?
Good point, will use net_device.

> Also please add C based test for skb-less flow_dissector.
> Current test_flow_dissector.sh doesn't seem to cover it.
It doesn't look like we can exercise skb-less flow dissector from
test_flow_dissector.sh; we need to trigger some driver code, which is
hard when we send the packets on the localhost in
test_flow_dissector.sh.

To test skb-less dissector I convert BPF_PROG_TEST_RUN to always use skb-less
mode. test_flow_dissector.sh tests skb-mode, prog_tests/flow_dissector.c
tests skb-less mode.
Alexei Starovoitov April 19, 2019, 4:50 a.m. UTC | #3
On Thu, Apr 18, 2019 at 05:43:50PM -0700, Stanislav Fomichev wrote:
> On 04/18, Alexei Starovoitov wrote:
> > On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> > > Update all users eth_get_headlen to pass network namespace
> > > and pass it down to the flow dissector. This commit is a noop
> > > until administrator inserts BPF flow dissector program.
> > > 
> > > Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> > > Cc: Saeed Mahameed <saeedm@mellanox.com>
> > > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > Cc: intel-wired-lan@lists.osuosl.org
> > > Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> > > Cc: Salil Mehta <salil.mehta@huawei.com>
> > > Cc: Michael Chan <michael.chan@broadcom.com>
> > > Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
... 
> > Also please add C based test for skb-less flow_dissector.
> > Current test_flow_dissector.sh doesn't seem to cover it.
> It doesn't look like we can exercise skb-less flow dissector from
> test_flow_dissector.sh; we need to trigger some driver code, which is
> hard when we send the packets on the localhost in
> test_flow_dissector.sh.
> 
> To test skb-less dissector I convert BPF_PROG_TEST_RUN to always use skb-less
> mode. test_flow_dissector.sh tests skb-mode, prog_tests/flow_dissector.c
> tests skb-less mode.

I saw that but I'm afraid it's not enough.
tun_get_user() is calling it, so it should be possible to test
skb-less mode via tun.
Stanislav Fomichev April 19, 2019, 11:29 p.m. UTC | #4
On 04/18, Alexei Starovoitov wrote:
> On Thu, Apr 18, 2019 at 05:43:50PM -0700, Stanislav Fomichev wrote:
> > On 04/18, Alexei Starovoitov wrote:
> > > On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> > > > Update all users eth_get_headlen to pass network namespace
> > > > and pass it down to the flow dissector. This commit is a noop
> > > > until administrator inserts BPF flow dissector program.
> > > > 
> > > > Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> > > > Cc: Saeed Mahameed <saeedm@mellanox.com>
> > > > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > Cc: intel-wired-lan@lists.osuosl.org
> > > > Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> > > > Cc: Salil Mehta <salil.mehta@huawei.com>
> > > > Cc: Michael Chan <michael.chan@broadcom.com>
> > > > Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ... 
> > > Also please add C based test for skb-less flow_dissector.
> > > Current test_flow_dissector.sh doesn't seem to cover it.
> > It doesn't look like we can exercise skb-less flow dissector from
> > test_flow_dissector.sh; we need to trigger some driver code, which is
> > hard when we send the packets on the localhost in
> > test_flow_dissector.sh.
> > 
> > To test skb-less dissector I convert BPF_PROG_TEST_RUN to always use skb-less
> > mode. test_flow_dissector.sh tests skb-mode, prog_tests/flow_dissector.c
> > tests skb-less mode.
> 
> I saw that but I'm afraid it's not enough.
> tun_get_user() is calling it, so it should be possible to test
> skb-less mode via tun.
Spent some time today looking into how to exercise this path in the tun
driver: doing writev() with IFF_NAPI_FRAGS IFF_TAP device would trigger
eth_get_headlen, but it looks like there is no way to do a test with
pass/no-pass result around that.

The problem is - we don't actually do anything with the result of
eth_get_headlen, there is only a sanity check for "headlen >
skb_headlen(skb)" which can't trigger for BPF flow dissector; we
carefully clamp thoff/nhoff and should not return offset outside the
input buffer.

By reading git history it looks like this call to eth_get_headlen was
added there to only make it possible for tools like syzbot to fuzz flow
dissector. That's why we don't care about the result, we just do that
simple sanity check. The main goal is to trigger some problem
(loop/warning) in the flow dissector code.

tl;dr - no mater which bpf flow dissector is attached to the namespace,
it would not change behavior of the tun device; even empty 'return
false' program would not alter it.

Let me know if you had something different in mind; because so far I
don't see how to do a test around that. Changing that "headlen >
skb_headlen(skb)" check into something meaningful also doesn't seem possible.
I thought about checking the result of eth_get_headlen against
skb_transport_offset(), but at that point transport offset of the skb
is not yet set (napi_gro_frags and gro layer later does that) :-(
Alexei Starovoitov April 19, 2019, 11:37 p.m. UTC | #5
On Fri, Apr 19, 2019 at 04:29:44PM -0700, Stanislav Fomichev wrote:
> On 04/18, Alexei Starovoitov wrote:
> > On Thu, Apr 18, 2019 at 05:43:50PM -0700, Stanislav Fomichev wrote:
> > > On 04/18, Alexei Starovoitov wrote:
> > > > On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> > > > > Update all users eth_get_headlen to pass network namespace
> > > > > and pass it down to the flow dissector. This commit is a noop
> > > > > until administrator inserts BPF flow dissector program.
> > > > > 
> > > > > Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> > > > > Cc: Saeed Mahameed <saeedm@mellanox.com>
> > > > > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > > Cc: intel-wired-lan@lists.osuosl.org
> > > > > Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> > > > > Cc: Salil Mehta <salil.mehta@huawei.com>
> > > > > Cc: Michael Chan <michael.chan@broadcom.com>
> > > > > Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ... 
> > > > Also please add C based test for skb-less flow_dissector.
> > > > Current test_flow_dissector.sh doesn't seem to cover it.
> > > It doesn't look like we can exercise skb-less flow dissector from
> > > test_flow_dissector.sh; we need to trigger some driver code, which is
> > > hard when we send the packets on the localhost in
> > > test_flow_dissector.sh.
> > > 
> > > To test skb-less dissector I convert BPF_PROG_TEST_RUN to always use skb-less
> > > mode. test_flow_dissector.sh tests skb-mode, prog_tests/flow_dissector.c
> > > tests skb-less mode.
> > 
> > I saw that but I'm afraid it's not enough.
> > tun_get_user() is calling it, so it should be possible to test
> > skb-less mode via tun.
> Spent some time today looking into how to exercise this path in the tun
> driver: doing writev() with IFF_NAPI_FRAGS IFF_TAP device would trigger
> eth_get_headlen, but it looks like there is no way to do a test with
> pass/no-pass result around that.
> 
> The problem is - we don't actually do anything with the result of
> eth_get_headlen, there is only a sanity check for "headlen >
> skb_headlen(skb)" which can't trigger for BPF flow dissector; we
> carefully clamp thoff/nhoff and should not return offset outside the
> input buffer.
> 
> By reading git history it looks like this call to eth_get_headlen was
> added there to only make it possible for tools like syzbot to fuzz flow
> dissector. That's why we don't care about the result, we just do that
> simple sanity check. The main goal is to trigger some problem
> (loop/warning) in the flow dissector code.
> 
> tl;dr - no mater which bpf flow dissector is attached to the namespace,
> it would not change behavior of the tun device; even empty 'return
> false' program would not alter it.

sure, but the program will run and the test can validate that the program
saw valid packet, parsed it correctly and returned correct dissection.
The results can be stored in a map and validated by the test.
iirc you were saying that you'll have one program doing dissection
for with-skb and skb-less cases.
I think it's important to have such program in selftests and being
run continuously for both cases.
Stanislav Fomichev April 19, 2019, 11:47 p.m. UTC | #6
On 04/19, Alexei Starovoitov wrote:
> On Fri, Apr 19, 2019 at 04:29:44PM -0700, Stanislav Fomichev wrote:
> > On 04/18, Alexei Starovoitov wrote:
> > > On Thu, Apr 18, 2019 at 05:43:50PM -0700, Stanislav Fomichev wrote:
> > > > On 04/18, Alexei Starovoitov wrote:
> > > > > On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> > > > > > Update all users eth_get_headlen to pass network namespace
> > > > > > and pass it down to the flow dissector. This commit is a noop
> > > > > > until administrator inserts BPF flow dissector program.
> > > > > > 
> > > > > > Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> > > > > > Cc: Saeed Mahameed <saeedm@mellanox.com>
> > > > > > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > > > Cc: intel-wired-lan@lists.osuosl.org
> > > > > > Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> > > > > > Cc: Salil Mehta <salil.mehta@huawei.com>
> > > > > > Cc: Michael Chan <michael.chan@broadcom.com>
> > > > > > Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> > > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > ... 
> > > > > Also please add C based test for skb-less flow_dissector.
> > > > > Current test_flow_dissector.sh doesn't seem to cover it.
> > > > It doesn't look like we can exercise skb-less flow dissector from
> > > > test_flow_dissector.sh; we need to trigger some driver code, which is
> > > > hard when we send the packets on the localhost in
> > > > test_flow_dissector.sh.
> > > > 
> > > > To test skb-less dissector I convert BPF_PROG_TEST_RUN to always use skb-less
> > > > mode. test_flow_dissector.sh tests skb-mode, prog_tests/flow_dissector.c
> > > > tests skb-less mode.
> > > 
> > > I saw that but I'm afraid it's not enough.
> > > tun_get_user() is calling it, so it should be possible to test
> > > skb-less mode via tun.
> > Spent some time today looking into how to exercise this path in the tun
> > driver: doing writev() with IFF_NAPI_FRAGS IFF_TAP device would trigger
> > eth_get_headlen, but it looks like there is no way to do a test with
> > pass/no-pass result around that.
> > 
> > The problem is - we don't actually do anything with the result of
> > eth_get_headlen, there is only a sanity check for "headlen >
> > skb_headlen(skb)" which can't trigger for BPF flow dissector; we
> > carefully clamp thoff/nhoff and should not return offset outside the
> > input buffer.
> > 
> > By reading git history it looks like this call to eth_get_headlen was
> > added there to only make it possible for tools like syzbot to fuzz flow
> > dissector. That's why we don't care about the result, we just do that
> > simple sanity check. The main goal is to trigger some problem
> > (loop/warning) in the flow dissector code.
> > 
> > tl;dr - no mater which bpf flow dissector is attached to the namespace,
> > it would not change behavior of the tun device; even empty 'return
> > false' program would not alter it.
> 
> sure, but the program will run and the test can validate that the program
> saw valid packet, parsed it correctly and returned correct dissection.
> The results can be stored in a map and validated by the test.
SG, that's doable; that would make bpf_flow.c less generic because it would
have to have this map which would export the last dissection, but that
should be fine, I guess.

(I planned to use bpf_flow.c internally instead of writing another one).

> iirc you were saying that you'll have one program doing dissection
> for with-skb and skb-less cases.
Correct.

> I think it's important to have such program in selftests and being
> run continuously for both cases.
Ok, in this case, I can write a small userspace program that writes some
dummy packet into a tap device (triggers eth_get_headlen) and reads back
and verifies bpf_flow_keys from the shared map.

Correct me if I misunderstood something. Otherwise, I'll get back to you
with a v6+test.
Alexei Starovoitov April 19, 2019, 11:50 p.m. UTC | #7
On Fri, Apr 19, 2019 at 04:47:44PM -0700, Stanislav Fomichev wrote:
> On 04/19, Alexei Starovoitov wrote:
> > On Fri, Apr 19, 2019 at 04:29:44PM -0700, Stanislav Fomichev wrote:
> > > On 04/18, Alexei Starovoitov wrote:
> > > > On Thu, Apr 18, 2019 at 05:43:50PM -0700, Stanislav Fomichev wrote:
> > > > > On 04/18, Alexei Starovoitov wrote:
> > > > > > On Mon, Apr 15, 2019 at 10:38:00AM -0700, Stanislav Fomichev wrote:
> > > > > > > Update all users eth_get_headlen to pass network namespace
> > > > > > > and pass it down to the flow dissector. This commit is a noop
> > > > > > > until administrator inserts BPF flow dissector program.
> > > > > > > 
> > > > > > > Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
> > > > > > > Cc: Saeed Mahameed <saeedm@mellanox.com>
> > > > > > > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > > > > Cc: intel-wired-lan@lists.osuosl.org
> > > > > > > Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> > > > > > > Cc: Salil Mehta <salil.mehta@huawei.com>
> > > > > > > Cc: Michael Chan <michael.chan@broadcom.com>
> > > > > > > Cc: Igor Russkikh <igor.russkikh@aquantia.com>
> > > > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > > ... 
> > > > > > Also please add C based test for skb-less flow_dissector.
> > > > > > Current test_flow_dissector.sh doesn't seem to cover it.
> > > > > It doesn't look like we can exercise skb-less flow dissector from
> > > > > test_flow_dissector.sh; we need to trigger some driver code, which is
> > > > > hard when we send the packets on the localhost in
> > > > > test_flow_dissector.sh.
> > > > > 
> > > > > To test skb-less dissector I convert BPF_PROG_TEST_RUN to always use skb-less
> > > > > mode. test_flow_dissector.sh tests skb-mode, prog_tests/flow_dissector.c
> > > > > tests skb-less mode.
> > > > 
> > > > I saw that but I'm afraid it's not enough.
> > > > tun_get_user() is calling it, so it should be possible to test
> > > > skb-less mode via tun.
> > > Spent some time today looking into how to exercise this path in the tun
> > > driver: doing writev() with IFF_NAPI_FRAGS IFF_TAP device would trigger
> > > eth_get_headlen, but it looks like there is no way to do a test with
> > > pass/no-pass result around that.
> > > 
> > > The problem is - we don't actually do anything with the result of
> > > eth_get_headlen, there is only a sanity check for "headlen >
> > > skb_headlen(skb)" which can't trigger for BPF flow dissector; we
> > > carefully clamp thoff/nhoff and should not return offset outside the
> > > input buffer.
> > > 
> > > By reading git history it looks like this call to eth_get_headlen was
> > > added there to only make it possible for tools like syzbot to fuzz flow
> > > dissector. That's why we don't care about the result, we just do that
> > > simple sanity check. The main goal is to trigger some problem
> > > (loop/warning) in the flow dissector code.
> > > 
> > > tl;dr - no mater which bpf flow dissector is attached to the namespace,
> > > it would not change behavior of the tun device; even empty 'return
> > > false' program would not alter it.
> > 
> > sure, but the program will run and the test can validate that the program
> > saw valid packet, parsed it correctly and returned correct dissection.
> > The results can be stored in a map and validated by the test.
> SG, that's doable; that would make bpf_flow.c less generic because it would
> have to have this map which would export the last dissection, but that
> should be fine, I guess.
> 
> (I planned to use bpf_flow.c internally instead of writing another one).
> 
> > iirc you were saying that you'll have one program doing dissection
> > for with-skb and skb-less cases.
> Correct.
> 
> > I think it's important to have such program in selftests and being
> > run continuously for both cases.
> Ok, in this case, I can write a small userspace program that writes some
> dummy packet into a tap device (triggers eth_get_headlen) and reads back
> and verifies bpf_flow_keys from the shared map.

that sounds good. there could be two test programs. bpf_flow.c that you
use internally and another one, but please make that other one to test
both with-skb and skb-less paths.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index c64e2fb5a4f1..1b3181f757b7 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -354,7 +354,8 @@  int aq_ring_rx_clean(struct aq_ring_s *self,
 
 			hdr_len = buff->len;
 			if (hdr_len > AQ_CFG_RX_HDR_SIZE)
-				hdr_len = eth_get_headlen(aq_buf_vaddr(&buff->rxdata),
+				hdr_len = eth_get_headlen(dev_net(skb->dev),
+							  aq_buf_vaddr(&buff->rxdata),
 							  AQ_CFG_RX_HDR_SIZE);
 
 			memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 6528a597367b..8bb5f708ccc6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -899,7 +899,7 @@  static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp,
 			     DMA_ATTR_WEAK_ORDERING);
 
 	if (unlikely(!payload))
-		payload = eth_get_headlen(data_ptr, len);
+		payload = eth_get_headlen(dev_net(bp->dev), data_ptr, len);
 
 	skb = napi_alloc_skb(&rxr->bnapi->napi, payload);
 	if (!skb) {
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 297b95c1b3c1..f1ecc78d2323 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -598,7 +598,8 @@  static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
 	} else {
 		ring->stats.seg_pkt_cnt++;
 
-		pull_len = eth_get_headlen(va, HNS_RX_HEAD_SIZE);
+		pull_len = eth_get_headlen(dev_net(ndev), va,
+					   HNS_RX_HEAD_SIZE);
 		memcpy(__skb_put(skb, pull_len), va,
 		       ALIGN(pull_len, sizeof(long)));
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index b53b0911ec24..423d9ce0f6f8 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2457,7 +2457,8 @@  static int hns3_alloc_skb(struct hns3_enet_ring *ring, int length,
 	ring->stats.seg_pkt_cnt++;
 	u64_stats_update_end(&ring->syncp);
 
-	ring->pull_len = eth_get_headlen(va, HNS3_RX_HEAD_SIZE);
+	ring->pull_len = eth_get_headlen(dev_net(netdev), va,
+					 HNS3_RX_HEAD_SIZE);
 	__skb_put(skb, ring->pull_len);
 	hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len,
 			    desc_cb);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 2325cee76211..e2bee187d652 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -280,7 +280,7 @@  static bool fm10k_add_rx_frag(struct fm10k_rx_buffer *rx_buffer,
 	/* we need the header to contain the greater of either ETH_HLEN or
 	 * 60 bytes if the skb->len is less than 60 for skb_pad.
 	 */
-	pull_len = eth_get_headlen(va, FM10K_RX_HDR_LEN);
+	pull_len = eth_get_headlen(dev_net(skb->dev), va, FM10K_RX_HDR_LEN);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1a95223c9f99..85c5b503e0a0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2035,7 +2035,8 @@  static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > I40E_RX_HDR_SIZE)
-		headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE);
+		headlen = eth_get_headlen(dev_net(skb->dev), xdp->data,
+					  I40E_RX_HDR_SIZE);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, headlen), xdp->data,
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index b64187753ad6..23a62d7d0f9f 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -1315,7 +1315,8 @@  static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IAVF_RX_HDR_SIZE)
-		headlen = eth_get_headlen(va, IAVF_RX_HDR_SIZE);
+		headlen = eth_get_headlen(dev_net(skb->dev), va,
+					  IAVF_RX_HDR_SIZE);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index a6f7b7feaf3c..2692b9333055 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -698,7 +698,8 @@  ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > ICE_RX_HDR_SIZE)
-		headlen = eth_get_headlen(va, ICE_RX_HDR_SIZE);
+		headlen = eth_get_headlen(dev_net(skb->dev), va,
+					  ICE_RX_HDR_SIZE);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index acbb5b4f333d..2023e1800c8d 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8051,7 +8051,8 @@  static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IGB_RX_HDR_LEN)
-		headlen = eth_get_headlen(va, IGB_RX_HDR_LEN);
+		headlen = eth_get_headlen(dev_net(skb->dev), va,
+					  IGB_RX_HDR_LEN);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index f79728381e8a..265a9d8a8421 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1199,7 +1199,8 @@  static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IGC_RX_HDR_LEN)
-		headlen = eth_get_headlen(va, IGC_RX_HDR_LEN);
+		headlen = eth_get_headlen(dev_net(skb->dev), va,
+					  IGC_RX_HDR_LEN);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 60cec3540dd7..5e5294567ca1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1800,7 +1800,7 @@  static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring,
 	 * we need the header to contain the greater of either ETH_HLEN or
 	 * 60 bytes if the skb->len is less than 60 for skb_pad.
 	 */
-	pull_len = eth_get_headlen(va, IXGBE_RX_HDR_SIZE);
+	pull_len = eth_get_headlen(dev_net(skb->dev), va, IXGBE_RX_HDR_SIZE);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 49e23afa05a2..252fe0de6b56 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -895,7 +895,8 @@  struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IXGBEVF_RX_HDR_SIZE)
-		headlen = eth_get_headlen(xdp->data, IXGBEVF_RX_HDR_SIZE);
+		headlen = eth_get_headlen(dev_net(skb->dev), xdp->data,
+					  IXGBEVF_RX_HDR_SIZE);
 
 	/* align pull length to size of long to optimize memcpy performance */
 	memcpy(__skb_put(skb, headlen), xdp->data,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 40f3f98aa279..efcc27756c7e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -163,7 +163,8 @@  static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
 	case MLX5_INLINE_MODE_NONE:
 		return 0;
 	case MLX5_INLINE_MODE_TCP_UDP:
-		hlen = eth_get_headlen(skb->data, skb_headlen(skb));
+		hlen = eth_get_headlen(dev_net(skb->dev), skb->data,
+				       skb_headlen(skb));
 		if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
 			hlen += VLAN_HLEN;
 		break;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 24d0220b9ba0..6d5c8ecfea1e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1965,7 +1965,8 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	if (frags) {
 		/* Exercise flow dissector code path. */
-		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+		u32 headlen = eth_get_headlen(dev_net(tun->dev), skb->data,
+					      skb_headlen(skb));
 
 		if (unlikely(headlen > skb_headlen(skb))) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index e2f3b21cd72a..71a441ffab3f 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -33,7 +33,7 @@  struct device;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
-u32 eth_get_headlen(void *data, unsigned int max_len);
+u32 eth_get_headlen(const struct net *net, void *data, unsigned int max_len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern const struct header_ops eth_header_ops;
 
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 1e439549c419..0202e72e20a4 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -119,13 +119,14 @@  EXPORT_SYMBOL(eth_header);
 
 /**
  * eth_get_headlen - determine the length of header for an ethernet frame
+ * @net: pointer to device network namespace
  * @data: pointer to start of frame
  * @len: total length of frame
  *
  * Make a best effort attempt to pull the length for all of the headers for
  * a given frame in a linear buffer.
  */
-u32 eth_get_headlen(void *data, unsigned int len)
+u32 eth_get_headlen(const struct net *net, void *data, unsigned int len)
 {
 	const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 	const struct ethhdr *eth = (const struct ethhdr *)data;
@@ -136,7 +137,7 @@  u32 eth_get_headlen(void *data, unsigned int len)
 		return len;
 
 	/* parse any remaining L2/L3 headers, check for L4 */
-	if (!skb_flow_dissect_flow_keys_basic(NULL, NULL, &keys, data,
+	if (!skb_flow_dissect_flow_keys_basic(net, NULL, &keys, data,
 					      eth->h_proto, sizeof(*eth),
 					      len, flags))
 		return max_t(u32, keys.control.thoff, sizeof(*eth));