diff mbox

[1/2] net: Remove ndo_xmit_flush netdev operation, use signalling instead.

Message ID 20140825.163502.973913220915588977.davem@davemloft.net
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

David Miller Aug. 25, 2014, 11:35 p.m. UTC
As reported by Jesper Dangaard Brouer, for high packet rates the
overhead of having another indirect call in the TX path is
non-trivial.

There is the indirect call itself, and then there is all of the
reloading of the state to refetch the tail pointer value and
then write the device register.

Move to a more passive scheme, which requires very light modifications
to the device drivers.

The signal is a new skb->xmit_more value, if it is non-zero it means
that more SKBs are pending to be transmitted on the same queue as the
current SKB.  And therefore, the driver may elide the tail pointer
update.

Right now skb->xmit_more is always zero.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 36 +++++++++++--------------------
 drivers/net/virtio_net.c                  | 12 +++--------
 include/linux/netdevice.h                 | 25 ++-------------------
 include/linux/skbuff.h                    |  2 ++
 4 files changed, 19 insertions(+), 56 deletions(-)

Comments

Tom Herbert Aug. 26, 2014, 3:42 a.m. UTC | #1
On Mon, Aug 25, 2014 at 4:35 PM, David Miller <davem@davemloft.net> wrote:
>
> As reported by Jesper Dangaard Brouer, for high packet rates the
> overhead of having another indirect call in the TX path is
> non-trivial.
>
> There is the indirect call itself, and then there is all of the
> reloading of the state to refetch the tail pointer value and
> then write the device register.
>
> Move to a more passive scheme, which requires very light modifications
> to the device drivers.
>
> The signal is a new skb->xmit_more value, if it is non-zero it means
> that more SKBs are pending to be transmitted on the same queue as the
> current SKB.  And therefore, the driver may elide the tail pointer
> update.
>
> Right now skb->xmit_more is always zero.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  drivers/net/ethernet/intel/igb/igb_main.c | 36 +++++++++++--------------------
>  drivers/net/virtio_net.c                  | 12 +++--------
>  include/linux/netdevice.h                 | 25 ++-------------------
>  include/linux/skbuff.h                    |  2 ++
>  4 files changed, 19 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index b9c020a..89c29b4 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -136,7 +136,6 @@ static void igb_update_phy_info(unsigned long);
>  static void igb_watchdog(unsigned long);
>  static void igb_watchdog_task(struct work_struct *);
>  static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
> -static void igb_xmit_flush(struct net_device *netdev, u16 queue);
>  static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev,
>                                           struct rtnl_link_stats64 *stats);
>  static int igb_change_mtu(struct net_device *, int);
> @@ -2076,7 +2075,6 @@ static const struct net_device_ops igb_netdev_ops = {
>         .ndo_open               = igb_open,
>         .ndo_stop               = igb_close,
>         .ndo_start_xmit         = igb_xmit_frame,
> -       .ndo_xmit_flush         = igb_xmit_flush,
>         .ndo_get_stats64        = igb_get_stats64,
>         .ndo_set_rx_mode        = igb_set_rx_mode,
>         .ndo_set_mac_address    = igb_set_mac,
> @@ -4917,6 +4915,14 @@ static void igb_tx_map(struct igb_ring *tx_ring,
>
>         tx_ring->next_to_use = i;
>
> +       if (!skb->xmit_more) {
> +               writel(i, tx_ring->tail);
> +
> +               /* we need this if more than one processor can write to our tail
> +                * at a time, it synchronizes IO on IA64/Altix systems
> +                */
> +               mmiowb();
> +       }
>         return;
>
I would suggest the flush should be done if !skb->xmit_more or queue
is being stopped. So maybe pull this code into it's own function e.g.
igb_flush(tx_ring). Then do:

if (igb_maybe_stop_tx(tx_ring, DESC_NEEDED) || !skb->more)
   igb_flush(tx_ring);

>  dma_error:
> @@ -5052,20 +5058,17 @@ out_drop:
>         return NETDEV_TX_OK;
>  }
>
> -static struct igb_ring *__igb_tx_queue_mapping(struct igb_adapter *adapter, unsigned int r_idx)
> +static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
> +                                                   struct sk_buff *skb)
>  {
> +       unsigned int r_idx = skb->queue_mapping;
> +
>         if (r_idx >= adapter->num_tx_queues)
>                 r_idx = r_idx % adapter->num_tx_queues;
>
>         return adapter->tx_ring[r_idx];
>  }
>
> -static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
> -                                                   struct sk_buff *skb)
> -{
> -       return __igb_tx_queue_mapping(adapter, skb->queue_mapping);
> -}
> -
>  static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
>                                   struct net_device *netdev)
>  {
> @@ -5094,21 +5097,6 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
>         return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
>  }
>
> -static void igb_xmit_flush(struct net_device *netdev, u16 queue)
> -{
> -       struct igb_adapter *adapter = netdev_priv(netdev);
> -       struct igb_ring *tx_ring;
> -
> -       tx_ring = __igb_tx_queue_mapping(adapter, queue);
> -
> -       writel(tx_ring->next_to_use, tx_ring->tail);
> -
> -       /* we need this if more than one processor can write to our tail
> -        * at a time, it synchronizes IO on IA64/Altix systems
> -        */
> -       mmiowb();
> -}
> -
>  /**
>   *  igb_tx_timeout - Respond to a Tx Hang
>   *  @netdev: network interface device structure
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6242108..f0c2824 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -953,15 +953,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>                 }
>         }
>
> -       return NETDEV_TX_OK;
> -}
> +       if (!skb->xmit_more)
> +               virtqueue_kick(sq->vq);
>
> -static void xmit_flush(struct net_device *dev, u16 qnum)
> -{
> -       struct virtnet_info *vi = netdev_priv(dev);
> -       struct send_queue *sq = &vi->sq[qnum];
> -
> -       virtqueue_kick(sq->vq);
> +       return NETDEV_TX_OK;
>  }
>
>  /*
> @@ -1393,7 +1388,6 @@ static const struct net_device_ops virtnet_netdev = {
>         .ndo_open            = virtnet_open,
>         .ndo_stop            = virtnet_close,
>         .ndo_start_xmit      = start_xmit,
> -       .ndo_xmit_flush      = xmit_flush,
>         .ndo_validate_addr   = eth_validate_addr,
>         .ndo_set_mac_address = virtnet_set_mac_address,
>         .ndo_set_rx_mode     = virtnet_set_rx_mode,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 220c509..039b237 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -782,19 +782,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
>   *     Required can not be NULL.
>   *
> - * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue);
> - *     A driver implements this function when it wishes to support
> - *     deferred TX queue flushing.  The idea is that the expensive
> - *     operation to trigger TX queue processing can be done after
> - *     N calls to ndo_start_xmit rather than being done every single
> - *     time.  In this regime ndo_start_xmit will be called one or more
> - *     times, and then a final ndo_xmit_flush call will be made to
> - *     have the driver tell the device about the new pending TX queue
> - *     entries.  The kernel keeps track of which queues need flushing
> - *     by monitoring skb->queue_mapping of the packets it submits to
> - *     ndo_start_xmit.  This is the queue value that will be passed
> - *     to ndo_xmit_flush.
> - *
>   * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
>   *                         void *accel_priv, select_queue_fallback_t fallback);
>   *     Called to decide which queue to when device supports multiple
> @@ -1018,7 +1005,6 @@ struct net_device_ops {
>         int                     (*ndo_stop)(struct net_device *dev);
>         netdev_tx_t             (*ndo_start_xmit) (struct sk_buff *skb,
>                                                    struct net_device *dev);
> -       void                    (*ndo_xmit_flush)(struct net_device *dev, u16 queue);
>         u16                     (*ndo_select_queue)(struct net_device *dev,
>                                                     struct sk_buff *skb,
>                                                     void *accel_priv,
> @@ -3447,15 +3433,8 @@ int __init dev_proc_init(void);
>  static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
>                                               struct sk_buff *skb, struct net_device *dev)
>  {
> -       netdev_tx_t ret;
> -       u16 q;
> -
> -       q = skb->queue_mapping;
> -       ret = ops->ndo_start_xmit(skb, dev);
> -       if (dev_xmit_complete(ret) && ops->ndo_xmit_flush)
> -               ops->ndo_xmit_flush(dev, q);
> -
> -       return ret;
> +       skb->xmit_more = 0;
> +       return ops->ndo_start_xmit(skb, dev);
>  }
>
>  static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 18ddf96..9b3802a 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -452,6 +452,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
>   *     @tc_verd: traffic control verdict
>   *     @hash: the packet hash
>   *     @queue_mapping: Queue mapping for multiqueue devices
> + *     @xmit_more: More SKBs are pending for this queue
>   *     @ndisc_nodetype: router type (from link layer)
>   *     @ooo_okay: allow the mapping of a socket to a queue to be changed
>   *     @l4_hash: indicate hash is a canonical 4-tuple hash over transport
> @@ -558,6 +559,7 @@ struct sk_buff {
>
>         __u16                   queue_mapping;
>         kmemcheck_bitfield_begin(flags2);
> +       __u8                    xmit_more:1;
>  #ifdef CONFIG_IPV6_NDISC_NODETYPE
>         __u8                    ndisc_nodetype:2;
>  #endif
> --
> 1.7.11.7
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Aug. 26, 2014, 4:50 a.m. UTC | #2
From: Tom Herbert <therbert@google.com>
Date: Mon, 25 Aug 2014 20:42:59 -0700

> I would suggest the flush should be done if !skb->xmit_more or queue
> is being stopped. So maybe pull this code into it's own function e.g.
> igb_flush(tx_ring). Then do:
> 
> if (igb_maybe_stop_tx(tx_ring, DESC_NEEDED) || !skb->more)
>    igb_flush(tx_ring);

Agreed, and we will need to sort this out in all cases before we start
actually doing real deferral by setting skb->xmit_more.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
yzhu1 Sept. 1, 2015, 6:46 a.m. UTC | #3
On 08/26/2014 07:35 AM, David Miller wrote:
> As reported by Jesper Dangaard Brouer, for high packet rates the
> overhead of having another indirect call in the TX path is
> non-trivial.
>
> There is the indirect call itself, and then there is all of the
> reloading of the state to refetch the tail pointer value and
> then write the device register.
>
> Move to a more passive scheme, which requires very light modifications
> to the device drivers.
>
> The signal is a new skb->xmit_more value, if it is non-zero it means
> that more SKBs are pending to be transmitted on the same queue as the
> current SKB.  And therefore, the driver may elide the tail pointer
> update.
>
> Right now skb->xmit_more is always zero.
Hi, Miller

After I applied this patch, the skb->xmit_more is not always zero.

My igb driver is as below:

09:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network 
Connection (rev 01)
     Subsystem: Intel Corporation I350 Gigabit Network Connection
     Flags: bus master, fast devsel, latency 0, IRQ 16
     Memory at d0960000 (32-bit, non-prefetchable) [size=128K]
     I/O ports at 2060 [size=32]
     Memory at d09b0000 (32-bit, non-prefetchable) [size=16K]
     Capabilities: [40] Power Management version 3
     Capabilities: [50] MSI: Enable- Count=1/1 Maskable+ 64bit+
     Capabilities: [70] MSI-X: Enable+ Count=10 Masked-
     Capabilities: [a0] Express Endpoint, MSI 00
     Capabilities: [e0] Vital Product Data
     Capabilities: [100] Advanced Error Reporting
     Capabilities: [140] Device Serial Number 00-1e-67-ff-ff-65-8d-9c
     Capabilities: [150] Alternative Routing-ID Interpretation (ARI)
     Capabilities: [160] Single Root I/O Virtualization (SR-IOV)
     Capabilities: [1a0] Transaction Processing Hints
     Capabilities: [1c0] Latency Tolerance Reporting
     Capabilities: [1d0] Access Control Services
     Kernel driver in use: igb

And the logs are as below:

[  917.489934] func:igb_tx_map, line:4860,skb->xmit_more:1
[  929.616829] func:igb_tx_map, line:4860,skb->xmit_more:1
[  929.621548] func:igb_tx_map, line:4860,skb->xmit_more:1
[  940.675453] func:igb_tx_map, line:4860,skb->xmit_more:1
[  951.811508] func:igb_tx_map, line:4860,skb->xmit_more:1
[  961.903057] func:igb_tx_map, line:4860,skb->xmit_more:1
[  974.032852] func:igb_tx_map, line:4860,skb->xmit_more:1
[  974.075644] func:igb_tx_map, line:4860,skb->xmit_more:1

Maybe the assumption that skb->xmit_more is always 0 is not correct.

Sometimes skb->xmit_more is not always 0 because of compiler or other 
reasons.

Best Regards!
Zhu Yanjun

.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>   drivers/net/ethernet/intel/igb/igb_main.c | 36 +++++++++++--------------------
>   drivers/net/virtio_net.c                  | 12 +++--------
>   include/linux/netdevice.h                 | 25 ++-------------------
>   include/linux/skbuff.h                    |  2 ++
>   4 files changed, 19 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index b9c020a..89c29b4 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -136,7 +136,6 @@ static void igb_update_phy_info(unsigned long);
>   static void igb_watchdog(unsigned long);
>   static void igb_watchdog_task(struct work_struct *);
>   static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
> -static void igb_xmit_flush(struct net_device *netdev, u16 queue);
>   static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev,
>   					  struct rtnl_link_stats64 *stats);
>   static int igb_change_mtu(struct net_device *, int);
> @@ -2076,7 +2075,6 @@ static const struct net_device_ops igb_netdev_ops = {
>   	.ndo_open		= igb_open,
>   	.ndo_stop		= igb_close,
>   	.ndo_start_xmit		= igb_xmit_frame,
> -	.ndo_xmit_flush		= igb_xmit_flush,
>   	.ndo_get_stats64	= igb_get_stats64,
>   	.ndo_set_rx_mode	= igb_set_rx_mode,
>   	.ndo_set_mac_address	= igb_set_mac,
> @@ -4917,6 +4915,14 @@ static void igb_tx_map(struct igb_ring *tx_ring,
>   
>   	tx_ring->next_to_use = i;
>   
> +	if (!skb->xmit_more) {
> +		writel(i, tx_ring->tail);
> +
> +		/* we need this if more than one processor can write to our tail
> +		 * at a time, it synchronizes IO on IA64/Altix systems
> +		 */
> +		mmiowb();
> +	}
>   	return;
>   
>   dma_error:
> @@ -5052,20 +5058,17 @@ out_drop:
>   	return NETDEV_TX_OK;
>   }
>   
> -static struct igb_ring *__igb_tx_queue_mapping(struct igb_adapter *adapter, unsigned int r_idx)
> +static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
> +						    struct sk_buff *skb)
>   {
> +	unsigned int r_idx = skb->queue_mapping;
> +
>   	if (r_idx >= adapter->num_tx_queues)
>   		r_idx = r_idx % adapter->num_tx_queues;
>   
>   	return adapter->tx_ring[r_idx];
>   }
>   
> -static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
> -						    struct sk_buff *skb)
> -{
> -	return __igb_tx_queue_mapping(adapter, skb->queue_mapping);
> -}
> -
>   static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
>   				  struct net_device *netdev)
>   {
> @@ -5094,21 +5097,6 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
>   	return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
>   }
>   
> -static void igb_xmit_flush(struct net_device *netdev, u16 queue)
> -{
> -	struct igb_adapter *adapter = netdev_priv(netdev);
> -	struct igb_ring *tx_ring;
> -
> -	tx_ring = __igb_tx_queue_mapping(adapter, queue);
> -
> -	writel(tx_ring->next_to_use, tx_ring->tail);
> -
> -	/* we need this if more than one processor can write to our tail
> -	 * at a time, it synchronizes IO on IA64/Altix systems
> -	 */
> -	mmiowb();
> -}
> -
>   /**
>    *  igb_tx_timeout - Respond to a Tx Hang
>    *  @netdev: network interface device structure
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6242108..f0c2824 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -953,15 +953,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>   		}
>   	}
>   
> -	return NETDEV_TX_OK;
> -}
> +	if (!skb->xmit_more)
> +		virtqueue_kick(sq->vq);
>   
> -static void xmit_flush(struct net_device *dev, u16 qnum)
> -{
> -	struct virtnet_info *vi = netdev_priv(dev);
> -	struct send_queue *sq = &vi->sq[qnum];
> -
> -	virtqueue_kick(sq->vq);
> +	return NETDEV_TX_OK;
>   }
>   
>   /*
> @@ -1393,7 +1388,6 @@ static const struct net_device_ops virtnet_netdev = {
>   	.ndo_open            = virtnet_open,
>   	.ndo_stop   	     = virtnet_close,
>   	.ndo_start_xmit      = start_xmit,
> -	.ndo_xmit_flush      = xmit_flush,
>   	.ndo_validate_addr   = eth_validate_addr,
>   	.ndo_set_mac_address = virtnet_set_mac_address,
>   	.ndo_set_rx_mode     = virtnet_set_rx_mode,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 220c509..039b237 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -782,19 +782,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>    *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
>    *	Required can not be NULL.
>    *
> - * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue);
> - *	A driver implements this function when it wishes to support
> - *	deferred TX queue flushing.  The idea is that the expensive
> - *	operation to trigger TX queue processing can be done after
> - *	N calls to ndo_start_xmit rather than being done every single
> - *	time.  In this regime ndo_start_xmit will be called one or more
> - *	times, and then a final ndo_xmit_flush call will be made to
> - *	have the driver tell the device about the new pending TX queue
> - *	entries.  The kernel keeps track of which queues need flushing
> - *	by monitoring skb->queue_mapping of the packets it submits to
> - *	ndo_start_xmit.  This is the queue value that will be passed
> - *	to ndo_xmit_flush.
> - *
>    * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
>    *                         void *accel_priv, select_queue_fallback_t fallback);
>    *	Called to decide which queue to when device supports multiple
> @@ -1018,7 +1005,6 @@ struct net_device_ops {
>   	int			(*ndo_stop)(struct net_device *dev);
>   	netdev_tx_t		(*ndo_start_xmit) (struct sk_buff *skb,
>   						   struct net_device *dev);
> -	void			(*ndo_xmit_flush)(struct net_device *dev, u16 queue);
>   	u16			(*ndo_select_queue)(struct net_device *dev,
>   						    struct sk_buff *skb,
>   						    void *accel_priv,
> @@ -3447,15 +3433,8 @@ int __init dev_proc_init(void);
>   static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
>   					      struct sk_buff *skb, struct net_device *dev)
>   {
> -	netdev_tx_t ret;
> -	u16 q;
> -
> -	q = skb->queue_mapping;
> -	ret = ops->ndo_start_xmit(skb, dev);
> -	if (dev_xmit_complete(ret) && ops->ndo_xmit_flush)
> -		ops->ndo_xmit_flush(dev, q);
> -
> -	return ret;
> +	skb->xmit_more = 0;
> +	return ops->ndo_start_xmit(skb, dev);
>   }
>   
>   static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 18ddf96..9b3802a 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -452,6 +452,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
>    *	@tc_verd: traffic control verdict
>    *	@hash: the packet hash
>    *	@queue_mapping: Queue mapping for multiqueue devices
> + *	@xmit_more: More SKBs are pending for this queue
>    *	@ndisc_nodetype: router type (from link layer)
>    *	@ooo_okay: allow the mapping of a socket to a queue to be changed
>    *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
> @@ -558,6 +559,7 @@ struct sk_buff {
>   
>   	__u16			queue_mapping;
>   	kmemcheck_bitfield_begin(flags2);
> +	__u8			xmit_more:1;
>   #ifdef CONFIG_IPV6_NDISC_NODETYPE
>   	__u8			ndisc_nodetype:2;
>   #endif

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 1, 2015, 7 a.m. UTC | #4
From: yzhu1 <Yanjun.Zhu@windriver.com>
Date: Tue, 1 Sep 2015 14:46:38 +0800

> After I applied this patch, the skb->xmit_more is not always zero.

There have been thousands upon thousands of commits since that
change.

You should be testing the tree as it currently stands, to see
if xmit_more behaves correctly or not.

If xmit_more were incorrectly set to 1 in the current tree, it
would stall the TX queue of the networking device and we would
be seeing lots of reports of this.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
yzhu1 Sept. 1, 2015, 7:10 a.m. UTC | #5
On 09/01/2015 03:00 PM, David Miller wrote:
> From: yzhu1 <Yanjun.Zhu@windriver.com>
> Date: Tue, 1 Sep 2015 14:46:38 +0800
>
>> After I applied this patch, the skb->xmit_more is not always zero.
> There have been thousands upon thousands of commits since that
> change.
>
> You should be testing the tree as it currently stands, to see
> if xmit_more behaves correctly or not.
>
> If xmit_more were incorrectly set to 1 in the current tree, it
> would stall the TX queue of the networking device and we would
> be seeing lots of reports of this.
>
>
Thanks for your reply.
Yes. After running for several days, the following messages will appear.

igb 0000:09:00.0: Detected Tx Unit Hang
   Tx Queue <1>
   TDH <1a>
   TDT <1a>
   next_to_use <1d>
   next_to_clean <1a>
buffer_info[next_to_clean]
   time_stamp <ffffeb7d>
   next_to_watch <ffff88103ee711c0>
   jiffies <fffff324>
   desc.status <0>
igb 0000:09:00.0: Detected Tx Unit Hang
   Tx Queue <1>
   TDH <1a>
   TDT <1a>
   next_to_use <1d>
   next_to_clean <1a>
buffer_info[next_to_clean]
   time_stamp <ffffeb7d>
   next_to_watch <ffff88103ee711c0>
   jiffies <fffffaf4>
   desc.status <0>
igb 0000:09:00.0: Detected Tx Unit Hang
   Tx Queue <1>
   TDH <1a>
   TDT <1a>
   next_to_use <1d>
   next_to_clean <1a>
buffer_info[next_to_clean]
   time_stamp <ffffeb7d>
   next_to_watch <ffff88103ee711c0>
   jiffies <1000002c4>
   desc.status <0>
igb 0000:09:00.0: Detected Tx Unit Hang
   Tx Queue <1>
   TDH <1a>
   TDT <1a>
   next_to_use <1d>------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:264 
dev_watchdog+0x259/0x270()
NETDEV WATCHDOG: eth0 (igb): transmit queue 1 timed out
Modules linked in: x86_pkg_temp_thermal intel_powerclamp coretemp 
crct10dif_pclmul crct10dif_common aesni_intel aes_x86_64 glue_helper lrw 
gf128mul ablk_helper cryptd iTCO_wdt sb_edac iTCO_vendor_support ipmi_si 
edac_core i2c_i801 lpc_ich ipmi_msghandler nfsd fuse
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.29ltsi-WR7.0.0.0_standard #2
Hardware name: Intel Corporation S2600CP/S2600CP, BIOS 
RMLSDP.86I.R4.26.D674.1304190022 04/19/2013
  0000000000000009 ffff88081f603da0 ffffffff81ab9bb8 ffff88081f603de8
  ffff88081f603dd8 ffffffff8104c64d 0000000000000001 ffff880812f6d940
  0000000000000000 ffff880813efc000 0000000000000008 ffff88081f603e38
Call Trace:
  <IRQ> [<ffffffff81ab9bb8>] dump_stack+0x4e/0x7a
  [<ffffffff8104c64d>] warn_slowpath_common+0x7d/0xa0
  [<ffffffff8104c6bc>] warn_slowpath_fmt+0x4c/0x50
  [<ffffffff81ac09c7>] ? _raw_spin_unlock+0x17/0x30
  [<ffffffff81998659>] dev_watchdog+0x259/0x270
  [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
  [<ffffffff810594cb>] call_timer_fn+0x3b/0x170
  [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
  [<ffffffff81059d64>] run_timer_softirq+0x1c4/0x2d0
  [<ffffffff81051557>] __do_softirq+0xb7/0x2e0
  [<ffffffff810518be>] irq_exit+0x7e/0xa0
  [<ffffffff81acae74>] smp_apic_timer_interrupt+0x44/0x50
  [<ffffffff81ac9c4a>] apic_timer_interrupt+0x6a/0x70
  <EOI> [<ffffffff81880706>] ? cpuidle_enter_state+0x46/0xb0
  [<ffffffff8188082c>] cpuidle_idle_call+0xbc/0x250
  [<ffffffff8100cdce>] arch_cpu_idle+0xe/0x20
  [<ffffffff810a2bb5>] cpu_startup_entry+0x185/0x290
  [<ffffffff81ab4424>] rest_init+0x84/0x90
  [<ffffffff82333d50>] start_kernel+0x3d6/0x3e3
  [<ffffffff82333495>] x86_64_start_reservations+0x2a/0x2c
  [<ffffffff8233358e>] x86_64_start_kernel+0xf7/0xfa
---[ end trace 57ad9eaf9dd80dc2 ]---
igb 0000:09:00.0 eth0: Reset adapter
igb: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX
igb 0000:09:00.0: Detected Tx Unit Hang

   next_to_clean <1a>
buffer_info[next_to_clean]
   time_stamp <ffffeb7d>
   next_to_watch <ffff88103ee711c0>
   jiffies <100000a94>
   desc.status <0>



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 1, 2015, 7:13 a.m. UTC | #6
From: yzhu1 <Yanjun.Zhu@windriver.com>
Date: Tue, 1 Sep 2015 15:10:23 +0800

> After running for several days, the following messages will
> appear.

Then you need to figure out why the value is being set.

It is initialized to zero by every SKB allocation, and only
very specific controlled code paths set it non-zero.

Memory corruption is a very real possibility.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 1, 2015, 8:23 a.m. UTC | #7
On 09/01/2015 09:10 AM, yzhu1 wrote:
> On 09/01/2015 03:00 PM, David Miller wrote:
>> From: yzhu1 <Yanjun.Zhu@windriver.com>
>> Date: Tue, 1 Sep 2015 14:46:38 +0800
>>
>>> After I applied this patch, the skb->xmit_more is not always zero.
>> There have been thousands upon thousands of commits since that
>> change.
>>
>> You should be testing the tree as it currently stands, to see
>> if xmit_more behaves correctly or not.
>>
>> If xmit_more were incorrectly set to 1 in the current tree, it
>> would stall the TX queue of the networking device and we would
>> be seeing lots of reports of this.
>>
> Thanks for your reply.
> Yes. After running for several days, the following messages will appear.

Your below trace says 3.14.29ltsi-WR7.0.0.0 ...

As Dave said, please retest with something up to date, like 4.2 kernel,
or latest -net git tree.

Besides, the *upstream* xmit_more changes first went into 3.18 ...
nearest git describe is at:

   $ git describe 0b725a2ca61bedc33a2a63d0451d528b268cf975
   v3.17-rc1-251-g0b725a2

So, that only tells me, that you are reporting a possible bug based on
some non-upstream kernel ... ? Thus, it's not even possible to verify
if the actual backport was correct ?

> igb 0000:09:00.0: Detected Tx Unit Hang
>    Tx Queue <1>
>    TDH <1a>
>    TDT <1a>
>    next_to_use <1d>
>    next_to_clean <1a>
> buffer_info[next_to_clean]
>    time_stamp <ffffeb7d>
>    next_to_watch <ffff88103ee711c0>
>    jiffies <fffff324>
>    desc.status <0>
> igb 0000:09:00.0: Detected Tx Unit Hang
>    Tx Queue <1>
>    TDH <1a>
>    TDT <1a>
>    next_to_use <1d>
>    next_to_clean <1a>
> buffer_info[next_to_clean]
>    time_stamp <ffffeb7d>
>    next_to_watch <ffff88103ee711c0>
>    jiffies <fffffaf4>
>    desc.status <0>
> igb 0000:09:00.0: Detected Tx Unit Hang
>    Tx Queue <1>
>    TDH <1a>
>    TDT <1a>
>    next_to_use <1d>
>    next_to_clean <1a>
> buffer_info[next_to_clean]
>    time_stamp <ffffeb7d>
>    next_to_watch <ffff88103ee711c0>
>    jiffies <1000002c4>
>    desc.status <0>
> igb 0000:09:00.0: Detected Tx Unit Hang
>    Tx Queue <1>
>    TDH <1a>
>    TDT <1a>
>    next_to_use <1d>------------[ cut here ]------------
> WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:264 dev_watchdog+0x259/0x270()
> NETDEV WATCHDOG: eth0 (igb): transmit queue 1 timed out
> Modules linked in: x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crct10dif_common aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd iTCO_wdt sb_edac iTCO_vendor_support ipmi_si edac_core i2c_i801 lpc_ich ipmi_msghandler nfsd fuse
> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.29ltsi-WR7.0.0.0_standard #2
> Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.R4.26.D674.1304190022 04/19/2013
>   0000000000000009 ffff88081f603da0 ffffffff81ab9bb8 ffff88081f603de8
>   ffff88081f603dd8 ffffffff8104c64d 0000000000000001 ffff880812f6d940
>   0000000000000000 ffff880813efc000 0000000000000008 ffff88081f603e38
> Call Trace:
>   <IRQ> [<ffffffff81ab9bb8>] dump_stack+0x4e/0x7a
>   [<ffffffff8104c64d>] warn_slowpath_common+0x7d/0xa0
>   [<ffffffff8104c6bc>] warn_slowpath_fmt+0x4c/0x50
>   [<ffffffff81ac09c7>] ? _raw_spin_unlock+0x17/0x30
>   [<ffffffff81998659>] dev_watchdog+0x259/0x270
>   [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
>   [<ffffffff810594cb>] call_timer_fn+0x3b/0x170
>   [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
>   [<ffffffff81059d64>] run_timer_softirq+0x1c4/0x2d0
>   [<ffffffff81051557>] __do_softirq+0xb7/0x2e0
>   [<ffffffff810518be>] irq_exit+0x7e/0xa0
>   [<ffffffff81acae74>] smp_apic_timer_interrupt+0x44/0x50
>   [<ffffffff81ac9c4a>] apic_timer_interrupt+0x6a/0x70
>   <EOI> [<ffffffff81880706>] ? cpuidle_enter_state+0x46/0xb0
>   [<ffffffff8188082c>] cpuidle_idle_call+0xbc/0x250
>   [<ffffffff8100cdce>] arch_cpu_idle+0xe/0x20
>   [<ffffffff810a2bb5>] cpu_startup_entry+0x185/0x290
>   [<ffffffff81ab4424>] rest_init+0x84/0x90
>   [<ffffffff82333d50>] start_kernel+0x3d6/0x3e3
>   [<ffffffff82333495>] x86_64_start_reservations+0x2a/0x2c
>   [<ffffffff8233358e>] x86_64_start_kernel+0xf7/0xfa
> ---[ end trace 57ad9eaf9dd80dc2 ]---
> igb 0000:09:00.0 eth0: Reset adapter
> igb: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX
> igb 0000:09:00.0: Detected Tx Unit Hang
>
>    next_to_clean <1a>
> buffer_info[next_to_clean]
>    time_stamp <ffffeb7d>
>    next_to_watch <ffff88103ee711c0>
>    jiffies <100000a94>
>    desc.status <0>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
yzhu1 Sept. 1, 2015, 9:21 a.m. UTC | #8
On 09/01/2015 04:23 PM, Daniel Borkmann wrote:
> On 09/01/2015 09:10 AM, yzhu1 wrote:
>> On 09/01/2015 03:00 PM, David Miller wrote:
>>> From: yzhu1 <Yanjun.Zhu@windriver.com>
>>> Date: Tue, 1 Sep 2015 14:46:38 +0800
>>>
>>>> After I applied this patch, the skb->xmit_more is not always zero.
>>> There have been thousands upon thousands of commits since that
>>> change.
>>>
>>> You should be testing the tree as it currently stands, to see
>>> if xmit_more behaves correctly or not.
>>>
>>> If xmit_more were incorrectly set to 1 in the current tree, it
>>> would stall the TX queue of the networking device and we would
>>> be seeing lots of reports of this.
>>>
>> Thanks for your reply.
>> Yes. After running for several days, the following messages will appear.
>
> Your below trace says 3.14.29ltsi-WR7.0.0.0 ...
>
> As Dave said, please retest with something up to date, like 4.2 kernel,
> or latest -net git tree.
>
> Besides, the *upstream* xmit_more changes first went into 3.18 ...
> nearest git describe is at:
>
>   $ git describe 0b725a2ca61bedc33a2a63d0451d528b268cf975
>   v3.17-rc1-251-g0b725a2
>
> So, that only tells me, that you are reporting a possible bug based on
> some non-upstream kernel ... ? Thus, it's not even possible to verify
> if the actual backport was correct ?

Sorry. There is something wrong with backporting this patch.

Thanks for your help.

Zhu Yanjun

>
>> igb 0000:09:00.0: Detected Tx Unit Hang
>>    Tx Queue <1>
>>    TDH <1a>
>>    TDT <1a>
>>    next_to_use <1d>
>>    next_to_clean <1a>
>> buffer_info[next_to_clean]
>>    time_stamp <ffffeb7d>
>>    next_to_watch <ffff88103ee711c0>
>>    jiffies <fffff324>
>>    desc.status <0>
>> igb 0000:09:00.0: Detected Tx Unit Hang
>>    Tx Queue <1>
>>    TDH <1a>
>>    TDT <1a>
>>    next_to_use <1d>
>>    next_to_clean <1a>
>> buffer_info[next_to_clean]
>>    time_stamp <ffffeb7d>
>>    next_to_watch <ffff88103ee711c0>
>>    jiffies <fffffaf4>
>>    desc.status <0>
>> igb 0000:09:00.0: Detected Tx Unit Hang
>>    Tx Queue <1>
>>    TDH <1a>
>>    TDT <1a>
>>    next_to_use <1d>
>>    next_to_clean <1a>
>> buffer_info[next_to_clean]
>>    time_stamp <ffffeb7d>
>>    next_to_watch <ffff88103ee711c0>
>>    jiffies <1000002c4>
>>    desc.status <0>
>> igb 0000:09:00.0: Detected Tx Unit Hang
>>    Tx Queue <1>
>>    TDH <1a>
>>    TDT <1a>
>>    next_to_use <1d>------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:264 
>> dev_watchdog+0x259/0x270()
>> NETDEV WATCHDOG: eth0 (igb): transmit queue 1 timed out
>> Modules linked in: x86_pkg_temp_thermal intel_powerclamp coretemp 
>> crct10dif_pclmul crct10dif_common aesni_intel aes_x86_64 glue_helper 
>> lrw gf128mul ablk_helper cryptd iTCO_wdt sb_edac iTCO_vendor_support 
>> ipmi_si edac_core i2c_i801 lpc_ich ipmi_msghandler nfsd fuse
>> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
>> 3.14.29ltsi-WR7.0.0.0_standard #2
>> Hardware name: Intel Corporation S2600CP/S2600CP, BIOS 
>> RMLSDP.86I.R4.26.D674.1304190022 04/19/2013
>>   0000000000000009 ffff88081f603da0 ffffffff81ab9bb8 ffff88081f603de8
>>   ffff88081f603dd8 ffffffff8104c64d 0000000000000001 ffff880812f6d940
>>   0000000000000000 ffff880813efc000 0000000000000008 ffff88081f603e38
>> Call Trace:
>>   <IRQ> [<ffffffff81ab9bb8>] dump_stack+0x4e/0x7a
>>   [<ffffffff8104c64d>] warn_slowpath_common+0x7d/0xa0
>>   [<ffffffff8104c6bc>] warn_slowpath_fmt+0x4c/0x50
>>   [<ffffffff81ac09c7>] ? _raw_spin_unlock+0x17/0x30
>>   [<ffffffff81998659>] dev_watchdog+0x259/0x270
>>   [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
>>   [<ffffffff810594cb>] call_timer_fn+0x3b/0x170
>>   [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
>>   [<ffffffff81059d64>] run_timer_softirq+0x1c4/0x2d0
>>   [<ffffffff81051557>] __do_softirq+0xb7/0x2e0
>>   [<ffffffff810518be>] irq_exit+0x7e/0xa0
>>   [<ffffffff81acae74>] smp_apic_timer_interrupt+0x44/0x50
>>   [<ffffffff81ac9c4a>] apic_timer_interrupt+0x6a/0x70
>>   <EOI> [<ffffffff81880706>] ? cpuidle_enter_state+0x46/0xb0
>>   [<ffffffff8188082c>] cpuidle_idle_call+0xbc/0x250
>>   [<ffffffff8100cdce>] arch_cpu_idle+0xe/0x20
>>   [<ffffffff810a2bb5>] cpu_startup_entry+0x185/0x290
>>   [<ffffffff81ab4424>] rest_init+0x84/0x90
>>   [<ffffffff82333d50>] start_kernel+0x3d6/0x3e3
>>   [<ffffffff82333495>] x86_64_start_reservations+0x2a/0x2c
>>   [<ffffffff8233358e>] x86_64_start_kernel+0xf7/0xfa
>> ---[ end trace 57ad9eaf9dd80dc2 ]---
>> igb 0000:09:00.0 eth0: Reset adapter
>> igb: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX
>> igb 0000:09:00.0: Detected Tx Unit Hang
>>
>>    next_to_clean <1a>
>> buffer_info[next_to_clean]
>>    time_stamp <ffffeb7d>
>>    next_to_watch <ffff88103ee711c0>
>>    jiffies <100000a94>
>>    desc.status <0>
>>
>>
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck Sept. 1, 2015, 4:22 p.m. UTC | #9
On 09/01/2015 02:21 AM, yzhu1 wrote:
> On 09/01/2015 04:23 PM, Daniel Borkmann wrote:
>> On 09/01/2015 09:10 AM, yzhu1 wrote:
>>> On 09/01/2015 03:00 PM, David Miller wrote:
>>>> From: yzhu1 <Yanjun.Zhu@windriver.com>
>>>> Date: Tue, 1 Sep 2015 14:46:38 +0800
>>>>
>>>>> After I applied this patch, the skb->xmit_more is not always zero.
>>>> There have been thousands upon thousands of commits since that
>>>> change.
>>>>
>>>> You should be testing the tree as it currently stands, to see
>>>> if xmit_more behaves correctly or not.
>>>>
>>>> If xmit_more were incorrectly set to 1 in the current tree, it
>>>> would stall the TX queue of the networking device and we would
>>>> be seeing lots of reports of this.
>>>>
>>> Thanks for your reply.
>>> Yes. After running for several days, the following messages will 
>>> appear.
>>
>> Your below trace says 3.14.29ltsi-WR7.0.0.0 ...
>>
>> As Dave said, please retest with something up to date, like 4.2 kernel,
>> or latest -net git tree.
>>
>> Besides, the *upstream* xmit_more changes first went into 3.18 ...
>> nearest git describe is at:
>>
>>   $ git describe 0b725a2ca61bedc33a2a63d0451d528b268cf975
>>   v3.17-rc1-251-g0b725a2
>>
>> So, that only tells me, that you are reporting a possible bug based on
>> some non-upstream kernel ... ? Thus, it's not even possible to verify
>> if the actual backport was correct ?
>
> Sorry. There is something wrong with backporting this patch.
>
> Thanks for your help.
>
> Zhu Yanjun

You may very well be missing the code that forced the tail to write if 
the Tx descriptor ring was full.  Double check your igb driver code and 
compare it to the upstream kernel.  You should double check and verify 
that you backported commit 6f19e12f62306 "igb: flush when in xmit_more 
mode and under descriptor pressure".

You need to have both checked before you skip writing the next 
descriptor to the ring.  The original code only checked one and this can 
result in Tx hangs if the ring fills without ever notifying the hardware.

- Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Gortmaker Sept. 1, 2015, 4:49 p.m. UTC | #10
On Tue, Sep 1, 2015 at 5:21 AM, yzhu1 <Yanjun.Zhu@windriver.com> wrote:
> On 09/01/2015 04:23 PM, Daniel Borkmann wrote:
>>

[...]

>>
>> As Dave said, please retest with something up to date, like 4.2 kernel,
>> or latest -net git tree.
>>
>> Besides, the *upstream* xmit_more changes first went into 3.18 ...
>> nearest git describe is at:
>>
>>   $ git describe 0b725a2ca61bedc33a2a63d0451d528b268cf975
>>   v3.17-rc1-251-g0b725a2
>>
>> So, that only tells me, that you are reporting a possible bug based on
>> some non-upstream kernel ... ? Thus, it's not even possible to verify
>> if the actual backport was correct ?
>
>
> Sorry. There is something wrong with backporting this patch.

Please note for the future that this was a completely unacceptable
abuse of the upstream maintainers time by throwing this issue out
there like you did.   These folks all operate assuming everyone
is working on mainline master, and have no way to know what
random commits are in a custom tree -- especially if it isn't even
clear up front that it was a custom tree.   By not mentioning that,
you implicitly are tricking the maintainers into helping you, even
if that was not your actual intent.

If you see a problem, you need to ensure the problem exists on
the latest mainline, and then you can also possibly investigate
the specific commit at where it was added in mainline, to help
better understand the issue.  If you haven't done that, then you
can't be bothering the maintainers like this.  Please ensure this
is well understood by yourself and any others who might also
do the same by accident in the future.

Thanks,
Paul.
--

>
> Thanks for your help.
>
> Zhu Yanjun
>
>
>>
>>> igb 0000:09:00.0: Detected Tx Unit Hang
>>>    Tx Queue <1>
>>>    TDH <1a>
>>>    TDT <1a>
>>>    next_to_use <1d>
>>>    next_to_clean <1a>
>>> buffer_info[next_to_clean]
>>>    time_stamp <ffffeb7d>
>>>    next_to_watch <ffff88103ee711c0>
>>>    jiffies <fffff324>
>>>    desc.status <0>
>>> igb 0000:09:00.0: Detected Tx Unit Hang
>>>    Tx Queue <1>
>>>    TDH <1a>
>>>    TDT <1a>
>>>    next_to_use <1d>
>>>    next_to_clean <1a>
>>> buffer_info[next_to_clean]
>>>    time_stamp <ffffeb7d>
>>>    next_to_watch <ffff88103ee711c0>
>>>    jiffies <fffffaf4>
>>>    desc.status <0>
>>> igb 0000:09:00.0: Detected Tx Unit Hang
>>>    Tx Queue <1>
>>>    TDH <1a>
>>>    TDT <1a>
>>>    next_to_use <1d>
>>>    next_to_clean <1a>
>>> buffer_info[next_to_clean]
>>>    time_stamp <ffffeb7d>
>>>    next_to_watch <ffff88103ee711c0>
>>>    jiffies <1000002c4>
>>>    desc.status <0>
>>> igb 0000:09:00.0: Detected Tx Unit Hang
>>>    Tx Queue <1>
>>>    TDH <1a>
>>>    TDT <1a>
>>>    next_to_use <1d>------------[ cut here ]------------
>>> WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:264
>>> dev_watchdog+0x259/0x270()
>>> NETDEV WATCHDOG: eth0 (igb): transmit queue 1 timed out
>>> Modules linked in: x86_pkg_temp_thermal intel_powerclamp coretemp
>>> crct10dif_pclmul crct10dif_common aesni_intel aes_x86_64 glue_helper lrw
>>> gf128mul ablk_helper cryptd iTCO_wdt sb_edac iTCO_vendor_support ipmi_si
>>> edac_core i2c_i801 lpc_ich ipmi_msghandler nfsd fuse
>>> CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.29ltsi-WR7.0.0.0_standard
>>> #2
>>> Hardware name: Intel Corporation S2600CP/S2600CP, BIOS
>>> RMLSDP.86I.R4.26.D674.1304190022 04/19/2013
>>>   0000000000000009 ffff88081f603da0 ffffffff81ab9bb8 ffff88081f603de8
>>>   ffff88081f603dd8 ffffffff8104c64d 0000000000000001 ffff880812f6d940
>>>   0000000000000000 ffff880813efc000 0000000000000008 ffff88081f603e38
>>> Call Trace:
>>>   <IRQ> [<ffffffff81ab9bb8>] dump_stack+0x4e/0x7a
>>>   [<ffffffff8104c64d>] warn_slowpath_common+0x7d/0xa0
>>>   [<ffffffff8104c6bc>] warn_slowpath_fmt+0x4c/0x50
>>>   [<ffffffff81ac09c7>] ? _raw_spin_unlock+0x17/0x30
>>>   [<ffffffff81998659>] dev_watchdog+0x259/0x270
>>>   [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
>>>   [<ffffffff810594cb>] call_timer_fn+0x3b/0x170
>>>   [<ffffffff81998400>] ? dev_graft_qdisc+0x80/0x80
>>>   [<ffffffff81059d64>] run_timer_softirq+0x1c4/0x2d0
>>>   [<ffffffff81051557>] __do_softirq+0xb7/0x2e0
>>>   [<ffffffff810518be>] irq_exit+0x7e/0xa0
>>>   [<ffffffff81acae74>] smp_apic_timer_interrupt+0x44/0x50
>>>   [<ffffffff81ac9c4a>] apic_timer_interrupt+0x6a/0x70
>>>   <EOI> [<ffffffff81880706>] ? cpuidle_enter_state+0x46/0xb0
>>>   [<ffffffff8188082c>] cpuidle_idle_call+0xbc/0x250
>>>   [<ffffffff8100cdce>] arch_cpu_idle+0xe/0x20
>>>   [<ffffffff810a2bb5>] cpu_startup_entry+0x185/0x290
>>>   [<ffffffff81ab4424>] rest_init+0x84/0x90
>>>   [<ffffffff82333d50>] start_kernel+0x3d6/0x3e3
>>>   [<ffffffff82333495>] x86_64_start_reservations+0x2a/0x2c
>>>   [<ffffffff8233358e>] x86_64_start_kernel+0xf7/0xfa
>>> ---[ end trace 57ad9eaf9dd80dc2 ]---
>>> igb 0000:09:00.0 eth0: Reset adapter
>>> igb: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX
>>> igb 0000:09:00.0: Detected Tx Unit Hang
>>>
>>>    next_to_clean <1a>
>>> buffer_info[next_to_clean]
>>>    time_stamp <ffffeb7d>
>>>    next_to_watch <ffff88103ee711c0>
>>>    jiffies <100000a94>
>>>    desc.status <0>
>>>
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index b9c020a..89c29b4 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -136,7 +136,6 @@  static void igb_update_phy_info(unsigned long);
 static void igb_watchdog(unsigned long);
 static void igb_watchdog_task(struct work_struct *);
 static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
-static void igb_xmit_flush(struct net_device *netdev, u16 queue);
 static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev,
 					  struct rtnl_link_stats64 *stats);
 static int igb_change_mtu(struct net_device *, int);
@@ -2076,7 +2075,6 @@  static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
 	.ndo_start_xmit		= igb_xmit_frame,
-	.ndo_xmit_flush		= igb_xmit_flush,
 	.ndo_get_stats64	= igb_get_stats64,
 	.ndo_set_rx_mode	= igb_set_rx_mode,
 	.ndo_set_mac_address	= igb_set_mac,
@@ -4917,6 +4915,14 @@  static void igb_tx_map(struct igb_ring *tx_ring,
 
 	tx_ring->next_to_use = i;
 
+	if (!skb->xmit_more) {
+		writel(i, tx_ring->tail);
+
+		/* we need this if more than one processor can write to our tail
+		 * at a time, it synchronizes IO on IA64/Altix systems
+		 */
+		mmiowb();
+	}
 	return;
 
 dma_error:
@@ -5052,20 +5058,17 @@  out_drop:
 	return NETDEV_TX_OK;
 }
 
-static struct igb_ring *__igb_tx_queue_mapping(struct igb_adapter *adapter, unsigned int r_idx)
+static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
+						    struct sk_buff *skb)
 {
+	unsigned int r_idx = skb->queue_mapping;
+
 	if (r_idx >= adapter->num_tx_queues)
 		r_idx = r_idx % adapter->num_tx_queues;
 
 	return adapter->tx_ring[r_idx];
 }
 
-static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
-						    struct sk_buff *skb)
-{
-	return __igb_tx_queue_mapping(adapter, skb->queue_mapping);
-}
-
 static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
 				  struct net_device *netdev)
 {
@@ -5094,21 +5097,6 @@  static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
 	return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
 }
 
-static void igb_xmit_flush(struct net_device *netdev, u16 queue)
-{
-	struct igb_adapter *adapter = netdev_priv(netdev);
-	struct igb_ring *tx_ring;
-
-	tx_ring = __igb_tx_queue_mapping(adapter, queue);
-
-	writel(tx_ring->next_to_use, tx_ring->tail);
-
-	/* we need this if more than one processor can write to our tail
-	 * at a time, it synchronizes IO on IA64/Altix systems
-	 */
-	mmiowb();
-}
-
 /**
  *  igb_tx_timeout - Respond to a Tx Hang
  *  @netdev: network interface device structure
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6242108..f0c2824 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -953,15 +953,10 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 
-	return NETDEV_TX_OK;
-}
+	if (!skb->xmit_more)
+		virtqueue_kick(sq->vq);
 
-static void xmit_flush(struct net_device *dev, u16 qnum)
-{
-	struct virtnet_info *vi = netdev_priv(dev);
-	struct send_queue *sq = &vi->sq[qnum];
-
-	virtqueue_kick(sq->vq);
+	return NETDEV_TX_OK;
 }
 
 /*
@@ -1393,7 +1388,6 @@  static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
 	.ndo_start_xmit      = start_xmit,
-	.ndo_xmit_flush      = xmit_flush,
 	.ndo_validate_addr   = eth_validate_addr,
 	.ndo_set_mac_address = virtnet_set_mac_address,
 	.ndo_set_rx_mode     = virtnet_set_rx_mode,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 220c509..039b237 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -782,19 +782,6 @@  typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
  *	Required can not be NULL.
  *
- * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue);
- *	A driver implements this function when it wishes to support
- *	deferred TX queue flushing.  The idea is that the expensive
- *	operation to trigger TX queue processing can be done after
- *	N calls to ndo_start_xmit rather than being done every single
- *	time.  In this regime ndo_start_xmit will be called one or more
- *	times, and then a final ndo_xmit_flush call will be made to
- *	have the driver tell the device about the new pending TX queue
- *	entries.  The kernel keeps track of which queues need flushing
- *	by monitoring skb->queue_mapping of the packets it submits to
- *	ndo_start_xmit.  This is the queue value that will be passed
- *	to ndo_xmit_flush.
- *
  * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
  *                         void *accel_priv, select_queue_fallback_t fallback);
  *	Called to decide which queue to when device supports multiple
@@ -1018,7 +1005,6 @@  struct net_device_ops {
 	int			(*ndo_stop)(struct net_device *dev);
 	netdev_tx_t		(*ndo_start_xmit) (struct sk_buff *skb,
 						   struct net_device *dev);
-	void			(*ndo_xmit_flush)(struct net_device *dev, u16 queue);
 	u16			(*ndo_select_queue)(struct net_device *dev,
 						    struct sk_buff *skb,
 						    void *accel_priv,
@@ -3447,15 +3433,8 @@  int __init dev_proc_init(void);
 static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
 					      struct sk_buff *skb, struct net_device *dev)
 {
-	netdev_tx_t ret;
-	u16 q;
-
-	q = skb->queue_mapping;
-	ret = ops->ndo_start_xmit(skb, dev);
-	if (dev_xmit_complete(ret) && ops->ndo_xmit_flush)
-		ops->ndo_xmit_flush(dev, q);
-
-	return ret;
+	skb->xmit_more = 0;
+	return ops->ndo_start_xmit(skb, dev);
 }
 
 static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 18ddf96..9b3802a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -452,6 +452,7 @@  static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
  *	@tc_verd: traffic control verdict
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
+ *	@xmit_more: More SKBs are pending for this queue
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -558,6 +559,7 @@  struct sk_buff {
 
 	__u16			queue_mapping;
 	kmemcheck_bitfield_begin(flags2);
+	__u8			xmit_more:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif