diff mbox

[net-next,S4,13/15] i40e/i40evf: force inline transmit functions

Message ID 1429229172-143692-14-git-send-email-catherine.sullivan@intel.com
State Accepted
Delegated to: Jeff Kirsher
Headers show

Commit Message

Catherine Sullivan April 17, 2015, 12:06 a.m. UTC
From: Jesse Brandeburg <jesse.brandeburg@intel.com>

Inlining these functions gives us about 15% more 64 byte packets per
second when using pktgen. 13.3 million to 15 million with a single
queue.

Also fix the function names in i40evf to i40evf not i40e while we are
touching the function header.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Catherine Sullivan <catherine.sullivan@intel.com>
Change-ID: I3294ae9b085cf438672b6db5f9af122490ead9d0
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 32 ++++++++++++------------
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 36 ++++++++++++---------------
 2 files changed, 32 insertions(+), 36 deletions(-)

Comments

James Young May 5, 2015, 4:12 p.m. UTC | #1
> -----Original Message-----
> From: Intel-wired-lan [mailto:intel-wired-lan-bounces@lists.osuosl.org] On
> Behalf Of Catherine Sullivan
> Sent: Thursday, April 16, 2015 5:06 PM
> To: intel-wired-lan@lists.osuosl.org
> Subject: [Intel-wired-lan] [net-next S4 13/15] i40e/i40evf: force inline
> transmit functions
> 
> From: Jesse Brandeburg <jesse.brandeburg@intel.com>
> 
> Inlining these functions gives us about 15% more 64 byte packets per second
> when using pktgen. 13.3 million to 15 million with a single queue.
> 
> Also fix the function names in i40evf to i40evf not i40e while we are touching
> the function header.
> 
> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Signed-off-by: Catherine Sullivan <catherine.sullivan@intel.com>
> Change-ID: I3294ae9b085cf438672b6db5f9af122490ead9d0
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 32 ++++++++++++------------
>  drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 36 ++++++++++++------------
> ---
>  2 files changed, 32 insertions(+), 36 deletions(-)

Tested-by: Jim Young <james.m.young@intel.com>
Alexander Duyck May 5, 2015, 4:55 p.m. UTC | #2
On 04/16/2015 05:06 PM, Catherine Sullivan wrote:
> From: Jesse Brandeburg <jesse.brandeburg@intel.com>
>
> Inlining these functions gives us about 15% more 64 byte packets per
> second when using pktgen. 13.3 million to 15 million with a single
> queue.
>
> Also fix the function names in i40evf to i40evf not i40e while we are
> touching the function header.
>
> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Signed-off-by: Catherine Sullivan <catherine.sullivan@intel.com>
> Change-ID: I3294ae9b085cf438672b6db5f9af122490ead9d0
> ---

The code below is ugly.  If you are going to inline it anyway you might 
as well just move the inline functions to a header file so that they can 
all be static inline.  Also what is the difference in size between the 
before/after with FCoE for this part enabled?  It would be useful to 
have that info in the patch description.  I suspect the change is fairly 
significant since you are doubling the map function which is fairly sizable.

I suspect most of these changes are only showing gains in the FCoE 
case.  Really you guys should look at reachitecting things rather than 
bloating the driver by doubling up the xmit path.  You could probably do 
a quick check against the netdev and then just call some FCoE specific 
offload path before getting back to i40e_tx_map.

Comments inline below.

- Alex

>   drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 32 ++++++++++++------------
>   drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 36 ++++++++++++---------------
>   2 files changed, 32 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index d6ad42b..ca21f9a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -2065,13 +2065,13 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
>    * otherwise  returns 0 to indicate the flags has been set properly.
>    **/
>   #ifdef I40E_FCOE
> -int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
> -			       struct i40e_ring *tx_ring,
> -			       u32 *flags)
> -#else
> -static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
> +inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
>   				      struct i40e_ring *tx_ring,
>   				      u32 *flags)
> +#else
> +static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
> +					     struct i40e_ring *tx_ring,
> +					     u32 *flags)
>   #endif
>   {
>   	__be16 protocol = skb->protocol;
> @@ -2414,9 +2414,9 @@ static inline int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
>    * Returns 0 if stop is not needed
>    **/
>   #ifdef I40E_FCOE
> -int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
> +inline int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
>   #else
> -static int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
> +static inline int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
>   #endif
>   {
>   	if (likely(I40E_DESC_UNUSED(tx_ring) >= size))
> @@ -2496,13 +2496,13 @@ linearize_chk_done:
>    * @td_offset: offset for checksum or crc
>    **/
>   #ifdef I40E_FCOE
> -void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
> -		 struct i40e_tx_buffer *first, u32 tx_flags,
> -		 const u8 hdr_len, u32 td_cmd, u32 td_offset)
> -#else
> -static void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
> +inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>   			struct i40e_tx_buffer *first, u32 tx_flags,
>   			const u8 hdr_len, u32 td_cmd, u32 td_offset)
> +#else
> +static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
> +			       struct i40e_tx_buffer *first, u32 tx_flags,
> +			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
>   #endif
>   {
>   	unsigned int data_len = skb->data_len;

Inlining i40e_tx_map is kind of overkill isn't it?  Isn't this most of 
the code for the xmit path?

> @@ -2663,11 +2663,11 @@ dma_error:
>    * one descriptor.
>    **/
>   #ifdef I40E_FCOE
> -int i40e_xmit_descriptor_count(struct sk_buff *skb,
> -			       struct i40e_ring *tx_ring)
> -#else
> -static int i40e_xmit_descriptor_count(struct sk_buff *skb,
> +inline int i40e_xmit_descriptor_count(struct sk_buff *skb,
>   				      struct i40e_ring *tx_ring)
> +#else
> +static inline int i40e_xmit_descriptor_count(struct sk_buff *skb,
> +					     struct i40e_ring *tx_ring)
>   #endif
>   {
>   	unsigned int f;

So the VF code is just a copy of this w/ some renaming so I dropped it.  
I'm assuming the actual gains are none for this patch on the VF, do I 
have that correct?  I'm assuming the compiler is smart enough to realize 
it can inline static functions that are only called once.
diff mbox

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index d6ad42b..ca21f9a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2065,13 +2065,13 @@  static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
  * otherwise  returns 0 to indicate the flags has been set properly.
  **/
 #ifdef I40E_FCOE
-int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
-			       struct i40e_ring *tx_ring,
-			       u32 *flags)
-#else
-static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
+inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
 				      struct i40e_ring *tx_ring,
 				      u32 *flags)
+#else
+static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
+					     struct i40e_ring *tx_ring,
+					     u32 *flags)
 #endif
 {
 	__be16 protocol = skb->protocol;
@@ -2414,9 +2414,9 @@  static inline int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
  * Returns 0 if stop is not needed
  **/
 #ifdef I40E_FCOE
-int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
+inline int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
 #else
-static int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
+static inline int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
 #endif
 {
 	if (likely(I40E_DESC_UNUSED(tx_ring) >= size))
@@ -2496,13 +2496,13 @@  linearize_chk_done:
  * @td_offset: offset for checksum or crc
  **/
 #ifdef I40E_FCOE
-void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
-		 struct i40e_tx_buffer *first, u32 tx_flags,
-		 const u8 hdr_len, u32 td_cmd, u32 td_offset)
-#else
-static void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
+inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 			struct i40e_tx_buffer *first, u32 tx_flags,
 			const u8 hdr_len, u32 td_cmd, u32 td_offset)
+#else
+static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
+			       struct i40e_tx_buffer *first, u32 tx_flags,
+			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
 #endif
 {
 	unsigned int data_len = skb->data_len;
@@ -2663,11 +2663,11 @@  dma_error:
  * one descriptor.
  **/
 #ifdef I40E_FCOE
-int i40e_xmit_descriptor_count(struct sk_buff *skb,
-			       struct i40e_ring *tx_ring)
-#else
-static int i40e_xmit_descriptor_count(struct sk_buff *skb,
+inline int i40e_xmit_descriptor_count(struct sk_buff *skb,
 				      struct i40e_ring *tx_ring)
+#else
+static inline int i40e_xmit_descriptor_count(struct sk_buff *skb,
+					     struct i40e_ring *tx_ring)
 #endif
 {
 	unsigned int f;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 4f77d38..fa2458b 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1349,7 +1349,7 @@  int i40evf_napi_poll(struct napi_struct *napi, int budget)
 }
 
 /**
- * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
+ * i40evf_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
  * @skb:     send buffer
  * @tx_ring: ring to send buffer on
  * @flags:   the tx flags to be set
@@ -1360,9 +1360,9 @@  int i40evf_napi_poll(struct napi_struct *napi, int budget)
  * Returns error code indicate the frame should be dropped upon error and the
  * otherwise  returns 0 to indicate the flags has been set properly.
  **/
-static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
-				      struct i40e_ring *tx_ring,
-				      u32 *flags)
+static inline int i40evf_tx_prepare_vlan_flags(struct sk_buff *skb,
+					       struct i40e_ring *tx_ring,
+					       u32 *flags)
 {
 	__be16 protocol = skb->protocol;
 	u32  tx_flags = 0;
@@ -1701,11 +1701,7 @@  static inline int __i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
  *
  * Returns 0 if stop is not needed
  **/
-#ifdef I40E_FCOE
-int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
-#else
-static int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
-#endif
+static inline int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
 {
 	if (likely(I40E_DESC_UNUSED(tx_ring) >= size))
 		return 0;
@@ -1713,7 +1709,7 @@  static int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
 }
 
 /**
- * i40e_tx_map - Build the Tx descriptor
+ * i40evf_tx_map - Build the Tx descriptor
  * @tx_ring:  ring to send buffer on
  * @skb:      send buffer
  * @first:    first buffer info buffer to use
@@ -1722,9 +1718,9 @@  static int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
  * @td_cmd:   the command field in the descriptor
  * @td_offset: offset for checksum or crc
  **/
-static void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
-			struct i40e_tx_buffer *first, u32 tx_flags,
-			const u8 hdr_len, u32 td_cmd, u32 td_offset)
+static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
+				 struct i40e_tx_buffer *first, u32 tx_flags,
+				 const u8 hdr_len, u32 td_cmd, u32 td_offset)
 {
 	unsigned int data_len = skb->data_len;
 	unsigned int size = skb_headlen(skb);
@@ -1876,7 +1872,7 @@  dma_error:
 }
 
 /**
- * i40e_xmit_descriptor_count - calculate number of tx descriptors needed
+ * i40evf_xmit_descriptor_count - calculate number of tx descriptors needed
  * @skb:     send buffer
  * @tx_ring: ring to send buffer on
  *
@@ -1884,8 +1880,8 @@  dma_error:
  * there is not enough descriptors available in this ring since we need at least
  * one descriptor.
  **/
-static int i40e_xmit_descriptor_count(struct sk_buff *skb,
-				      struct i40e_ring *tx_ring)
+static inline int i40evf_xmit_descriptor_count(struct sk_buff *skb,
+					       struct i40e_ring *tx_ring)
 {
 	unsigned int f;
 	int count = 0;
@@ -1926,11 +1922,11 @@  static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
 	u32 td_cmd = 0;
 	u8 hdr_len = 0;
 	int tso;
-	if (0 == i40e_xmit_descriptor_count(skb, tx_ring))
+	if (0 == i40evf_xmit_descriptor_count(skb, tx_ring))
 		return NETDEV_TX_BUSY;
 
 	/* prepare the xmit flags */
-	if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
+	if (i40evf_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
 		goto out_drop;
 
 	/* obtain protocol of skb */
@@ -1973,8 +1969,8 @@  static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
 	i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
 			   cd_tunneling, cd_l2tag2);
 
-	i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
-		    td_cmd, td_offset);
+	i40evf_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
+		      td_cmd, td_offset);
 
 	return NETDEV_TX_OK;