diff mbox

[net-next,2/5] gianfar: Cleanup and optimize struct gfar_private

Message ID 1360673237-349-2-git-send-email-claudiu.manoil@freescale.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Claudiu Manoil Feb. 12, 2013, 12:47 p.m. UTC
* group run-time critical fields within the 1st cacheline
  followed by the tx|rx_queue reference arrays and the interrupt
  group instances (gfargrp) (all cacheline aligned)
* change 'padding' from unsigned short to u16
* clear 20+ byte memory hole
* group releated members
* push non-critical fields at the end of the struct

Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com>
---
 drivers/net/ethernet/freescale/gianfar.h |   93 +++++++++++++++--------------
 1 files changed, 48 insertions(+), 45 deletions(-)

Comments

Paul Gortmaker Feb. 12, 2013, 3:11 p.m. UTC | #1
On 13-02-12 07:47 AM, Claudiu Manoil wrote:
> * group run-time critical fields within the 1st cacheline
>   followed by the tx|rx_queue reference arrays and the interrupt
>   group instances (gfargrp) (all cacheline aligned)
> * change 'padding' from unsigned short to u16
> * clear 20+ byte memory hole

Per prev. mail, it gets harder to see which change is where,
when they are all lumped together like this.  For example, it
wasn't obvious to me where the 20 byte hole was.  Also, it
doesn't look like you changed the padding, but rather instead
totally re-purposed it, leaving no alignment padding after the
uchar bitfields (where it was originally).

P.
--

> * group releated members
> * push non-critical fields at the end of the struct
> 
> Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com>
> ---
>  drivers/net/ethernet/freescale/gianfar.h |   93 +++++++++++++++--------------
>  1 files changed, 48 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h
> index 22c2f7a..5304a58 100644
> --- a/drivers/net/ethernet/freescale/gianfar.h
> +++ b/drivers/net/ethernet/freescale/gianfar.h
> @@ -1054,28 +1054,64 @@ enum gfar_errata {
>   * the buffer descriptor determines the actual condition.
>   */
>  struct gfar_private {
> -
> -	/* Indicates how many tx, rx queues are enabled */
> -	unsigned int num_tx_queues;
>  	unsigned int num_rx_queues;
> -	unsigned int num_grps;
> -	unsigned int mode;
> -
> -	/* The total tx and rx ring size for the enabled queues */
> -	unsigned int total_tx_ring_size;
> -	unsigned int total_rx_ring_size;
>  
>  	struct device *dev;
>  	struct net_device *ndev;
> -	struct platform_device *ofdev;
>  	enum gfar_errata errata;
> +	unsigned int rx_buffer_size;
> +
> +	u16 padding;
> +
> +	/* HW time stamping enabled flag */
> +	int hwts_rx_en;
> +	int hwts_tx_en;
>  
> -	struct gfar_priv_grp gfargrp[MAXGROUPS];
>  	struct gfar_priv_tx_q *tx_queue[MAX_TX_QS];
>  	struct gfar_priv_rx_q *rx_queue[MAX_RX_QS];
> +	struct gfar_priv_grp gfargrp[MAXGROUPS];
> +
> +	u32 device_flags;
> +
> +	unsigned int mode;
> +	unsigned int num_tx_queues;
> +	unsigned int num_grps;
> +
> +	/* Network Statistics */
> +	struct gfar_extra_stats extra_stats;
> +
> +	/* PHY stuff */
> +	phy_interface_t interface;
> +	struct device_node *phy_node;
> +	struct device_node *tbi_node;
> +	struct phy_device *phydev;
> +	struct mii_bus *mii_bus;
> +	int oldspeed;
> +	int oldduplex;
> +	int oldlink;
> +
> +	/* Bitfield update lock */
> +	spinlock_t bflock;
> +
> +	uint32_t msg_enable;
> +
> +	struct work_struct reset_task;
> +
> +	struct platform_device *ofdev;
> +	unsigned char
> +		extended_hash:1,
> +		bd_stash_en:1,
> +		rx_filer_enable:1,
> +		/* Wake-on-LAN enabled */
> +		wol_en:1,
> +		/* Enable priorty based Tx scheduling in Hw */
> +		prio_sched_en:1;
> +
> +	/* The total tx and rx ring size for the enabled queues */
> +	unsigned int total_tx_ring_size;
> +	unsigned int total_rx_ring_size;
>  
>  	/* RX per device parameters */
> -	unsigned int rx_buffer_size;
>  	unsigned int rx_stash_size;
>  	unsigned int rx_stash_index;
>  
> @@ -1094,39 +1130,6 @@ struct gfar_private {
>  	unsigned int fifo_starve;
>  	unsigned int fifo_starve_off;
>  
> -	/* Bitfield update lock */
> -	spinlock_t bflock;
> -
> -	phy_interface_t interface;
> -	struct device_node *phy_node;
> -	struct device_node *tbi_node;
> -	u32 device_flags;
> -	unsigned char
> -		extended_hash:1,
> -		bd_stash_en:1,
> -		rx_filer_enable:1,
> -		wol_en:1, /* Wake-on-LAN enabled */
> -		prio_sched_en:1; /* Enable priorty based Tx scheduling in Hw */
> -	unsigned short padding;
> -
> -	/* PHY stuff */
> -	struct phy_device *phydev;
> -	struct mii_bus *mii_bus;
> -	int oldspeed;
> -	int oldduplex;
> -	int oldlink;
> -
> -	uint32_t msg_enable;
> -
> -	struct work_struct reset_task;
> -
> -	/* Network Statistics */
> -	struct gfar_extra_stats extra_stats;
> -
> -	/* HW time stamping enabled flag */
> -	int hwts_rx_en;
> -	int hwts_tx_en;
> -
>  	/*Filer table*/
>  	unsigned int ftp_rqfpr[MAX_FILER_IDX + 1];
>  	unsigned int ftp_rqfcr[MAX_FILER_IDX + 1];
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Claudiu Manoil Feb. 12, 2013, 3:58 p.m. UTC | #2
On 2/12/2013 5:11 PM, Paul Gortmaker wrote:
> On 13-02-12 07:47 AM, Claudiu Manoil wrote:
>> * group run-time critical fields within the 1st cacheline
>>    followed by the tx|rx_queue reference arrays and the interrupt
>>    group instances (gfargrp) (all cacheline aligned)
>> * change 'padding' from unsigned short to u16
>> * clear 20+ byte memory hole
>
> Per prev. mail, it gets harder to see which change is where,
> when they are all lumped together like this.  For example, it
> wasn't obvious to me where the 20 byte hole was.  Also, it
> doesn't look like you changed the padding, but rather instead
> totally re-purposed it, leaving no alignment padding after the
> uchar bitfields (where it was originally).
>
> P.

The 20 byte hole was here:

struct gfar_private {
         unsigned int               num_tx_queues;
         unsigned int               num_rx_queues;
         unsigned int               num_grps;
         unsigned int               mode;
         unsigned int               total_tx_ring_size;
         unsigned int               total_rx_ring_size;
         struct device_node *       node;
         struct net_device *        ndev;
         /* --- cacheline 1 boundary (32 bytes) --- */
         struct platform_device *   ofdev;
         enum gfar_errata           errata;

         /* XXX 24 bytes hole, try to pack */

         /* --- cacheline 2 boundary (64 bytes) --- */
         struct gfar_priv_grp       gfargrp[2];


At the end of the patch series the first cacheline is without holes.
Please note that the re-grouping of members and their order is most
important. For instance why keep in the first cacheline something
as unimportant as total_rx_ring_size? Members like rx_buffer_size,
or padding, or even errata, are critical however for the fast path.
Rx processing (gfar_poll + clean_rx_ring) is the bottleneck here,
keeping the CPU to 100%. So the main goal is to optimize this path,
including memory access/cache optimizations. For instance, better 
results were obtained by inverting rx|tx_queue[] with gfargrp[], originally:
         /* --- cacheline 1 boundary (32 bytes) --- */
         struct gfar_priv_tx_q *    tx_queue[8];  /*    32    32 */
         /* --- cacheline 2 boundary (64 bytes) --- */
         struct gfar_priv_rx_q *    rx_queue[8]; /*    64    32 */
         /* --- cacheline 3 boundary (96 bytes) --- */
         struct gfar_priv_grp       gfargrp[2];  /*    96   192 */

The uchar bitfields are unimportant here (used at "init time"), and
they take 4 bytes including padding anyway. So whether it's uchar or
uint, it's the same, maybe better left uchar to discourage the abuse
of these bitfields. (A 2-3byte hole here doesn't change anything
to the whole structure size, which is padded to be at least a 32B
multiple.)

Thanks,
Claudiu


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Gortmaker Feb. 12, 2013, 4:49 p.m. UTC | #3
On 13-02-12 10:58 AM, Claudiu Manoil wrote:
> On 2/12/2013 5:11 PM, Paul Gortmaker wrote:
>> On 13-02-12 07:47 AM, Claudiu Manoil wrote:
>>> * group run-time critical fields within the 1st cacheline
>>>    followed by the tx|rx_queue reference arrays and the interrupt
>>>    group instances (gfargrp) (all cacheline aligned)
>>> * change 'padding' from unsigned short to u16
>>> * clear 20+ byte memory hole
>>
>> Per prev. mail, it gets harder to see which change is where,
>> when they are all lumped together like this.  For example, it
>> wasn't obvious to me where the 20 byte hole was.  Also, it
>> doesn't look like you changed the padding, but rather instead
>> totally re-purposed it, leaving no alignment padding after the
>> uchar bitfields (where it was originally).
>>
>> P.
> 
> The 20 byte hole was here:
> 
> struct gfar_private {
>          unsigned int               num_tx_queues;
>          unsigned int               num_rx_queues;
>          unsigned int               num_grps;
>          unsigned int               mode;
>          unsigned int               total_tx_ring_size;
>          unsigned int               total_rx_ring_size;
>          struct device_node *       node;
>          struct net_device *        ndev;
>          /* --- cacheline 1 boundary (32 bytes) --- */
>          struct platform_device *   ofdev;
>          enum gfar_errata           errata;
> 
>          /* XXX 24 bytes hole, try to pack */

OK, so this is good information, that wasn't in your commit log.
(Is it 20 or 24 though?  I'm guessing 24, and original commit
log was wrong in saying 20.)

Something like:

"The default gcc layout of gfar_private leaves an implicit 24
byte hole (bytes XYZ --> ABC in the 1st cache line) after the
errata enum...."

> 
>          /* --- cacheline 2 boundary (64 bytes) --- */
>          struct gfar_priv_grp       gfargrp[2];
> 
> 
> At the end of the patch series the first cacheline is without holes.
> Please note that the re-grouping of members and their order is most
> important. For instance why keep in the first cacheline something

The importance of ordering and alignment was never in question.  What
was missing, was conveying to the reviewer exactly how you'd made it
better, without forcing them to manually deconstruct gfar_priv in the
before and after cases, esp. when you'd already done that analysis.

> as unimportant as total_rx_ring_size? Members like rx_buffer_size,
> or padding, or even errata, are critical however for the fast path.
> Rx processing (gfar_poll + clean_rx_ring) is the bottleneck here,
> keeping the CPU to 100%. So the main goal is to optimize this path,
> including memory access/cache optimizations. For instance, better 
> results were obtained by inverting rx|tx_queue[] with gfargrp[], originally:
>          /* --- cacheline 1 boundary (32 bytes) --- */
>          struct gfar_priv_tx_q *    tx_queue[8];  /*    32    32 */
>          /* --- cacheline 2 boundary (64 bytes) --- */
>          struct gfar_priv_rx_q *    rx_queue[8]; /*    64    32 */
>          /* --- cacheline 3 boundary (96 bytes) --- */
>          struct gfar_priv_grp       gfargrp[2];  /*    96   192 */
> 
> The uchar bitfields are unimportant here (used at "init time"), and
> they take 4 bytes including padding anyway. So whether it's uchar or
> uint, it's the same, maybe better left uchar to discourage the abuse
> of these bitfields. (A 2-3byte hole here doesn't change anything
> to the whole structure size, which is padded to be at least a 32B
> multiple.)

This here too, i.e. why the padding after the bitfields is unimportant
and hence removed should also be in the long log.

Thanks,
Paul.
--

> 
> Thanks,
> Claudiu
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h
index 22c2f7a..5304a58 100644
--- a/drivers/net/ethernet/freescale/gianfar.h
+++ b/drivers/net/ethernet/freescale/gianfar.h
@@ -1054,28 +1054,64 @@  enum gfar_errata {
  * the buffer descriptor determines the actual condition.
  */
 struct gfar_private {
-
-	/* Indicates how many tx, rx queues are enabled */
-	unsigned int num_tx_queues;
 	unsigned int num_rx_queues;
-	unsigned int num_grps;
-	unsigned int mode;
-
-	/* The total tx and rx ring size for the enabled queues */
-	unsigned int total_tx_ring_size;
-	unsigned int total_rx_ring_size;
 
 	struct device *dev;
 	struct net_device *ndev;
-	struct platform_device *ofdev;
 	enum gfar_errata errata;
+	unsigned int rx_buffer_size;
+
+	u16 padding;
+
+	/* HW time stamping enabled flag */
+	int hwts_rx_en;
+	int hwts_tx_en;
 
-	struct gfar_priv_grp gfargrp[MAXGROUPS];
 	struct gfar_priv_tx_q *tx_queue[MAX_TX_QS];
 	struct gfar_priv_rx_q *rx_queue[MAX_RX_QS];
+	struct gfar_priv_grp gfargrp[MAXGROUPS];
+
+	u32 device_flags;
+
+	unsigned int mode;
+	unsigned int num_tx_queues;
+	unsigned int num_grps;
+
+	/* Network Statistics */
+	struct gfar_extra_stats extra_stats;
+
+	/* PHY stuff */
+	phy_interface_t interface;
+	struct device_node *phy_node;
+	struct device_node *tbi_node;
+	struct phy_device *phydev;
+	struct mii_bus *mii_bus;
+	int oldspeed;
+	int oldduplex;
+	int oldlink;
+
+	/* Bitfield update lock */
+	spinlock_t bflock;
+
+	uint32_t msg_enable;
+
+	struct work_struct reset_task;
+
+	struct platform_device *ofdev;
+	unsigned char
+		extended_hash:1,
+		bd_stash_en:1,
+		rx_filer_enable:1,
+		/* Wake-on-LAN enabled */
+		wol_en:1,
+		/* Enable priorty based Tx scheduling in Hw */
+		prio_sched_en:1;
+
+	/* The total tx and rx ring size for the enabled queues */
+	unsigned int total_tx_ring_size;
+	unsigned int total_rx_ring_size;
 
 	/* RX per device parameters */
-	unsigned int rx_buffer_size;
 	unsigned int rx_stash_size;
 	unsigned int rx_stash_index;
 
@@ -1094,39 +1130,6 @@  struct gfar_private {
 	unsigned int fifo_starve;
 	unsigned int fifo_starve_off;
 
-	/* Bitfield update lock */
-	spinlock_t bflock;
-
-	phy_interface_t interface;
-	struct device_node *phy_node;
-	struct device_node *tbi_node;
-	u32 device_flags;
-	unsigned char
-		extended_hash:1,
-		bd_stash_en:1,
-		rx_filer_enable:1,
-		wol_en:1, /* Wake-on-LAN enabled */
-		prio_sched_en:1; /* Enable priorty based Tx scheduling in Hw */
-	unsigned short padding;
-
-	/* PHY stuff */
-	struct phy_device *phydev;
-	struct mii_bus *mii_bus;
-	int oldspeed;
-	int oldduplex;
-	int oldlink;
-
-	uint32_t msg_enable;
-
-	struct work_struct reset_task;
-
-	/* Network Statistics */
-	struct gfar_extra_stats extra_stats;
-
-	/* HW time stamping enabled flag */
-	int hwts_rx_en;
-	int hwts_tx_en;
-
 	/*Filer table*/
 	unsigned int ftp_rqfpr[MAX_FILER_IDX + 1];
 	unsigned int ftp_rqfcr[MAX_FILER_IDX + 1];