[net-next,20/25] fm10k: Add support for ITR scaling based on PCIe link speed
diff mbox

Message ID 1428092835-16834-20-git-send-email-jeffrey.t.kirsher@intel.com
State Changes Requested
Delegated to: Jeff Kirsher
Headers show

Commit Message

Jeff Kirsher April 3, 2015, 8:27 p.m. UTC
Fm10k's interrupt throttle timers are based on the PCIe link
speed. Because of this, the value being programmed into the ITR
registers must be scaled.

For the PF, this is as simple as reading the PCIe link speed and storing
the result. However, in the case of SR-IOV, the VF's interrupt throttle
timers are based on the link speed of the PF. However, the VF is unable
to get the link speed information from its configuration space, so the
PF must inform it of what scale to use.

Rather than pass this scale via mailbox message, take advantage of
unused bits in the TDLEN register to pass the scale. It is the
responsibility of the PF to program this for the VF while setting up the
VF queues and the responsibility of the VF to get the information
accordingly. This is preferable because it allows the VF to set up the
interrupts properly during initialization and matches how the MAC
address is passed in the TDBAL/TDBAH registers.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Matthew Vick <matthew.vick@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k.h      |  4 ++++
 drivers/net/ethernet/intel/fm10k/fm10k_main.c | 17 +++++++++++++----
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c  | 12 ++++++++++--
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c   | 16 +++++++++++++++-
 drivers/net/ethernet/intel/fm10k/fm10k_type.h |  6 ++++++
 drivers/net/ethernet/intel/fm10k/fm10k_vf.c   | 11 +++++++++--
 6 files changed, 57 insertions(+), 9 deletions(-)

Comments

Jeff Kirsher April 3, 2015, 9:06 p.m. UTC | #1
On Fri, 2015-04-03 at 13:27 -0700, Jeff Kirsher wrote:
> Fm10k's interrupt throttle timers are based on the PCIe link
> speed. Because of this, the value being programmed into the ITR
> registers must be scaled.
> 
> For the PF, this is as simple as reading the PCIe link speed and
> storing
> the result. However, in the case of SR-IOV, the VF's interrupt
> throttle
> timers are based on the link speed of the PF. However, the VF is
> unable
> to get the link speed information from its configuration space, so the
> PF must inform it of what scale to use.
> 
> Rather than pass this scale via mailbox message, take advantage of
> unused bits in the TDLEN register to pass the scale. It is the
> responsibility of the PF to program this for the VF while setting up
> the
> VF queues and the responsibility of the VF to get the information
> accordingly. This is preferable because it allows the VF to set up the
> interrupts properly during initialization and matches how the MAC
> address is passed in the TDBAL/TDBAH registers.
> 
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> Signed-off-by: Matthew Vick <matthew.vick@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k.h      |  4 ++++
>  drivers/net/ethernet/intel/fm10k/fm10k_main.c | 17 +++++++++++++----
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c  | 12 ++++++++++--
>  drivers/net/ethernet/intel/fm10k/fm10k_pf.c   | 16 +++++++++++++++-
>  drivers/net/ethernet/intel/fm10k/fm10k_type.h |  6 ++++++
>  drivers/net/ethernet/intel/fm10k/fm10k_vf.c   | 11 +++++++++--
>  6 files changed, 57 insertions(+), 9 deletions(-)

I have applied to my queue.
Alexander Duyck April 4, 2015, 6:16 p.m. UTC | #2
On 04/03/2015 01:27 PM, Jeff Kirsher wrote:
> Fm10k's interrupt throttle timers are based on the PCIe link
> speed. Because of this, the value being programmed into the ITR
> registers must be scaled.
> 
> For the PF, this is as simple as reading the PCIe link speed and storing
> the result. However, in the case of SR-IOV, the VF's interrupt throttle
> timers are based on the link speed of the PF. However, the VF is unable
> to get the link speed information from its configuration space, so the
> PF must inform it of what scale to use.
> 
> Rather than pass this scale via mailbox message, take advantage of
> unused bits in the TDLEN register to pass the scale. It is the
> responsibility of the PF to program this for the VF while setting up the
> VF queues and the responsibility of the VF to get the information
> accordingly. This is preferable because it allows the VF to set up the
> interrupts properly during initialization and matches how the MAC
> address is passed in the TDBAL/TDBAH registers.
> 
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> Signed-off-by: Matthew Vick <matthew.vick@intel.com>
> ---
>  drivers/net/ethernet/intel/fm10k/fm10k.h      |  4 ++++
>  drivers/net/ethernet/intel/fm10k/fm10k_main.c | 17 +++++++++++++----
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c  | 12 ++++++++++--
>  drivers/net/ethernet/intel/fm10k/fm10k_pf.c   | 16 +++++++++++++++-
>  drivers/net/ethernet/intel/fm10k/fm10k_type.h |  6 ++++++
>  drivers/net/ethernet/intel/fm10k/fm10k_vf.c   | 11 +++++++++--
>  6 files changed, 57 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
> index cc7f442e..feb53c0 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
> @@ -131,6 +131,7 @@ struct fm10k_ring {
>  					 * different for DCB and RSS modes
>  					 */
>  	u8 qos_pc;			/* priority class of queue */
> +	u8 itr_scale;			/* throttle scaler based on PCI speed */
>  	u16 vid;			/* default vlan ID of queue */
>  	u16 count;			/* amount of descriptors */
>  

This shouldn't be a part of the ring.  If it is related to the ITR it
really belongs in the ring container or the q_vector.

> @@ -164,6 +165,9 @@ struct fm10k_ring_container {
>  #define FM10K_ITR_10K		100	/* 100us */
>  #define FM10K_ITR_20K		50	/* 50us */
>  #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation flag */
> +#define FM10K_ITR_SCALE_SMALL	60	/* Constant factor for small frames */
> +#define FM10K_ITR_SCALE_MEDIUM	50	/* Constant factor for medium frames */
> +#define FM10K_ITR_SCALE_LARGE	40	/* Constant factor for large frames */
>  
>  #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
>  

These values don't make any sense to me, where did they come from?

> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> index 1e08832..cd2e86a 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
> @@ -1395,11 +1395,20 @@ static void fm10k_update_itr(struct fm10k_ring_container *ring_container)
>  	if (avg_wire_size > 3000)
>  		avg_wire_size = 3000;
>  
> -	/* Give a little boost to mid-size frames */
> -	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
> -		avg_wire_size /= 3;
> +	/* Simple throttle rate management based on average wire size,
> +	 * providing boosts to small and medium packet loads. Divide the
> +	 * average wire size by a constant factor to calculate the minimum time
> +	 * until the next interrupt in microseconds.
> +	 */
> +	if (avg_wire_size < 300)
> +		avg_wire_size /= FM10K_ITR_SCALE_SMALL;
> +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
> +		avg_wire_size /= FM10K_ITR_SCALE_MEDIUM;
>  	else
> -		avg_wire_size /= 2;
> +		avg_wire_size /= FM10K_ITR_SCALE_LARGE;
> +
> +	/* Scale for various PCIe link speeds */
> +	avg_wire_size /= ring_container->ring->itr_scale;
>  
>  	/* write back value and retain adaptive flag */
>  	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;

This seems like all it is doing is maxing out the interrupt rate for all
cases.  For example, a value of 1514 is reduced to 37.8.  When you use
that as a usecs value that means the queue is capable of over 26K
interrupts per second.

The division by itr_scale is a really bad idea.  I would recommend
replacing it with a shift and you should probably check for the value
hitting 0.

I suspect you probably aren't seeing much of a penalty because the MSI-X
interrupt call is pretty cheap, however that is still a pretty high
interrupt rate compared to past parts.

> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> index cc527dd..c7c9832 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> @@ -532,6 +532,9 @@ static void fm10k_configure_tx_ring(struct fm10k_intfc *interface,
>  	/* store tail pointer */
>  	ring->tail = &interface->uc_addr[FM10K_TDT(reg_idx)];
>  
> +	/* store ITR scale */
> +	ring->itr_scale = hw->mac.itr_scale;
> +
>  	/* reset ntu and ntc to place SW in sync with hardwdare */
>  	ring->next_to_clean = 0;
>  	ring->next_to_use = 0;
> @@ -638,6 +641,9 @@ static void fm10k_configure_rx_ring(struct fm10k_intfc *interface,
>  	/* store tail pointer */
>  	ring->tail = &interface->uc_addr[FM10K_RDT(reg_idx)];
>  
> +	/* store ITR scale */
> +	ring->itr_scale = hw->mac.itr_scale;
> +
>  	/* reset ntu and ntc to place SW in sync with hardwdare */
>  	ring->next_to_clean = 0;
>  	ring->next_to_use = 0;
> @@ -824,7 +830,8 @@ static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data)
>  
>  	/* re-enable mailbox interrupt and indicate 20us delay */
>  	fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
> +					    hw->mac.itr_scale));
>  
>  	/* service upstream mailbox */
>  	if (fm10k_mbx_trylock(interface)) {
> @@ -1007,7 +1014,8 @@ static irqreturn_t fm10k_msix_mbx_pf(int __always_unused irq, void *data)
>  
>  	/* re-enable mailbox interrupt and indicate 20us delay */
>  	fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
> +					    hw->mac.itr_scale));
>  
>  	return IRQ_HANDLED;
>  }

You would be much better off here using the itr_scale as a multiple or a
shift value rather than doing a divide in an interrupt handler as a
divide can be quite expensive.

> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> index a92b58d..be80024 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
> @@ -144,16 +144,21 @@ static s32 fm10k_init_hw_pf(struct fm10k_hw *hw)
>  				FM10K_TPH_RXCTRL_HDR_WROEN);
>  	}
>  
> -	/* set max hold interval to align with 1.024 usec in all modes */
> +	/* set max hold interval to align with 1.024 usec in all modes and
> +	 * store ITR scale
> +	 */
>  	switch (hw->bus.speed) {
>  	case fm10k_bus_speed_2500:
>  		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN1;
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN1;
>  		break;
>  	case fm10k_bus_speed_5000:
>  		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN2;
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN2;
>  		break;
>  	case fm10k_bus_speed_8000:
>  		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN3;
> +		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
>  		break;
>  	default:
>  		dma_ctrl = 0;
> @@ -907,6 +912,12 @@ static s32 fm10k_iov_assign_default_mac_vlan_pf(struct fm10k_hw *hw,
>  	fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx), tdbal);
>  	fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx), tdbah);
>  
> +	/* Provide the VF the ITR scale, using software-defined fields in TDLEN
> +	 * to pass the information during VF initialization
> +	 */
> +	fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx), hw->mac.itr_scale <<
> +						   FM10K_TDLEN_ITR_SCALE_SHIFT);
> +
>  err_out:
>  	/* configure Queue control register */
>  	txqctl = ((u32)vf_vid << FM10K_TXQCTL_VID_SHIFT) &
> @@ -1039,6 +1050,9 @@ static s32 fm10k_iov_reset_resources_pf(struct fm10k_hw *hw,
>  	for (i = queues_per_pool; i--;) {
>  		fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx + i), tdbal);
>  		fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx + i), tdbah);
> +		fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx + i),
> +				hw->mac.itr_scale <<
> +				FM10K_TDLEN_ITR_SCALE_SHIFT);
>  		fm10k_write_reg(hw, FM10K_TQMAP(qmap_idx + i), vf_q_idx + i);
>  		fm10k_write_reg(hw, FM10K_RQMAP(qmap_idx + i), vf_q_idx + i);
>  	}
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> index 4af9668..c9a646d 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
> @@ -271,6 +271,11 @@ struct fm10k_hw;
>  #define FM10K_TDBAL(_n)		((0x40 * (_n)) + 0x8000)
>  #define FM10K_TDBAH(_n)		((0x40 * (_n)) + 0x8001)
>  #define FM10K_TDLEN(_n)		((0x40 * (_n)) + 0x8002)
> +#define FM10K_TDLEN_ITR_SCALE_SHIFT		9
> +#define FM10K_TDLEN_ITR_SCALE_MASK		0x00000E00
> +#define FM10K_TDLEN_ITR_SCALE_GEN1		4
> +#define FM10K_TDLEN_ITR_SCALE_GEN2		2
> +#define FM10K_TDLEN_ITR_SCALE_GEN3		1
>  #define FM10K_TPH_TXCTRL(_n)	((0x40 * (_n)) + 0x8003)
>  #define FM10K_TPH_TXCTRL_DESC_TPHEN		0x00000020
>  #define FM10K_TPH_TXCTRL_DESC_RROEN		0x00000200
> @@ -560,6 +565,7 @@ struct fm10k_mac_info {
>  	bool get_host_state;
>  	bool tx_ready;
>  	u32 dglort_map;
> +	u8 itr_scale;
>  };
>  
>  struct fm10k_swapi_table_info {
> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> index 94f0f6a..099f190 100644
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
> @@ -28,7 +28,7 @@
>  static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>  {
>  	u8 *perm_addr = hw->mac.perm_addr;
> -	u32 bal = 0, bah = 0;
> +	u32 bal = 0, bah = 0, tdlen;
>  	s32 err;
>  	u16 i;
>  
> @@ -48,6 +48,9 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>  		       ((u32)perm_addr[2]);
>  	}
>  
> +	/* restore default itr_scale for next VF initialization */
> +	tdlen = hw->mac.itr_scale << FM10K_TDLEN_ITR_SCALE_SHIFT;
> +
>  	/* The queues have already been disabled so we just need to
>  	 * update their base address registers
>  	 */
> @@ -56,6 +59,7 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>  		fm10k_write_reg(hw, FM10K_TDBAH(i), bah);
>  		fm10k_write_reg(hw, FM10K_RDBAL(i), bal);
>  		fm10k_write_reg(hw, FM10K_RDBAH(i), bah);
> +		fm10k_write_reg(hw, FM10K_TDLEN(i), tdlen);
>  	}
>  
>  	return 0;
> @@ -124,9 +128,12 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
>  	/* record maximum queue count */
>  	hw->mac.max_queues = i;
>  
> -	/* fetch default VLAN */
> +	/* fetch default VLAN and ITR scale */
>  	hw->mac.default_vid = (fm10k_read_reg(hw, FM10K_TXQCTL(0)) &
>  			       FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
> +	hw->mac.itr_scale = (fm10k_read_reg(hw, FM10K_TDLEN(0)) &
> +			     FM10K_TDLEN_ITR_SCALE_MASK) >>
> +			    FM10K_TDLEN_ITR_SCALE_SHIFT;
>  
>  	return 0;
>  }
> 

This is a good reason to get rid of the divide in favor of a shift.  If
a VF didn't restore the tdlen value using the stop_hw_vf function then
you could potentially have one driver load trip up the next and cause a
divide by 0.

- Alex
Vick, Matthew April 6, 2015, 4:39 p.m. UTC | #3
On 4/4/15, 11:16 AM, "Alexander Duyck" <alexander.duyck@gmail.com> wrote:

>On 04/03/2015 01:27 PM, Jeff Kirsher wrote:

[...]

>> 
>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h
>>b/drivers/net/ethernet/intel/fm10k/fm10k.h
>> index cc7f442e..feb53c0 100644
>> --- a/drivers/net/ethernet/intel/fm10k/fm10k.h
>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
>> @@ -131,6 +131,7 @@ struct fm10k_ring {
>>  					 * different for DCB and RSS modes
>>  					 */
>>  	u8 qos_pc;			/* priority class of queue */
>> +	u8 itr_scale;			/* throttle scaler based on PCI speed */
>>  	u16 vid;			/* default vlan ID of queue */
>>  	u16 count;			/* amount of descriptors */
>>  
>
>This shouldn't be a part of the ring.  If it is related to the ITR it
>really belongs in the ring container or the q_vector.

Fair enough--I definitely went back and forth on where to put it. I think
I'll put it in the ring container, since the ITR already gets updated
there. It's just mildly annoying during the initialization routines is all.

>> @@ -164,6 +165,9 @@ struct fm10k_ring_container {
>>  #define FM10K_ITR_10K		100	/* 100us */
>>  #define FM10K_ITR_20K		50	/* 50us */
>>  #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation
>>flag */
>> +#define FM10K_ITR_SCALE_SMALL	60	/* Constant factor for small frames */
>> +#define FM10K_ITR_SCALE_MEDIUM	50	/* Constant factor for medium frames
>>*/
>> +#define FM10K_ITR_SCALE_LARGE	40	/* Constant factor for large frames */
>>  
>>  #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
>>  
>
>These values don't make any sense to me, where did they come from?

They were experimental. I'll be the first to admit they probably aren't
perfect, but they're at least a significant improvement.

Do you have some recommendations for interrupt rates at 25G/50G? Depending
on what the targets are, it'd be nice to get these scalars to something we
can shift by instead of divide by.

>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>> index 1e08832..cd2e86a 100644
>> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>> @@ -1395,11 +1395,20 @@ static void fm10k_update_itr(struct
>>fm10k_ring_container *ring_container)
>>  	if (avg_wire_size > 3000)
>>  		avg_wire_size = 3000;
>>  
>> -	/* Give a little boost to mid-size frames */
>> -	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
>> -		avg_wire_size /= 3;
>> +	/* Simple throttle rate management based on average wire size,
>> +	 * providing boosts to small and medium packet loads. Divide the
>> +	 * average wire size by a constant factor to calculate the minimum
>>time
>> +	 * until the next interrupt in microseconds.
>> +	 */
>> +	if (avg_wire_size < 300)
>> +		avg_wire_size /= FM10K_ITR_SCALE_SMALL;
>> +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
>> +		avg_wire_size /= FM10K_ITR_SCALE_MEDIUM;
>>  	else
>> -		avg_wire_size /= 2;
>> +		avg_wire_size /= FM10K_ITR_SCALE_LARGE;
>> +
>> +	/* Scale for various PCIe link speeds */
>> +	avg_wire_size /= ring_container->ring->itr_scale;
>>  
>>  	/* write back value and retain adaptive flag */
>>  	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;
>
>This seems like all it is doing is maxing out the interrupt rate for all
>cases.  For example, a value of 1514 is reduced to 37.8.  When you use
>that as a usecs value that means the queue is capable of over 26K
>interrupts per second.
>
>The division by itr_scale is a really bad idea.  I would recommend
>replacing it with a shift and you should probably check for the value
>hitting 0.
>
>I suspect you probably aren't seeing much of a penalty because the MSI-X
>interrupt call is pretty cheap, however that is still a pretty high
>interrupt rate compared to past parts.

The interrupt rates are definitely open to suggestion. I tried to maintain
the idea of the original algorithm, which did basically the same thing
(avg_wire_size divided by some scalar), so I'm not seeing what's
significantly different here from that angle.

A shift for the itr_scale is a fair suggestion. I'll incorporate that into
a v2.

I definitely understand that I went with some high interrupt values, but
given the architecture of the device I'd be skeptical about dropping all
the way to 8000int/s.

>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>>b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>> index cc527dd..c7c9832 100644
>> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>> @@ -532,6 +532,9 @@ static void fm10k_configure_tx_ring(struct
>>fm10k_intfc *interface,
>>  	/* store tail pointer */
>>  	ring->tail = &interface->uc_addr[FM10K_TDT(reg_idx)];
>>  
>> +	/* store ITR scale */
>> +	ring->itr_scale = hw->mac.itr_scale;
>> +
>>  	/* reset ntu and ntc to place SW in sync with hardwdare */
>>  	ring->next_to_clean = 0;
>>  	ring->next_to_use = 0;
>> @@ -638,6 +641,9 @@ static void fm10k_configure_rx_ring(struct
>>fm10k_intfc *interface,
>>  	/* store tail pointer */
>>  	ring->tail = &interface->uc_addr[FM10K_RDT(reg_idx)];
>>  
>> +	/* store ITR scale */
>> +	ring->itr_scale = hw->mac.itr_scale;
>> +
>>  	/* reset ntu and ntc to place SW in sync with hardwdare */
>>  	ring->next_to_clean = 0;
>>  	ring->next_to_use = 0;
>> @@ -824,7 +830,8 @@ static irqreturn_t fm10k_msix_mbx_vf(int
>>__always_unused irq, void *data)
>>  
>>  	/* re-enable mailbox interrupt and indicate 20us delay */
>>  	fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
>> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
>> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
>> +					    hw->mac.itr_scale));
>>  
>>  	/* service upstream mailbox */
>>  	if (fm10k_mbx_trylock(interface)) {
>> @@ -1007,7 +1014,8 @@ static irqreturn_t fm10k_msix_mbx_pf(int
>>__always_unused irq, void *data)
>>  
>>  	/* re-enable mailbox interrupt and indicate 20us delay */
>>  	fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
>> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
>> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
>> +					    hw->mac.itr_scale));
>>  
>>  	return IRQ_HANDLED;
>>  }
>
>You would be much better off here using the itr_scale as a multiple or a
>shift value rather than doing a divide in an interrupt handler as a
>divide can be quite expensive.

Will fix for v2.

>> @@ -56,6 +59,7 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>>  		fm10k_write_reg(hw, FM10K_TDBAH(i), bah);
>>  		fm10k_write_reg(hw, FM10K_RDBAL(i), bal);
>>  		fm10k_write_reg(hw, FM10K_RDBAH(i), bah);
>> +		fm10k_write_reg(hw, FM10K_TDLEN(i), tdlen);
>>  	}
>>  
>>  	return 0;
>> @@ -124,9 +128,12 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
>>  	/* record maximum queue count */
>>  	hw->mac.max_queues = i;
>>  
>> -	/* fetch default VLAN */
>> +	/* fetch default VLAN and ITR scale */
>>  	hw->mac.default_vid = (fm10k_read_reg(hw, FM10K_TXQCTL(0)) &
>>  			       FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
>> +	hw->mac.itr_scale = (fm10k_read_reg(hw, FM10K_TDLEN(0)) &
>> +			     FM10K_TDLEN_ITR_SCALE_MASK) >>
>> +			    FM10K_TDLEN_ITR_SCALE_SHIFT;
>>  
>>  	return 0;
>>  }
>> 
>
>This is a good reason to get rid of the divide in favor of a shift.  If
>a VF didn't restore the tdlen value using the stop_hw_vf function then
>you could potentially have one driver load trip up the next and cause a
>divide by 0.

Agreed, and sure enough we have a commit in the works that assumes a
default value if we read zero. I originally went with the divide for
readability's sake, but the shift is probably the better choice, so I'll
send a v2 with that.

If you have some suggestions for an interrupt range, I can look into
incorporating those as well.

Thank you for the review, Alex!

Cheers,
Matthew
Alexander Duyck April 6, 2015, 5:05 p.m. UTC | #4
On 04/06/2015 09:39 AM, Vick, Matthew wrote:
> On 4/4/15, 11:16 AM, "Alexander Duyck" <alexander.duyck@gmail.com> wrote:
>
>> On 04/03/2015 01:27 PM, Jeff Kirsher wrote:
> [...]
>
>>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h
>>> b/drivers/net/ethernet/intel/fm10k/fm10k.h
>>> index cc7f442e..feb53c0 100644
>>> --- a/drivers/net/ethernet/intel/fm10k/fm10k.h
>>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
>>> @@ -131,6 +131,7 @@ struct fm10k_ring {
>>>  					 * different for DCB and RSS modes
>>>  					 */
>>>  	u8 qos_pc;			/* priority class of queue */
>>> +	u8 itr_scale;			/* throttle scaler based on PCI speed */
>>>  	u16 vid;			/* default vlan ID of queue */
>>>  	u16 count;			/* amount of descriptors */
>>>  
>> This shouldn't be a part of the ring.  If it is related to the ITR it
>> really belongs in the ring container or the q_vector.
> Fair enough--I definitely went back and forth on where to put it. I think
> I'll put it in the ring container, since the ITR already gets updated
> there. It's just mildly annoying during the initialization routines is all.
>
>>> @@ -164,6 +165,9 @@ struct fm10k_ring_container {
>>>  #define FM10K_ITR_10K		100	/* 100us */
>>>  #define FM10K_ITR_20K		50	/* 50us */
>>>  #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation
>>> flag */
>>> +#define FM10K_ITR_SCALE_SMALL	60	/* Constant factor for small frames */
>>> +#define FM10K_ITR_SCALE_MEDIUM	50	/* Constant factor for medium frames
>>> */
>>> +#define FM10K_ITR_SCALE_LARGE	40	/* Constant factor for large frames */
>>>  
>>>  #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
>>>  
>> These values don't make any sense to me, where did they come from?
> They were experimental. I'll be the first to admit they probably aren't
> perfect, but they're at least a significant improvement.
>
> Do you have some recommendations for interrupt rates at 25G/50G? Depending
> on what the targets are, it'd be nice to get these scalars to something we
> can shift by instead of divide by.

If nothing else these need to be documented along with the expected
range of interrupts per second.

So for example if you expect the FM10K_ITR_SCALE_SMALL of 60 to be used
with packets 300 bytes and smaller you should document somewhere that
this is going to drive rates of 1 million to 200 thousand interrupts per
second if that is what you actually want.  I was assuming that was a bug
since that value seems obscenely high, however at 50Gb/s I supposed it
is possible that you could theoretically push 70Mpps so in theory that
might be appropriate if we were actually processing small packets at
line rate..  :-)

>>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>> b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>> index 1e08832..cd2e86a 100644
>>> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>> @@ -1395,11 +1395,20 @@ static void fm10k_update_itr(struct
>>> fm10k_ring_container *ring_container)
>>>  	if (avg_wire_size > 3000)
>>>  		avg_wire_size = 3000;
>>>  
>>> -	/* Give a little boost to mid-size frames */
>>> -	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
>>> -		avg_wire_size /= 3;
>>> +	/* Simple throttle rate management based on average wire size,
>>> +	 * providing boosts to small and medium packet loads. Divide the
>>> +	 * average wire size by a constant factor to calculate the minimum
>>> time
>>> +	 * until the next interrupt in microseconds.
>>> +	 */
>>> +	if (avg_wire_size < 300)
>>> +		avg_wire_size /= FM10K_ITR_SCALE_SMALL;
>>> +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
>>> +		avg_wire_size /= FM10K_ITR_SCALE_MEDIUM;
>>>  	else
>>> -		avg_wire_size /= 2;
>>> +		avg_wire_size /= FM10K_ITR_SCALE_LARGE;
>>> +
>>> +	/* Scale for various PCIe link speeds */
>>> +	avg_wire_size /= ring_container->ring->itr_scale;
>>>  
>>>  	/* write back value and retain adaptive flag */
>>>  	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;
>> This seems like all it is doing is maxing out the interrupt rate for all
>> cases.  For example, a value of 1514 is reduced to 37.8.  When you use
>> that as a usecs value that means the queue is capable of over 26K
>> interrupts per second.
>>
>> The division by itr_scale is a really bad idea.  I would recommend
>> replacing it with a shift and you should probably check for the value
>> hitting 0.
>>
>> I suspect you probably aren't seeing much of a penalty because the MSI-X
>> interrupt call is pretty cheap, however that is still a pretty high
>> interrupt rate compared to past parts.
> The interrupt rates are definitely open to suggestion. I tried to maintain
> the idea of the original algorithm, which did basically the same thing
> (avg_wire_size divided by some scalar), so I'm not seeing what's
> significantly different here from that angle.
>
> A shift for the itr_scale is a fair suggestion. I'll incorporate that into
> a v2.
>
> I definitely understand that I went with some high interrupt values, but
> given the architecture of the device I'd be skeptical about dropping all
> the way to 8000int/s.

I think the big issue with this is that there is no bottom end.  So if
you have a setup where you are running small packets your adjustment
brings it all the way down to 1us since 60/60 is 1.  Then you also have
to do a shift by 2 or divide by 4 for the PCIe gen1 which will bring the
value down to 0 which isn't correct.

What you probably need to do is look at creating a limit on just how low
a value you can have.  Since you are having to do a divide by 4 after
the computation you might make 4us the lowest value you can generate for
this algorithm, and probably make that the limit, excluding ITR
disabled, for user controllable interrupt moderation as well.  Then that
way things are always at least throttled to no more than 250K
interrupts/second per queue.  Otherwise there is s good chance that
everything with packet size smaller than 240B will just end up with no
interrupt moderation since that is currently being shifted off into
oblivion.

>
>>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>>> b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>>> index cc527dd..c7c9832 100644
>>> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
>>> @@ -532,6 +532,9 @@ static void fm10k_configure_tx_ring(struct
>>> fm10k_intfc *interface,
>>>  	/* store tail pointer */
>>>  	ring->tail = &interface->uc_addr[FM10K_TDT(reg_idx)];
>>>  
>>> +	/* store ITR scale */
>>> +	ring->itr_scale = hw->mac.itr_scale;
>>> +
>>>  	/* reset ntu and ntc to place SW in sync with hardwdare */
>>>  	ring->next_to_clean = 0;
>>>  	ring->next_to_use = 0;
>>> @@ -638,6 +641,9 @@ static void fm10k_configure_rx_ring(struct
>>> fm10k_intfc *interface,
>>>  	/* store tail pointer */
>>>  	ring->tail = &interface->uc_addr[FM10K_RDT(reg_idx)];
>>>  
>>> +	/* store ITR scale */
>>> +	ring->itr_scale = hw->mac.itr_scale;
>>> +
>>>  	/* reset ntu and ntc to place SW in sync with hardwdare */
>>>  	ring->next_to_clean = 0;
>>>  	ring->next_to_use = 0;
>>> @@ -824,7 +830,8 @@ static irqreturn_t fm10k_msix_mbx_vf(int
>>> __always_unused irq, void *data)
>>>  
>>>  	/* re-enable mailbox interrupt and indicate 20us delay */
>>>  	fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
>>> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
>>> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
>>> +					    hw->mac.itr_scale));
>>>  
>>>  	/* service upstream mailbox */
>>>  	if (fm10k_mbx_trylock(interface)) {
>>> @@ -1007,7 +1014,8 @@ static irqreturn_t fm10k_msix_mbx_pf(int
>>> __always_unused irq, void *data)
>>>  
>>>  	/* re-enable mailbox interrupt and indicate 20us delay */
>>>  	fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
>>> -			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
>>> +			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
>>> +					    hw->mac.itr_scale));
>>>  
>>>  	return IRQ_HANDLED;
>>>  }
>> You would be much better off here using the itr_scale as a multiple or a
>> shift value rather than doing a divide in an interrupt handler as a
>> divide can be quite expensive.
> Will fix for v2.
>
>>> @@ -56,6 +59,7 @@ static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
>>>  		fm10k_write_reg(hw, FM10K_TDBAH(i), bah);
>>>  		fm10k_write_reg(hw, FM10K_RDBAL(i), bal);
>>>  		fm10k_write_reg(hw, FM10K_RDBAH(i), bah);
>>> +		fm10k_write_reg(hw, FM10K_TDLEN(i), tdlen);
>>>  	}
>>>  
>>>  	return 0;
>>> @@ -124,9 +128,12 @@ static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
>>>  	/* record maximum queue count */
>>>  	hw->mac.max_queues = i;
>>>  
>>> -	/* fetch default VLAN */
>>> +	/* fetch default VLAN and ITR scale */
>>>  	hw->mac.default_vid = (fm10k_read_reg(hw, FM10K_TXQCTL(0)) &
>>>  			       FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
>>> +	hw->mac.itr_scale = (fm10k_read_reg(hw, FM10K_TDLEN(0)) &
>>> +			     FM10K_TDLEN_ITR_SCALE_MASK) >>
>>> +			    FM10K_TDLEN_ITR_SCALE_SHIFT;
>>>  
>>>  	return 0;
>>>  }
>>>
>> This is a good reason to get rid of the divide in favor of a shift.  If
>> a VF didn't restore the tdlen value using the stop_hw_vf function then
>> you could potentially have one driver load trip up the next and cause a
>> divide by 0.
> Agreed, and sure enough we have a commit in the works that assumes a
> default value if we read zero. I originally went with the divide for
> readability's sake, but the shift is probably the better choice, so I'll
> send a v2 with that.
>
> If you have some suggestions for an interrupt range, I can look into
> incorporating those as well.
>
> Thank you for the review, Alex!
>
> Cheers,
> Matthew
>
Vick, Matthew April 6, 2015, 6:33 p.m. UTC | #5
On 4/6/15, 10:05 AM, "Alexander Duyck" <alexander.duyck@gmail.com> wrote:

>On 04/06/2015 09:39 AM, Vick, Matthew wrote:
>> On 4/4/15, 11:16 AM, "Alexander Duyck" <alexander.duyck@gmail.com>
>>wrote:
>>
>>> On 04/03/2015 01:27 PM, Jeff Kirsher wrote:

[...]

>>>> @@ -164,6 +165,9 @@ struct fm10k_ring_container {
>>>>  #define FM10K_ITR_10K		100	/* 100us */
>>>>  #define FM10K_ITR_20K		50	/* 50us */
>>>>  #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation
>>>> flag */
>>>> +#define FM10K_ITR_SCALE_SMALL	60	/* Constant factor for small frames
>>>>*/
>>>> +#define FM10K_ITR_SCALE_MEDIUM	50	/* Constant factor for medium
>>>>frames
>>>> */
>>>> +#define FM10K_ITR_SCALE_LARGE	40	/* Constant factor for large frames
>>>>*/
>>>>  
>>>>  #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
>>>>  
>>> These values don't make any sense to me, where did they come from?
>> They were experimental. I'll be the first to admit they probably aren't
>> perfect, but they're at least a significant improvement.
>>
>> Do you have some recommendations for interrupt rates at 25G/50G?
>>Depending
>> on what the targets are, it'd be nice to get these scalars to something
>>we
>> can shift by instead of divide by.
>
>If nothing else these need to be documented along with the expected
>range of interrupts per second.
>
>So for example if you expect the FM10K_ITR_SCALE_SMALL of 60 to be used
>with packets 300 bytes and smaller you should document somewhere that
>this is going to drive rates of 1 million to 200 thousand interrupts per
>second if that is what you actually want.  I was assuming that was a bug
>since that value seems obscenely high, however at 50Gb/s I supposed it
>is possible that you could theoretically push 70Mpps so in theory that
>might be appropriate if we were actually processing small packets at
>line rate..  :-)

That, um, was totally intentional. Yeah, 1 million interrupts or bust--I
totally didn't mess up the math when I had intended it to go up to 100k
int/s. :) Will definitely fix (and document) in v2.

>>>> diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>>> b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>>> index 1e08832..cd2e86a 100644
>>>> --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>>> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
>>>> @@ -1395,11 +1395,20 @@ static void fm10k_update_itr(struct
>>>> fm10k_ring_container *ring_container)
>>>>  	if (avg_wire_size > 3000)
>>>>  		avg_wire_size = 3000;
>>>>  
>>>> -	/* Give a little boost to mid-size frames */
>>>> -	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
>>>> -		avg_wire_size /= 3;
>>>> +	/* Simple throttle rate management based on average wire size,
>>>> +	 * providing boosts to small and medium packet loads. Divide the
>>>> +	 * average wire size by a constant factor to calculate the minimum
>>>> time
>>>> +	 * until the next interrupt in microseconds.
>>>> +	 */
>>>> +	if (avg_wire_size < 300)
>>>> +		avg_wire_size /= FM10K_ITR_SCALE_SMALL;
>>>> +	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
>>>> +		avg_wire_size /= FM10K_ITR_SCALE_MEDIUM;
>>>>  	else
>>>> -		avg_wire_size /= 2;
>>>> +		avg_wire_size /= FM10K_ITR_SCALE_LARGE;
>>>> +
>>>> +	/* Scale for various PCIe link speeds */
>>>> +	avg_wire_size /= ring_container->ring->itr_scale;
>>>>  
>>>>  	/* write back value and retain adaptive flag */
>>>>  	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;
>>> This seems like all it is doing is maxing out the interrupt rate for
>>>all
>>> cases.  For example, a value of 1514 is reduced to 37.8.  When you use
>>> that as a usecs value that means the queue is capable of over 26K
>>> interrupts per second.
>>>
>>> The division by itr_scale is a really bad idea.  I would recommend
>>> replacing it with a shift and you should probably check for the value
>>> hitting 0.
>>>
>>> I suspect you probably aren't seeing much of a penalty because the
>>>MSI-X
>>> interrupt call is pretty cheap, however that is still a pretty high
>>> interrupt rate compared to past parts.
>> The interrupt rates are definitely open to suggestion. I tried to
>>maintain
>> the idea of the original algorithm, which did basically the same thing
>> (avg_wire_size divided by some scalar), so I'm not seeing what's
>> significantly different here from that angle.
>>
>> A shift for the itr_scale is a fair suggestion. I'll incorporate that
>>into
>> a v2.
>>
>> I definitely understand that I went with some high interrupt values, but
>> given the architecture of the device I'd be skeptical about dropping all
>> the way to 8000int/s.
>
>I think the big issue with this is that there is no bottom end.  So if
>you have a setup where you are running small packets your adjustment
>brings it all the way down to 1us since 60/60 is 1.  Then you also have
>to do a shift by 2 or divide by 4 for the PCIe gen1 which will bring the
>value down to 0 which isn't correct.
>
>What you probably need to do is look at creating a limit on just how low
>a value you can have.  Since you are having to do a divide by 4 after
>the computation you might make 4us the lowest value you can generate for
>this algorithm, and probably make that the limit, excluding ITR
>disabled, for user controllable interrupt moderation as well.  Then that
>way things are always at least throttled to no more than 250K
>interrupts/second per queue.  Otherwise there is s good chance that
>everything with packet size smaller than 240B will just end up with no
>interrupt moderation since that is currently being shifted off into
>oblivion.

And that's very much a bug I'll make sure to get fixed.

The accidentally unmoderated user input case is a good catch as well! I'll
add some lower bounds based on the PCIe link speed to the set_coalesce
call.

Patch
diff mbox

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index cc7f442e..feb53c0 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -131,6 +131,7 @@  struct fm10k_ring {
 					 * different for DCB and RSS modes
 					 */
 	u8 qos_pc;			/* priority class of queue */
+	u8 itr_scale;			/* throttle scaler based on PCI speed */
 	u16 vid;			/* default vlan ID of queue */
 	u16 count;			/* amount of descriptors */
 
@@ -164,6 +165,9 @@  struct fm10k_ring_container {
 #define FM10K_ITR_10K		100	/* 100us */
 #define FM10K_ITR_20K		50	/* 50us */
 #define FM10K_ITR_ADAPTIVE	0x8000	/* adaptive interrupt moderation flag */
+#define FM10K_ITR_SCALE_SMALL	60	/* Constant factor for small frames */
+#define FM10K_ITR_SCALE_MEDIUM	50	/* Constant factor for medium frames */
+#define FM10K_ITR_SCALE_LARGE	40	/* Constant factor for large frames */
 
 #define FM10K_ITR_ENABLE	(FM10K_ITR_AUTOMASK | FM10K_ITR_MASK_CLEAR)
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 1e08832..cd2e86a 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -1395,11 +1395,20 @@  static void fm10k_update_itr(struct fm10k_ring_container *ring_container)
 	if (avg_wire_size > 3000)
 		avg_wire_size = 3000;
 
-	/* Give a little boost to mid-size frames */
-	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
-		avg_wire_size /= 3;
+	/* Simple throttle rate management based on average wire size,
+	 * providing boosts to small and medium packet loads. Divide the
+	 * average wire size by a constant factor to calculate the minimum time
+	 * until the next interrupt in microseconds.
+	 */
+	if (avg_wire_size < 300)
+		avg_wire_size /= FM10K_ITR_SCALE_SMALL;
+	else if ((avg_wire_size >= 300) && (avg_wire_size < 1200))
+		avg_wire_size /= FM10K_ITR_SCALE_MEDIUM;
 	else
-		avg_wire_size /= 2;
+		avg_wire_size /= FM10K_ITR_SCALE_LARGE;
+
+	/* Scale for various PCIe link speeds */
+	avg_wire_size /= ring_container->ring->itr_scale;
 
 	/* write back value and retain adaptive flag */
 	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index cc527dd..c7c9832 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -532,6 +532,9 @@  static void fm10k_configure_tx_ring(struct fm10k_intfc *interface,
 	/* store tail pointer */
 	ring->tail = &interface->uc_addr[FM10K_TDT(reg_idx)];
 
+	/* store ITR scale */
+	ring->itr_scale = hw->mac.itr_scale;
+
 	/* reset ntu and ntc to place SW in sync with hardwdare */
 	ring->next_to_clean = 0;
 	ring->next_to_use = 0;
@@ -638,6 +641,9 @@  static void fm10k_configure_rx_ring(struct fm10k_intfc *interface,
 	/* store tail pointer */
 	ring->tail = &interface->uc_addr[FM10K_RDT(reg_idx)];
 
+	/* store ITR scale */
+	ring->itr_scale = hw->mac.itr_scale;
+
 	/* reset ntu and ntc to place SW in sync with hardwdare */
 	ring->next_to_clean = 0;
 	ring->next_to_use = 0;
@@ -824,7 +830,8 @@  static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data)
 
 	/* re-enable mailbox interrupt and indicate 20us delay */
 	fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
-			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
+			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
+					    hw->mac.itr_scale));
 
 	/* service upstream mailbox */
 	if (fm10k_mbx_trylock(interface)) {
@@ -1007,7 +1014,8 @@  static irqreturn_t fm10k_msix_mbx_pf(int __always_unused irq, void *data)
 
 	/* re-enable mailbox interrupt and indicate 20us delay */
 	fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
-			FM10K_ITR_ENABLE | FM10K_MBX_INT_DELAY);
+			FM10K_ITR_ENABLE | (FM10K_MBX_INT_DELAY /
+					    hw->mac.itr_scale));
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index a92b58d..be80024 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -144,16 +144,21 @@  static s32 fm10k_init_hw_pf(struct fm10k_hw *hw)
 				FM10K_TPH_RXCTRL_HDR_WROEN);
 	}
 
-	/* set max hold interval to align with 1.024 usec in all modes */
+	/* set max hold interval to align with 1.024 usec in all modes and
+	 * store ITR scale
+	 */
 	switch (hw->bus.speed) {
 	case fm10k_bus_speed_2500:
 		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN1;
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN1;
 		break;
 	case fm10k_bus_speed_5000:
 		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN2;
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN2;
 		break;
 	case fm10k_bus_speed_8000:
 		dma_ctrl = FM10K_DMA_CTRL_MAX_HOLD_1US_GEN3;
+		hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
 		break;
 	default:
 		dma_ctrl = 0;
@@ -907,6 +912,12 @@  static s32 fm10k_iov_assign_default_mac_vlan_pf(struct fm10k_hw *hw,
 	fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx), tdbal);
 	fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx), tdbah);
 
+	/* Provide the VF the ITR scale, using software-defined fields in TDLEN
+	 * to pass the information during VF initialization
+	 */
+	fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx), hw->mac.itr_scale <<
+						   FM10K_TDLEN_ITR_SCALE_SHIFT);
+
 err_out:
 	/* configure Queue control register */
 	txqctl = ((u32)vf_vid << FM10K_TXQCTL_VID_SHIFT) &
@@ -1039,6 +1050,9 @@  static s32 fm10k_iov_reset_resources_pf(struct fm10k_hw *hw,
 	for (i = queues_per_pool; i--;) {
 		fm10k_write_reg(hw, FM10K_TDBAL(vf_q_idx + i), tdbal);
 		fm10k_write_reg(hw, FM10K_TDBAH(vf_q_idx + i), tdbah);
+		fm10k_write_reg(hw, FM10K_TDLEN(vf_q_idx + i),
+				hw->mac.itr_scale <<
+				FM10K_TDLEN_ITR_SCALE_SHIFT);
 		fm10k_write_reg(hw, FM10K_TQMAP(qmap_idx + i), vf_q_idx + i);
 		fm10k_write_reg(hw, FM10K_RQMAP(qmap_idx + i), vf_q_idx + i);
 	}
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
index 4af9668..c9a646d 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
@@ -271,6 +271,11 @@  struct fm10k_hw;
 #define FM10K_TDBAL(_n)		((0x40 * (_n)) + 0x8000)
 #define FM10K_TDBAH(_n)		((0x40 * (_n)) + 0x8001)
 #define FM10K_TDLEN(_n)		((0x40 * (_n)) + 0x8002)
+#define FM10K_TDLEN_ITR_SCALE_SHIFT		9
+#define FM10K_TDLEN_ITR_SCALE_MASK		0x00000E00
+#define FM10K_TDLEN_ITR_SCALE_GEN1		4
+#define FM10K_TDLEN_ITR_SCALE_GEN2		2
+#define FM10K_TDLEN_ITR_SCALE_GEN3		1
 #define FM10K_TPH_TXCTRL(_n)	((0x40 * (_n)) + 0x8003)
 #define FM10K_TPH_TXCTRL_DESC_TPHEN		0x00000020
 #define FM10K_TPH_TXCTRL_DESC_RROEN		0x00000200
@@ -560,6 +565,7 @@  struct fm10k_mac_info {
 	bool get_host_state;
 	bool tx_ready;
 	u32 dglort_map;
+	u8 itr_scale;
 };
 
 struct fm10k_swapi_table_info {
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
index 94f0f6a..099f190 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
@@ -28,7 +28,7 @@ 
 static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
 {
 	u8 *perm_addr = hw->mac.perm_addr;
-	u32 bal = 0, bah = 0;
+	u32 bal = 0, bah = 0, tdlen;
 	s32 err;
 	u16 i;
 
@@ -48,6 +48,9 @@  static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
 		       ((u32)perm_addr[2]);
 	}
 
+	/* restore default itr_scale for next VF initialization */
+	tdlen = hw->mac.itr_scale << FM10K_TDLEN_ITR_SCALE_SHIFT;
+
 	/* The queues have already been disabled so we just need to
 	 * update their base address registers
 	 */
@@ -56,6 +59,7 @@  static s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
 		fm10k_write_reg(hw, FM10K_TDBAH(i), bah);
 		fm10k_write_reg(hw, FM10K_RDBAL(i), bal);
 		fm10k_write_reg(hw, FM10K_RDBAH(i), bah);
+		fm10k_write_reg(hw, FM10K_TDLEN(i), tdlen);
 	}
 
 	return 0;
@@ -124,9 +128,12 @@  static s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
 	/* record maximum queue count */
 	hw->mac.max_queues = i;
 
-	/* fetch default VLAN */
+	/* fetch default VLAN and ITR scale */
 	hw->mac.default_vid = (fm10k_read_reg(hw, FM10K_TXQCTL(0)) &
 			       FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
+	hw->mac.itr_scale = (fm10k_read_reg(hw, FM10K_TDLEN(0)) &
+			     FM10K_TDLEN_ITR_SCALE_MASK) >>
+			    FM10K_TDLEN_ITR_SCALE_SHIFT;
 
 	return 0;
 }