diff mbox

[net,3/7] net/mlx5e: Fix soft lockup when HW Timestamping is enabled

Message ID 1456312733-22091-4-git-send-email-saeedm@mellanox.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Saeed Mahameed Feb. 24, 2016, 11:18 a.m. UTC
From: Eran Ben Elisha <eranbe@mellanox.com>

Readers/Writers lock for SW timecounter was acquired without disabling
interrupts on local CPU.

The problematic scenario:
* HW timestamping is enabled
* Timestamp overflow periodic service task is running on local CPU and
  holding write_lock for SW timecounter
* Completion arrives, triggers interrupt for local CPU.
  Interrupt routine calls napi_schedule(), which triggers rx/tx
  skb process.
  An attempt to read SW timecounter using read_lock is done, which is
  already locked by a writer on the same CPU and cause soft lockup.

Add irqsave/irqrestore for when using the readers/writers lock.

Fixes: ef9814deafd0 ('net/mlx5e: Add HW timestamping (TS) support')
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_clock.c |   30 ++++++++++++--------
 1 files changed, 18 insertions(+), 12 deletions(-)

Comments

David Miller Feb. 25, 2016, 9:31 p.m. UTC | #1
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 24 Feb 2016 13:18:49 +0200

> From: Eran Ben Elisha <eranbe@mellanox.com>
> 
> Readers/Writers lock for SW timecounter was acquired without disabling
> interrupts on local CPU.
> 
> The problematic scenario:
> * HW timestamping is enabled
> * Timestamp overflow periodic service task is running on local CPU and
>   holding write_lock for SW timecounter
> * Completion arrives, triggers interrupt for local CPU.
>   Interrupt routine calls napi_schedule(), which triggers rx/tx
>   skb process.
>   An attempt to read SW timecounter using read_lock is done, which is
>   already locked by a writer on the same CPU and cause soft lockup.
> 
> Add irqsave/irqrestore for when using the readers/writers lock.
> 
> Fixes: ef9814deafd0 ('net/mlx5e: Add HW timestamping (TS) support')
> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

I'd like to suggest a slight modification to this fix.

The canonical way to deal with this issue is to use IRQ disabling
for the write lock side, but not for the read lock side.

You only need to prevent a read lock via interrupt from nesting on
the same cpu where you took the write lock.

So you don't need the expensive IRQ disabling for the read lock side,
which is the more important path performance wise.

Thanks.
diff mbox

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
index be65435..86f9f29 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
@@ -41,10 +41,11 @@  void mlx5e_fill_hwstamp(struct mlx5e_tstamp *tstamp, u64 timestamp,
 			struct skb_shared_hwtstamps *hwts)
 {
 	u64 nsec;
+	unsigned long flags;
 
-	read_lock(&tstamp->lock);
+	read_lock_irqsave(&tstamp->lock, flags);
 	nsec = timecounter_cyc2time(&tstamp->clock, timestamp);
-	read_unlock(&tstamp->lock);
+	read_unlock_irqrestore(&tstamp->lock, flags);
 
 	hwts->hwtstamp = ns_to_ktime(nsec);
 }
@@ -62,10 +63,11 @@  static void mlx5e_timestamp_overflow(struct work_struct *work)
 	struct delayed_work *dwork = to_delayed_work(work);
 	struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp,
 						   overflow_work);
+	unsigned long flags;
 
-	write_lock(&tstamp->lock);
+	write_lock_irqsave(&tstamp->lock, flags);
 	timecounter_read(&tstamp->clock);
-	write_unlock(&tstamp->lock);
+	write_unlock_irqrestore(&tstamp->lock, flags);
 	schedule_delayed_work(&tstamp->overflow_work, tstamp->overflow_period);
 }
 
@@ -136,10 +138,11 @@  static int mlx5e_ptp_settime(struct ptp_clock_info *ptp,
 	struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
 						   ptp_info);
 	u64 ns = timespec64_to_ns(ts);
+	unsigned long flags;
 
-	write_lock(&tstamp->lock);
+	write_lock_irqsave(&tstamp->lock, flags);
 	timecounter_init(&tstamp->clock, &tstamp->cycles, ns);
-	write_unlock(&tstamp->lock);
+	write_unlock_irqrestore(&tstamp->lock, flags);
 
 	return 0;
 }
@@ -150,10 +153,11 @@  static int mlx5e_ptp_gettime(struct ptp_clock_info *ptp,
 	struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
 						   ptp_info);
 	u64 ns;
+	unsigned long flags;
 
-	write_lock(&tstamp->lock);
+	write_lock_irqsave(&tstamp->lock, flags);
 	ns = timecounter_read(&tstamp->clock);
-	write_unlock(&tstamp->lock);
+	write_unlock_irqrestore(&tstamp->lock, flags);
 
 	*ts = ns_to_timespec64(ns);
 
@@ -164,10 +168,11 @@  static int mlx5e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 {
 	struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
 						   ptp_info);
+	unsigned long flags;
 
-	write_lock(&tstamp->lock);
+	write_lock_irqsave(&tstamp->lock, flags);
 	timecounter_adjtime(&tstamp->clock, delta);
-	write_unlock(&tstamp->lock);
+	write_unlock_irqrestore(&tstamp->lock, flags);
 
 	return 0;
 }
@@ -176,6 +181,7 @@  static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
 {
 	u64 adj;
 	u32 diff;
+	unsigned long flags;
 	int neg_adj = 0;
 	struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
 						  ptp_info);
@@ -189,11 +195,11 @@  static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
 	adj *= delta;
 	diff = div_u64(adj, 1000000000ULL);
 
-	write_lock(&tstamp->lock);
+	write_lock_irqsave(&tstamp->lock, flags);
 	timecounter_read(&tstamp->clock);
 	tstamp->cycles.mult = neg_adj ? tstamp->nominal_c_mult - diff :
 					tstamp->nominal_c_mult + diff;
-	write_unlock(&tstamp->lock);
+	write_unlock_irqrestore(&tstamp->lock, flags);
 
 	return 0;
 }