diff mbox series

[v2,1/3] spi: tegra210-quad: Convert to hard IRQ with high-priority workqueue

Message ID 20260519155108.4092518-2-va@nvidia.com
State New
Headers show
Series spi: tegra210-quad: Improve interrupt handling for loaded systems | expand

Commit Message

Vishwaroop A May 19, 2026, 3:51 p.m. UTC
Threaded IRQ handlers suffer from scheduler latency on heavily loaded
systems, causing false transfer timeouts. Convert to hard IRQ handler
that schedules work on a high-priority unbound workqueue.

The hard IRQ handler verifies the interrupt, caches FIFO status,
clears and masks interrupts, then schedules bottom-half processing.
The workqueue handler runs in process context (can sleep for DMA)
and can execute on any CPU, avoiding CPU0 bottlenecks.

Signed-off-by: Vishwaroop A <va@nvidia.com>
---
 drivers/spi/spi-tegra210-quad.c | 128 +++++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 44 deletions(-)

Comments

Jon Hunter May 20, 2026, 9:22 a.m. UTC | #1
On 19/05/2026 16:51, Vishwaroop A wrote:
> Threaded IRQ handlers suffer from scheduler latency on heavily loaded
> systems, causing false transfer timeouts. Convert to hard IRQ handler
> that schedules work on a high-priority unbound workqueue.
> 
> The hard IRQ handler verifies the interrupt, caches FIFO status,
> clears and masks interrupts, then schedules bottom-half processing.
> The workqueue handler runs in process context (can sleep for DMA)
> and can execute on any CPU, avoiding CPU0 bottlenecks.
> 
> Signed-off-by: Vishwaroop A <va@nvidia.com>
> ---
>   drivers/spi/spi-tegra210-quad.c | 128 +++++++++++++++++++++-----------
>   1 file changed, 84 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/spi/spi-tegra210-quad.c b/drivers/spi/spi-tegra210-quad.c
> index db28dd556484..17d0b511af1d 100644
> --- a/drivers/spi/spi-tegra210-quad.c
> +++ b/drivers/spi/spi-tegra210-quad.c
> @@ -191,6 +191,8 @@ struct tegra_qspi {
>   	void __iomem				*base;
>   	phys_addr_t				phys;
>   	unsigned int				irq;
> +	struct work_struct			irq_work;
> +	struct workqueue_struct			*wq;
>   
>   	u32					cur_speed;
>   	unsigned int				cur_pos;
> @@ -1225,9 +1227,9 @@ static int tegra_qspi_combined_seq_xfer(struct tegra_qspi *tqspi,
>   
>   			if (ret == 0) {
>   				/*
> -				 * Check if hardware completed the transfer
> -				 * even though interrupt was lost or delayed.
> -				 * If so, process the completion and continue.
> +				 * Check if hardware completed the transfer even though
> +				 * workqueue was delayed. If so, process completion and
> +				 * continue.
>   				 */
>   				ret = tegra_qspi_handle_timeout(tqspi);
>   				if (ret < 0) {
> @@ -1344,8 +1346,8 @@ static int tegra_qspi_non_combined_seq_xfer(struct tegra_qspi *tqspi,
>   		if (ret == 0) {
>   			/*
>   			 * Check if hardware completed the transfer even though
> -			 * interrupt was lost or delayed. If so, process the
> -			 * completion and continue.
> +			 * workqueue was delayed. If so, process completion and
> +			 * continue.
>   			 */
>   			ret = tegra_qspi_handle_timeout(tqspi);
>   			if (ret < 0) {
> @@ -1574,46 +1576,40 @@ static irqreturn_t handle_dma_based_xfer(struct tegra_qspi *tqspi)
>   	return IRQ_HANDLED;
>   }
>   
> -static irqreturn_t tegra_qspi_isr_thread(int irq, void *context_data)
> +/**
> + * tegra_qspi_work_handler - Workqueue handler for interrupt bottom-half
> + * @work: work_struct embedded in tegra_qspi
> + *
> + * Runs in process context and can sleep (needed for DMA completion waits).
> + * Can run on any available CPU, avoiding CPU0 bottleneck that occurs with
> + * threaded IRQ handlers which are pinned to the IRQ's CPU.
> + *
> + * The hard IRQ handler has already:
> + * - Verified this is our interrupt (QSPI_RDY was set)
> + * - Cached FIFO status in tqspi->status_reg
> + * - Parsed tx_status / rx_status from FIFO status
> + * - Masked further interrupts
> + */
> +static void tegra_qspi_work_handler(struct work_struct *work)
>   {
> -	struct tegra_qspi *tqspi = context_data;
> +	struct tegra_qspi *tqspi = container_of(work, struct tegra_qspi, irq_work);
>   	unsigned long flags;
> -	u32 status;
>   
> -	/*
> -	 * Read transfer status to check if interrupt was triggered by transfer
> -	 * completion
> -	 */
> -	status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
> +	spin_lock_irqsave(&tqspi->lock, flags);
>   
>   	/*
> -	 * Occasionally the IRQ thread takes a long time to wake up (usually
> -	 * when the CPU that it's running on is excessively busy) and we have
> -	 * already reached the timeout before and cleaned up the timed out
> -	 * transfer. Avoid any processing in that case and bail out early.
> -	 *
> -	 * If no transfer is in progress, check if this was a real interrupt
> -	 * that the timeout handler already processed, or a spurious one.
> +	 * Check if timeout handler already processed this transfer.
> +	 * Can happen if work was delayed and timeout fired first. If
> +	 * so, we must unmask interrupts before returning, otherwise
> +	 * they remain masked from the hard IRQ handler and the next
> +	 * transfer will timeout.
>   	 */
> -	spin_lock_irqsave(&tqspi->lock, flags);
>   	if (!tqspi->curr_xfer) {
>   		spin_unlock_irqrestore(&tqspi->lock, flags);
> -		/* Spurious interrupt - transfer not ready */
> -		if (!(status & QSPI_RDY))
> -			return IRQ_NONE;
> -		/* Real interrupt, already handled by timeout path */
> -		return IRQ_HANDLED;
> +		tegra_qspi_unmask_irq(tqspi);
> +		return;
>   	}
>   
> -	tqspi->status_reg = tegra_qspi_readl(tqspi, QSPI_FIFO_STATUS);
> -
> -	if (tqspi->cur_direction & DATA_DIR_TX)
> -		tqspi->tx_status = tqspi->status_reg & (QSPI_TX_FIFO_UNF | QSPI_TX_FIFO_OVF);
> -
> -	if (tqspi->cur_direction & DATA_DIR_RX)
> -		tqspi->rx_status = tqspi->status_reg & (QSPI_RX_FIFO_OVF | QSPI_RX_FIFO_UNF);
> -
> -	tegra_qspi_mask_clear_irq(tqspi);
>   	spin_unlock_irqrestore(&tqspi->lock, flags);
>   
>   	/*
> @@ -1623,9 +1619,46 @@ static irqreturn_t tegra_qspi_isr_thread(int irq, void *context_data)
>   	 * cannot be done while holding spinlock.
>   	 */
>   	if (!tqspi->is_curr_dma_xfer)
> -		return handle_cpu_based_xfer(tqspi);
> +		handle_cpu_based_xfer(tqspi);
> +	else
> +		handle_dma_based_xfer(tqspi);
> +}
> +
> +/**
> + * tegra_qspi_isr - Hard IRQ handler
> + * @irq: IRQ number
> + * @context_data: QSPI controller instance
> + *
> + * Runs in hard IRQ context with minimal latency. Cannot sleep.
> + *
> + * Return: IRQ_NONE if not our interrupt, IRQ_HANDLED if handled
> + */
> +static irqreturn_t tegra_qspi_isr(int irq, void *context_data)
> +{
> +	struct tegra_qspi *tqspi = context_data;
> +	u32 status;
> +
> +	status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
> +	if (!(status & QSPI_RDY))
> +		return IRQ_NONE;
> +
> +	spin_lock(&tqspi->lock);
> +	tqspi->status_reg = tegra_qspi_readl(tqspi, QSPI_FIFO_STATUS);
> +	tegra_qspi_mask_clear_irq(tqspi);
>   
> -	return handle_dma_based_xfer(tqspi);
> +	if (tqspi->cur_direction & DATA_DIR_TX)
> +		tqspi->tx_status = tqspi->status_reg &
> +				    (QSPI_TX_FIFO_UNF | QSPI_TX_FIFO_OVF);
> +
> +	if (tqspi->cur_direction & DATA_DIR_RX)
> +		tqspi->rx_status = tqspi->status_reg &
> +				    (QSPI_RX_FIFO_OVF | QSPI_RX_FIFO_UNF);
> +
> +	spin_unlock(&tqspi->lock);
> +
> +	queue_work(tqspi->wq, &tqspi->irq_work);
> +
> +	return IRQ_HANDLED;
>   }
>   
>   static struct tegra_qspi_soc_data tegra210_qspi_soc_data = {
> @@ -1793,9 +1826,19 @@ static int tegra_qspi_probe(struct platform_device *pdev)
>   
>   	pm_runtime_put_autosuspend(&pdev->dev);
>   
> -	ret = request_threaded_irq(tqspi->irq, NULL,
> -				   tegra_qspi_isr_thread, IRQF_ONESHOT,
> -				   dev_name(&pdev->dev), tqspi);
> +	tqspi->wq = devm_alloc_workqueue(&pdev->dev, "%s",
> +					 WQ_HIGHPRI | WQ_UNBOUND, 0,
> +					 dev_name(&pdev->dev));
> +	if (!tqspi->wq) {
> +		dev_err(&pdev->dev, "failed to allocate workqueue\n");
> +		ret = -ENOMEM;
> +		goto exit_pm_disable;
> +	}
> +
> +	INIT_WORK(&tqspi->irq_work, tegra_qspi_work_handler);
> +
> +	ret = devm_request_irq(&pdev->dev, tqspi->irq, tegra_qspi_isr,
> +			       IRQF_SHARED, dev_name(&pdev->dev), tqspi);
>   	if (ret < 0) {
>   		dev_err(&pdev->dev, "failed to request IRQ#%u: %d\n", tqspi->irq, ret);
>   		goto exit_pm_disable;
> @@ -1804,13 +1847,11 @@ static int tegra_qspi_probe(struct platform_device *pdev)
>   	ret = spi_register_controller(host);
>   	if (ret < 0) {
>   		dev_err(&pdev->dev, "failed to register host: %d\n", ret);
> -		goto exit_free_irq;
> +		goto exit_pm_disable;
>   	}
>   
>   	return 0;
>   
> -exit_free_irq:
> -	free_irq(qspi_irq, tqspi);
>   exit_pm_disable:
>   	pm_runtime_dont_use_autosuspend(&pdev->dev);
>   	pm_runtime_force_suspend(&pdev->dev);
> @@ -1824,7 +1865,6 @@ static void tegra_qspi_remove(struct platform_device *pdev)
>   	struct tegra_qspi *tqspi = spi_controller_get_devdata(host);
>   
>   	spi_unregister_controller(host);
> -	free_irq(tqspi->irq, tqspi);
>   	pm_runtime_dont_use_autosuspend(&pdev->dev);
>   	pm_runtime_force_suspend(&pdev->dev);
>   	tegra_qspi_deinit_dma(tqspi);

Looks like you dropped the ...

	flush_workqueue(tqspi->wq);

Don't we still need this?

Jon
Breno Leitao May 20, 2026, 3:25 p.m. UTC | #2
On Tue, May 19, 2026 at 03:51:06PM +0000, Vishwaroop A wrote:
> Threaded IRQ handlers suffer from scheduler latency on heavily loaded
> systems, causing false transfer timeouts. Convert to hard IRQ handler
> that schedules work on a high-priority unbound workqueue.
> 
> The hard IRQ handler verifies the interrupt, caches FIFO status,
> clears and masks interrupts, then schedules bottom-half processing.
> The workqueue handler runs in process context (can sleep for DMA)
> and can execute on any CPU, avoiding CPU0 bottlenecks.

Thanks for doing this work!

> +	status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
> +	if (!(status & QSPI_RDY))
> +		return IRQ_NONE;
> +
> +	spin_lock(&tqspi->lock);

Can you help me to understand what the tqspi->lock protects? I am still
a bit confused by this lock, but at the first glance, I am wondering if
you don't need to have the lock while reading the status.

Thanks
--breno
Mark Brown May 20, 2026, 3:28 p.m. UTC | #3
On Wed, May 20, 2026 at 10:22:53AM +0100, Jon Hunter wrote:
> 
> 
> On 19/05/2026 16:51, Vishwaroop A wrote:
> > Threaded IRQ handlers suffer from scheduler latency on heavily loaded
> > systems, causing false transfer timeouts. Convert to hard IRQ handler
> > that schedules work on a high-priority unbound workqueue.

Please delete unneeded context from mails when replying.  Doing this
makes it much easier to find your reply in the message, helping ensure
it won't be missed by people scrolling through the irrelevant quoted
material.
Vishwaroop A May 20, 2026, 7:22 p.m. UTC | #4
On Wed, May 20, 2026 at 08:25:23AM -0700, Breno Leitao wrote:
> > +     status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
> > +     if (!(status & QSPI_RDY))
> > +             return IRQ_NONE;
> > +
> > +     spin_lock(&tqspi->lock);
>
> Can you help me to understand what the tqspi->lock protects? I am still
> a bit confused by this lock, but at the first glance, I am wondering if
> you don't need to have the lock while reading the status.

Good question. The QSPI_TRANS_STATUS read before the lock is a hardware
register read (MMIO) used as the IRQF_SHARED ownership check -- we read
QSPI_RDY to determine if this interrupt belongs to us. If not, we
return IRQ_NONE immediately. Taking the lock before this check would
serialize against every unrelated interrupt on the shared line for no
benefit, since we're reading hardware state that no software path can
modify concurrently.

The only CPU-side write to QSPI_TRANS_STATUS is the write-1-to-clear
inside tegra_qspi_mask_clear_irq(), which happens under the lock at
line 1655, after we've already confirmed the interrupt is ours. The
register is also cleared at the start of each new message in
tegra_qspi_setup_transfer_one() -> tegra_qspi_mask_clear_irq(), but
that runs before the transfer is started and interrupts are unmasked,
so there's no overlap with the ISR.

The lock itself protects the software state that is shared between
the ISR, the workqueue bottom-half, and the timeout handler running
in the transfer thread. Specifically, curr_xfer is read and NULLed
by handle_cpu_based_xfer/handle_dma_based_xfer under the lock, and
checked by the timeout handler under the lock, so concurrent access
is serialized. The ISR writes status_reg, tx_status, and rx_status
under the lock before scheduling the workqueue; the workqueue and
transfer thread read them after the scheduling barrier or completion
respectively, so the lock provides the write-side ordering.

The trans_status caching (tqspi->trans_status = status) before the
lock is safe because the ISR is the sole writer in interrupt context.
The only reader is handle_timeout, which reads it under
spin_lock_irqsave after wait_for_completion_timeout expires. The
setup path resets it to zero under the lock before starting each
transfer, well before the ISR can fire.

Vishwaroop
Breno Leitao May 21, 2026, 3:04 p.m. UTC | #5
Hello Vishwaroop,

On Wed, May 20, 2026 at 07:22:10PM +0000, Vishwaroop A wrote:
> On Wed, May 20, 2026 at 08:25:23AM -0700, Breno Leitao wrote:
> > > +     status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
> > > +     if (!(status & QSPI_RDY))
> > > +             return IRQ_NONE;
> > > +
> > > +     spin_lock(&tqspi->lock);
> >
> > Can you help me to understand what the tqspi->lock protects? I am still
> > a bit confused by this lock, but at the first glance, I am wondering if
> > you don't need to have the lock while reading the status.
> 
> The lock itself protects the software state that is shared between
> the ISR, the workqueue bottom-half, and the timeout handler running
> in the transfer thread.

Thanks for the reply!

I got the impression that tqspi->lock is also serializing hardware
accesses — the hard IRQ holds it across the readl(QSPI_FIFO_STATUS) and
across tegra_qspi_mask_clear_irq(), which does an RMW on QSPI_INTR_MASK
and a W1C on QSPI_TRANS_STATUS; the setup, start, error and bottom-half
paths similarly hold it across MMIO.

If the lock is really only guarding curr_xfer / status_reg / tx_status
/ rx_status, why hold it across those register accesses at all?

Thanks,
--breno
Vishwaroop A May 22, 2026, 9:09 a.m. UTC | #6
On Thu, May 21, 2026 at 08:04:08AM -0700, Breno Leitao wrote:
> If the lock is really only guarding curr_xfer / status_reg / tx_status
> / rx_status, why hold it across those register accesses at all?

Thanks for pushing on this, it's a fair question and worth spelling
out more carefully.

The lock spans the register accesses because the FIFO_STATUS read and
the tx_status / rx_status / status_reg writes form one update -- the
software fields are populated directly from that read. The bottom-half
(handle_cpu_based_xfer at line 1467) and the timeout handler (which
calls into handle_cpu_based_xfer at line 1098 after releasing its own
acquisition at line 1092) both read tx_status and rx_status under the
same lock. Splitting the lock so the read is outside would let those
callers observe partially updated software state.

The mask_clear_irq call is inside the same critical section because
it is the W1C that retires the interrupt source the ISR just consumed.
Pairing it with the FIFO_STATUS read and the software writes ensures
that any subsequent acquirer of the lock sees either "interrupt is
pending and software fields are stale" or "interrupt is retired and
software fields are current" -- never the middle state.

handle_cpu_based_xfer holds the lock across its FIFO drain and
sub-transfer restart for the same reason: the restart calls
tegra_qspi_unmask_irq, which can re-trigger the ISR before the
handler finishes setting curr_xfer = NULL at line 1497. Releasing
the lock only after that assignment lets the next ISR's critical
section start from a consistent state.

TRANS_STATUS is read outside the lock because that read precedes the
sequence -- it is the IRQF_SHARED ownership check, a single readl
with no associated software state and no RMW. The two CPU-side paths
that clear it are tegra_qspi_mask_clear_irq inside the ISR's locked
section (line 1655) and tegra_qspi_mask_clear_irq from
setup_transfer_one (line 862), which runs before the controller
unmasks its interrupts, so it cannot overlap with an ISR.

Vishwaroop
diff mbox series

Patch

diff --git a/drivers/spi/spi-tegra210-quad.c b/drivers/spi/spi-tegra210-quad.c
index db28dd556484..17d0b511af1d 100644
--- a/drivers/spi/spi-tegra210-quad.c
+++ b/drivers/spi/spi-tegra210-quad.c
@@ -191,6 +191,8 @@  struct tegra_qspi {
 	void __iomem				*base;
 	phys_addr_t				phys;
 	unsigned int				irq;
+	struct work_struct			irq_work;
+	struct workqueue_struct			*wq;
 
 	u32					cur_speed;
 	unsigned int				cur_pos;
@@ -1225,9 +1227,9 @@  static int tegra_qspi_combined_seq_xfer(struct tegra_qspi *tqspi,
 
 			if (ret == 0) {
 				/*
-				 * Check if hardware completed the transfer
-				 * even though interrupt was lost or delayed.
-				 * If so, process the completion and continue.
+				 * Check if hardware completed the transfer even though
+				 * workqueue was delayed. If so, process completion and
+				 * continue.
 				 */
 				ret = tegra_qspi_handle_timeout(tqspi);
 				if (ret < 0) {
@@ -1344,8 +1346,8 @@  static int tegra_qspi_non_combined_seq_xfer(struct tegra_qspi *tqspi,
 		if (ret == 0) {
 			/*
 			 * Check if hardware completed the transfer even though
-			 * interrupt was lost or delayed. If so, process the
-			 * completion and continue.
+			 * workqueue was delayed. If so, process completion and
+			 * continue.
 			 */
 			ret = tegra_qspi_handle_timeout(tqspi);
 			if (ret < 0) {
@@ -1574,46 +1576,40 @@  static irqreturn_t handle_dma_based_xfer(struct tegra_qspi *tqspi)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t tegra_qspi_isr_thread(int irq, void *context_data)
+/**
+ * tegra_qspi_work_handler - Workqueue handler for interrupt bottom-half
+ * @work: work_struct embedded in tegra_qspi
+ *
+ * Runs in process context and can sleep (needed for DMA completion waits).
+ * Can run on any available CPU, avoiding CPU0 bottleneck that occurs with
+ * threaded IRQ handlers which are pinned to the IRQ's CPU.
+ *
+ * The hard IRQ handler has already:
+ * - Verified this is our interrupt (QSPI_RDY was set)
+ * - Cached FIFO status in tqspi->status_reg
+ * - Parsed tx_status / rx_status from FIFO status
+ * - Masked further interrupts
+ */
+static void tegra_qspi_work_handler(struct work_struct *work)
 {
-	struct tegra_qspi *tqspi = context_data;
+	struct tegra_qspi *tqspi = container_of(work, struct tegra_qspi, irq_work);
 	unsigned long flags;
-	u32 status;
 
-	/*
-	 * Read transfer status to check if interrupt was triggered by transfer
-	 * completion
-	 */
-	status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
+	spin_lock_irqsave(&tqspi->lock, flags);
 
 	/*
-	 * Occasionally the IRQ thread takes a long time to wake up (usually
-	 * when the CPU that it's running on is excessively busy) and we have
-	 * already reached the timeout before and cleaned up the timed out
-	 * transfer. Avoid any processing in that case and bail out early.
-	 *
-	 * If no transfer is in progress, check if this was a real interrupt
-	 * that the timeout handler already processed, or a spurious one.
+	 * Check if timeout handler already processed this transfer.
+	 * Can happen if work was delayed and timeout fired first. If
+	 * so, we must unmask interrupts before returning, otherwise
+	 * they remain masked from the hard IRQ handler and the next
+	 * transfer will timeout.
 	 */
-	spin_lock_irqsave(&tqspi->lock, flags);
 	if (!tqspi->curr_xfer) {
 		spin_unlock_irqrestore(&tqspi->lock, flags);
-		/* Spurious interrupt - transfer not ready */
-		if (!(status & QSPI_RDY))
-			return IRQ_NONE;
-		/* Real interrupt, already handled by timeout path */
-		return IRQ_HANDLED;
+		tegra_qspi_unmask_irq(tqspi);
+		return;
 	}
 
-	tqspi->status_reg = tegra_qspi_readl(tqspi, QSPI_FIFO_STATUS);
-
-	if (tqspi->cur_direction & DATA_DIR_TX)
-		tqspi->tx_status = tqspi->status_reg & (QSPI_TX_FIFO_UNF | QSPI_TX_FIFO_OVF);
-
-	if (tqspi->cur_direction & DATA_DIR_RX)
-		tqspi->rx_status = tqspi->status_reg & (QSPI_RX_FIFO_OVF | QSPI_RX_FIFO_UNF);
-
-	tegra_qspi_mask_clear_irq(tqspi);
 	spin_unlock_irqrestore(&tqspi->lock, flags);
 
 	/*
@@ -1623,9 +1619,46 @@  static irqreturn_t tegra_qspi_isr_thread(int irq, void *context_data)
 	 * cannot be done while holding spinlock.
 	 */
 	if (!tqspi->is_curr_dma_xfer)
-		return handle_cpu_based_xfer(tqspi);
+		handle_cpu_based_xfer(tqspi);
+	else
+		handle_dma_based_xfer(tqspi);
+}
+
+/**
+ * tegra_qspi_isr - Hard IRQ handler
+ * @irq: IRQ number
+ * @context_data: QSPI controller instance
+ *
+ * Runs in hard IRQ context with minimal latency. Cannot sleep.
+ *
+ * Return: IRQ_NONE if not our interrupt, IRQ_HANDLED if handled
+ */
+static irqreturn_t tegra_qspi_isr(int irq, void *context_data)
+{
+	struct tegra_qspi *tqspi = context_data;
+	u32 status;
+
+	status = tegra_qspi_readl(tqspi, QSPI_TRANS_STATUS);
+	if (!(status & QSPI_RDY))
+		return IRQ_NONE;
+
+	spin_lock(&tqspi->lock);
+	tqspi->status_reg = tegra_qspi_readl(tqspi, QSPI_FIFO_STATUS);
+	tegra_qspi_mask_clear_irq(tqspi);
 
-	return handle_dma_based_xfer(tqspi);
+	if (tqspi->cur_direction & DATA_DIR_TX)
+		tqspi->tx_status = tqspi->status_reg &
+				    (QSPI_TX_FIFO_UNF | QSPI_TX_FIFO_OVF);
+
+	if (tqspi->cur_direction & DATA_DIR_RX)
+		tqspi->rx_status = tqspi->status_reg &
+				    (QSPI_RX_FIFO_OVF | QSPI_RX_FIFO_UNF);
+
+	spin_unlock(&tqspi->lock);
+
+	queue_work(tqspi->wq, &tqspi->irq_work);
+
+	return IRQ_HANDLED;
 }
 
 static struct tegra_qspi_soc_data tegra210_qspi_soc_data = {
@@ -1793,9 +1826,19 @@  static int tegra_qspi_probe(struct platform_device *pdev)
 
 	pm_runtime_put_autosuspend(&pdev->dev);
 
-	ret = request_threaded_irq(tqspi->irq, NULL,
-				   tegra_qspi_isr_thread, IRQF_ONESHOT,
-				   dev_name(&pdev->dev), tqspi);
+	tqspi->wq = devm_alloc_workqueue(&pdev->dev, "%s",
+					 WQ_HIGHPRI | WQ_UNBOUND, 0,
+					 dev_name(&pdev->dev));
+	if (!tqspi->wq) {
+		dev_err(&pdev->dev, "failed to allocate workqueue\n");
+		ret = -ENOMEM;
+		goto exit_pm_disable;
+	}
+
+	INIT_WORK(&tqspi->irq_work, tegra_qspi_work_handler);
+
+	ret = devm_request_irq(&pdev->dev, tqspi->irq, tegra_qspi_isr,
+			       IRQF_SHARED, dev_name(&pdev->dev), tqspi);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to request IRQ#%u: %d\n", tqspi->irq, ret);
 		goto exit_pm_disable;
@@ -1804,13 +1847,11 @@  static int tegra_qspi_probe(struct platform_device *pdev)
 	ret = spi_register_controller(host);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to register host: %d\n", ret);
-		goto exit_free_irq;
+		goto exit_pm_disable;
 	}
 
 	return 0;
 
-exit_free_irq:
-	free_irq(qspi_irq, tqspi);
 exit_pm_disable:
 	pm_runtime_dont_use_autosuspend(&pdev->dev);
 	pm_runtime_force_suspend(&pdev->dev);
@@ -1824,7 +1865,6 @@  static void tegra_qspi_remove(struct platform_device *pdev)
 	struct tegra_qspi *tqspi = spi_controller_get_devdata(host);
 
 	spi_unregister_controller(host);
-	free_irq(tqspi->irq, tqspi);
 	pm_runtime_dont_use_autosuspend(&pdev->dev);
 	pm_runtime_force_suspend(&pdev->dev);
 	tegra_qspi_deinit_dma(tqspi);