[PATCH 5/6] drm/msm/a6xx: Fix IRQ storm during msm_recovery test

From: Akhil P Oommen

Date: Thu Jun 04 2026 - 16:11:39 EST


From: Jie Zhang <jie.zhang@xxxxxxxxxxxxxxxx>

Once a hang is triggered by the msm_recovery test, the GPU error IRQ
remains asserted and causes an interrupt storm. In the worst case,
this IRQ storm lands on the CPU core where the hangcheck timer is
scheduled and prevents the timer from running, which eventually leads
to CPU watchdog timeouts.

To fix this, mask the GPU error IRQs from the IRQ handler while the
msm_recovery test has hardware error handling disabled, and re-enable
them during recovery.

Fixes: 5edf2750d998 ("drm/msm: Add debugfs to disable hw err handling")
Signed-off-by: Jie Zhang <jie.zhang@xxxxxxxxxxxxxxxx>
Signed-off-by: Akhil P Oommen <akhilpo@xxxxxxxxxxxxxxxx>
---
drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 5 +++++
drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 5 ++++-
drivers/gpu/drm/msm/adreno/a8xx_gpu.c | 5 ++++-
drivers/gpu/drm/msm/msm_gpu.c | 2 ++
4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 2c0bbac43c52..f1df2514c613 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1275,6 +1275,11 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR);

if (priv->disable_err_irq) {
+ /* Turn off interrupts to avoid interrupt storm */
+ gpu_write(gpu, REG_A5XX_RBBM_INT_0_MASK,
+ A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS |
+ A5XX_RBBM_INT_0_MASK_CP_SW);
+
status &= A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS |
A5XX_RBBM_INT_0_MASK_CP_SW;
}
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 8b3bb2fd433b..9a4f9d0e1780 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1911,8 +1911,11 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)

gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status);

- if (priv->disable_err_irq)
+ if (priv->disable_err_irq) {
+ /* Turn off interrupts to avoid interrupt storm */
+ gpu_write(gpu, REG_A6XX_RBBM_INT_0_MASK, A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS);
status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS;
+ }

if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
a6xx_fault_detect_irq(gpu);
diff --git a/drivers/gpu/drm/msm/adreno/a8xx_gpu.c b/drivers/gpu/drm/msm/adreno/a8xx_gpu.c
index 9e44fd1ae634..0f6fd35bd587 100644
--- a/drivers/gpu/drm/msm/adreno/a8xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a8xx_gpu.c
@@ -1211,8 +1211,11 @@ irqreturn_t a8xx_irq(struct msm_gpu *gpu)

gpu_write(gpu, REG_A8XX_RBBM_INT_CLEAR_CMD, status);

- if (priv->disable_err_irq)
+ if (priv->disable_err_irq) {
+ /* Turn off interrupts to avoid interrupt storm */
+ gpu_write(gpu, REG_A8XX_RBBM_INT_0_MASK, A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS);
status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS;
+ }

if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
a8xx_fault_detect_irq(gpu);
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 9ac7740a87f0..48ac51f4119b 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -552,6 +552,8 @@ static void recover_worker(struct kthread_work *work)
msm_update_fence(ring->fctx, fence);
}

+ priv->disable_err_irq = false;
+
gpu->funcs->recover(gpu);

/* retire completed submits, plus the one that hung: */

--
2.51.0