[PATCH v7 06/16] arm64: ras: Enable error reporting
From: Ruidong Tian
Date: Tue Jun 02 2026 - 03:17:04 EST
Until now, AEST nodes were registered but their interrupt routing was
never programmed, so probe finished without actually arming any error
source. Both shared and PE (oncore) nodes need this bring-up; the only
difference is that oncore state is per-CPU and must follow CPU
online/offline transitions.
Signed-off-by: Ruidong Tian <tianruidong@xxxxxxxxxxxxxxxxx>
---
arch/arm64/include/asm/ras.h | 10 +++
drivers/acpi/arm64/aest.c | 4 +-
drivers/ras/arm64/ras-core.c | 159 ++++++++++++++++++++++++++++++++++-
drivers/ras/arm64/ras.h | 32 +++++++
include/linux/cpuhotplug.h | 1 +
5 files changed, 204 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
index b6640b9972bf..a992610d7755 100644
--- a/arch/arm64/include/asm/ras.h
+++ b/arch/arm64/include/asm/ras.h
@@ -2,8 +2,18 @@
#ifndef __ASM_RAS_H
#define __ASM_RAS_H
+#include <linux/bits.h>
#include <linux/types.h>
+/* ERR<n>CTLR */
+#define ERR_CTLR_CFI BIT(8)
+#define ERR_CTLR_FI BIT(3)
+#define ERR_CTLR_UI BIT(2)
+
+/* ERRIRQCR<n> */
+#define ERRFHICR0_OFFSET 0x0
+#define ERRERICR0_OFFSET 0x10
+
struct ras_ext_regs {
u64 err_fr;
u64 err_ctlr;
diff --git a/drivers/acpi/arm64/aest.c b/drivers/acpi/arm64/aest.c
index af03f4365cfa..5733c91c8e0d 100644
--- a/drivers/acpi/arm64/aest.c
+++ b/drivers/acpi/arm64/aest.c
@@ -157,6 +157,8 @@ aest_init_node_props(struct acpi_aest_hdr *hdr, struct property_entry *props,
common->error_group_register_base);
props[(*p)++] = PROPERTY_ENTRY_U64("arm,fault-inject-base",
common->fault_inject_register_base);
+ props[(*p)++] = PROPERTY_ENTRY_U64("arm,interrupt-config-base",
+ common->interrupt_config_register_base);
props[(*p)++] = PROPERTY_ENTRY_U32("arm,fhi-gsiv", fhi_gsiv);
props[(*p)++] = PROPERTY_ENTRY_U32("arm,eri-gsiv", eri_gsiv);
@@ -171,7 +173,7 @@ aest_init_node_props(struct acpi_aest_hdr *hdr, struct property_entry *props,
static int __init
aest_create_node_fwnode(struct acpi_aest_hdr *hdr, struct platform_device *pdev)
{
- struct property_entry props[14] = { };
+ struct property_entry props[15] = { };
int p = 0;
int ret;
diff --git a/drivers/ras/arm64/ras-core.c b/drivers/ras/arm64/ras-core.c
index 9520415df8cb..98f274b9731d 100644
--- a/drivers/ras/arm64/ras-core.c
+++ b/drivers/ras/arm64/ras-core.c
@@ -5,6 +5,7 @@
* Copyright (c) 2025, Alibaba Group.
*/
+#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/platform_device.h>
@@ -45,6 +46,21 @@ const struct ras_group ras_group_config[] = {
},
};
+/*
+ * Call @func once for every record of @node whose bit in @bitmap is clear.
+ *
+ * Only indices below node->record_count are considered. Each callback is
+ * bracketed by ras_select_record() before and ras_sync() after, and @data
+ * is handed to @func unchanged.
+ */
+static void ras_node_foreach_record(void (*func)(struct ras_record *, void *),
+				    struct ras_node *node, void *data,
+				    unsigned long *bitmap)
+{
+	int idx;
+
+	for_each_clear_bit(idx, bitmap, node->record_count) {
+		struct ras_record *rec = &node->records[idx];
+
+		ras_select_record(node, idx);
+		func(rec, data);
+		ras_sync(node);
+	}
+}
+
static irqreturn_t ras_irq_func(int irq, void *input)
{
struct ras_node *node = input;
@@ -52,6 +68,23 @@ static irqreturn_t ras_irq_func(int irq, void *input)
return IRQ_HANDLED;
}
+/*
+ * Program the node's interrupt configuration registers.
+ *
+ * Does nothing when the node has no interrupt-config window
+ * (node->irq_config is NULL). Otherwise each GSIV that is present as a
+ * device property ("arm,fhi-gsiv", "arm,eri-gsiv") is written to the
+ * matching ERR*ICR0 register; absent properties leave their register
+ * untouched.
+ */
+static void ras_config_irq(struct ras_node *node)
+{
+	/*
+	 * Zero-initialised because the debug print below runs even when a
+	 * property lookup fails and would otherwise read stack garbage.
+	 */
+	u32 fhi_gsi = 0, eri_gsi = 0;
+
+	if (!node->irq_config)
+		return;
+
+	if (!device_property_read_u32(node->dev, "arm,fhi-gsiv", &fhi_gsi))
+		writeq_relaxed(fhi_gsi, node->irq_config + ERRFHICR0_OFFSET);
+
+	if (!device_property_read_u32(node->dev, "arm,eri-gsiv", &eri_gsi))
+		writeq_relaxed(eri_gsi, node->irq_config + ERRERICR0_OFFSET);
+
+	ras_node_dbg(node, "config irq fhi_gsi %u eri_gsi %u at %pK",
+		     fhi_gsi, eri_gsi, node->irq_config);
+}
+
static int ras_register_irq(struct ras_node *node)
{
int i, irq, ret;
@@ -94,6 +127,21 @@ static int ras_register_irq(struct ras_node *node)
return ret;
}
+/*
+ * Enable interrupt generation for @record in its ERXCTLR register.
+ *
+ * The register is read, the enable bits for every interrupt the owning
+ * node has registered are OR-ed in, and the result is written back:
+ * FI and CFI for the fault-handling interrupt, UI for the error-recovery
+ * interrupt. Bits that are already set are preserved.
+ */
+static void ras_enable_irq(struct ras_record *record)
+{
+	struct ras_node *node = record->node;
+	u64 err_ctlr;
+
+	err_ctlr = record_read(record, ERXCTLR);
+
+	/* Index by interrupt type, as the rest of this file does. */
+	if (node->irq[ACPI_AEST_NODE_FAULT_HANDLING])
+		err_ctlr |= (ERR_CTLR_FI | ERR_CTLR_CFI);
+	if (node->irq[ACPI_AEST_NODE_ERROR_RECOVERY])
+		err_ctlr |= ERR_CTLR_UI;
+
+	record_write(record, ERXCTLR, err_ctlr);
+}
+
static int ras_init_record(struct ras_record *record, int i, struct ras_node *node)
{
record->name = devm_kasprintf(node->dev, GFP_KERNEL, "record%d", i);
@@ -110,6 +158,85 @@ static int ras_init_record(struct ras_record *record, int i, struct ras_node *no
return 0;
}
+/*
+ * Per-record bring-up callback for ras_node_foreach_record().
+ * @data is part of the callback signature and is not used.
+ */
+static void ras_online_record(struct ras_record *record, void *data)
+{
+	/* Bringing a record online currently means enabling its interrupts. */
+	ras_enable_irq(record);
+}
+
+/*
+ * Bring @node online: program its interrupt configuration, then run
+ * ras_online_record() on every record whose bit in
+ * node->record_implemented is clear. Nodes without a name are skipped.
+ */
+static void ras_online_node(struct ras_node *node)
+{
+	if (node->name) {
+		ras_config_irq(node);
+
+		ras_node_foreach_record(ras_online_record, node, NULL,
+					node->record_implemented);
+	}
+}
+
+/*
+ * Bring the calling CPU's instance of a per-CPU node online.
+ *
+ * @data is a per-CPU pointer to struct ras_node; it is resolved with
+ * this_cpu_ptr(), so this must run on the CPU being brought up. After
+ * the node is onlined, each registered interrupt (irq number > 0) is
+ * enabled for this CPU.
+ */
+static void ras_online_oncore_dev(void *data)
+{
+	struct ras_node *node = this_cpu_ptr(data);
+	int irq;
+
+	ras_online_node(node);
+
+	irq = node->irq[ACPI_AEST_NODE_FAULT_HANDLING];
+	if (irq > 0)
+		enable_percpu_irq(irq, IRQ_TYPE_NONE);
+
+	irq = node->irq[ACPI_AEST_NODE_ERROR_RECOVERY];
+	if (irq > 0)
+		enable_percpu_irq(irq, IRQ_TYPE_NONE);
+}
+
+/*
+ * Counterpart of ras_online_oncore_dev(): disable this CPU's registered
+ * per-CPU interrupts (irq number > 0).
+ *
+ * @data is a per-CPU pointer to struct ras_node, resolved with
+ * this_cpu_ptr(). The records' ERXCTLR enable bits are left as they are.
+ */
+static void ras_offline_oncore_dev(void *data)
+{
+	struct ras_node *node = this_cpu_ptr(data);
+	int irq;
+
+	irq = node->irq[ACPI_AEST_NODE_FAULT_HANDLING];
+	if (irq > 0)
+		disable_percpu_irq(irq);
+
+	irq = node->irq[ACPI_AEST_NODE_ERROR_RECOVERY];
+	if (irq > 0)
+		disable_percpu_irq(irq);
+}
+
+/*
+ * CPU hotplug startup callback: online the per-CPU RAS node of the CPU
+ * that is coming up. Always returns 0.
+ */
+static int ras_starting_cpu(unsigned int cpu)
+{
+	pr_debug("CPU%d starting\n", cpu);
+
+	ras_online_oncore_dev(&percpu_ras_node);
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: offline the per-CPU RAS node of the CPU
+ * that is going down. Always returns 0.
+ */
+static int ras_dying_cpu(unsigned int cpu)
+{
+	pr_debug("CPU%d dying\n", cpu);
+
+	ras_offline_oncore_dev(&percpu_ras_node);
+	return 0;
+}
+
+/*
+ * Platform driver remove callback.
+ *
+ * Clears the driver data and, for on-core nodes only, undoes what
+ * arm64_ras_probe() set up: removes the CPU hotplug state, disables the
+ * per-CPU interrupts on every online CPU and frees them.
+ *
+ * The on-core test must be the same one probe uses
+ * (ras_node_is_oncore()). A processor node wired to a shared interrupt
+ * never installed the hotplug state and does not own per-CPU IRQs, so
+ * checking node->type alone would tear down state this node never
+ * created.
+ */
+static void arm64_ras_remove(struct platform_device *pdev)
+{
+	struct ras_node *node = platform_get_drvdata(pdev);
+	int i;
+
+	platform_set_drvdata(pdev, NULL);
+
+	if (!ras_node_is_oncore(node))
+		return;
+
+	cpuhp_remove_state(CPUHP_AP_ARM_RAS_STARTING);
+	/*
+	 * NOTE(review): cpuhp_remove_state() already runs the teardown
+	 * callback on online CPUs; confirm whether this extra pass is
+	 * still needed.
+	 */
+	on_each_cpu(ras_offline_oncore_dev, node->oncore_node, 1);
+
+	for (i = 0; i < AEST_MAX_INTERRUPT_PER_NODE; i++) {
+		if (node->irq[i])
+			free_percpu_irq(node->irq[i], node->oncore_node);
+	}
+}
+
static char *alloc_ras_node_name(struct ras_node *node)
{
char *name;
@@ -208,6 +335,23 @@ static int ras_node_set_inj_base(struct ras_node *node, phys_addr_t base)
return 0;
}
+/*
+ * Locate the node's interrupt configuration registers.
+ *
+ * @node: node being initialised; node->base must already be mapped.
+ * @base: physical address that node->base corresponds to.
+ *
+ * When the node advertises AEST_XFACE_FLAG_INT_CONFIG and firmware
+ * supplies a non-zero "arm,interrupt-config-base", node->irq_config is
+ * set to the matching address inside the existing mapping. In every
+ * other case node->irq_config is left untouched.
+ *
+ * Return: always 0; a missing or unusable interrupt-config base is not
+ * treated as an error.
+ */
+static int ras_node_set_irq_base(struct ras_node *node, phys_addr_t base)
+{
+	/* device_property_read_u64() takes a u64 *, not a phys_addr_t *. */
+	u64 irq_base;
+
+	if (!(node->flags & AEST_XFACE_FLAG_INT_CONFIG))
+		return 0;
+
+	if (device_property_read_u64(node->dev, "arm,interrupt-config-base",
+				     &irq_base) || !irq_base)
+		return 0;
+
+	/*
+	 * An address below the mapped base would make the subtraction wrap
+	 * and yield a pointer outside the mapping.
+	 */
+	if (irq_base < base) {
+		dev_warn(node->dev,
+			 "interrupt config base %#llx is below node base %pa, ignoring\n",
+			 irq_base, &base);
+		return 0;
+	}
+
+	node->irq_config = node->base + (irq_base - base);
+	return 0;
+}
+
static struct ras_node *ras_init_node(struct platform_device *pdev)
{
int i, ret = 0;
@@ -276,6 +420,9 @@ static struct ras_node *ras_init_node(struct platform_device *pdev)
ret = ras_node_set_inj_base(node, mem->start);
if (ret)
return ERR_PTR(ret);
+ ret = ras_node_set_irq_base(node, mem->start);
+ if (ret)
+ return ERR_PTR(ret);
} else if (node->access_type == ACPI_AEST_NODE_MEMORY_MAPPED) {
return ERR_PTR(-EINVAL);
}
@@ -372,6 +519,15 @@ static int arm64_ras_probe(struct platform_device *pdev)
return ret;
}
+ if (ras_node_is_oncore(node))
+ ret = cpuhp_setup_state(CPUHP_AP_ARM_RAS_STARTING,
+ "drivers/ras/arm64/ras:starting",
+ ras_starting_cpu, ras_dying_cpu);
+ else
+ ras_online_node(node);
+ if (ret)
+ return ret;
+
platform_set_drvdata(pdev, node);
return 0;
@@ -381,7 +537,8 @@ static struct platform_driver arm64_ras_driver = {
.driver = {
.name = "arm64_ras",
},
- .probe = arm64_ras_probe,
+ .probe = arm64_ras_probe,
+ .remove = arm64_ras_remove,
};
static int __init arm64_ras_init(void)
diff --git a/drivers/ras/arm64/ras.h b/drivers/ras/arm64/ras.h
index b64eae59b6ac..c26a0aae26c5 100644
--- a/drivers/ras/arm64/ras.h
+++ b/drivers/ras/arm64/ras.h
@@ -90,6 +90,7 @@ struct ras_node {
void __iomem *base;
void __iomem *errgsr;
void __iomem *inj;
+ void __iomem *irq_config;
phys_addr_t addr;
u8 *specific_data;
@@ -126,6 +127,7 @@ struct ras_node {
u8 access_type;
u8 group_format;
u32 irq[AEST_MAX_INTERRUPT_PER_NODE];
+ u32 gsi[AEST_MAX_INTERRUPT_PER_NODE];
};
#define CASE_READ(res, x) \
@@ -207,4 +209,34 @@ static const struct ras_access ras_access[] = {
},
};
+/*
+ * Report whether @node is an "on-core" node.
+ *
+ * A node is on-core only when it is a processor error node and at least
+ * one of its interrupts is a per-CPU interrupt; such nodes are brought up
+ * through the CPU hotplug callbacks. A processor node whose interrupts
+ * are not per-CPU (e.g. a shared cluster L3 cache or DSU) must take the
+ * non-oncore path and be onlined directly with ras_online_node().
+ */
+static inline bool ras_node_is_oncore(struct ras_node *node)
+{
+	if (node->type != ACPI_AEST_PROCESSOR_ERROR_NODE)
+		return false;
+
+	if (irq_is_percpu(node->irq[ACPI_AEST_NODE_FAULT_HANDLING]))
+		return true;
+
+	return irq_is_percpu(node->irq[ACPI_AEST_NODE_ERROR_RECOVERY]);
+}
+
+/*
+ * Select record @index for subsequent accesses.
+ *
+ * Only processor error nodes need this: @index is written to ERRSELR_EL1
+ * and followed by an ISB. For every other node type this is a no-op.
+ */
+static inline void ras_select_record(struct ras_node *node, int index)
+{
+	if (node->type != ACPI_AEST_PROCESSOR_ERROR_NODE)
+		return;
+
+	write_sysreg_s(index, SYS_ERRSELR_EL1);
+	isb();
+}
+
+/*
+ * Ensure all writes have taken effect. Processor error nodes get an ISB;
+ * every other node type is a no-op.
+ */
+static inline void ras_sync(struct ras_node *node)
+{
+	if (node->type != ACPI_AEST_PROCESSOR_ERROR_NODE)
+		return;
+
+	isb();
+}
+
#endif /* _DRIVERS_RAS_ARM64_RAS_H_ */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 62cd7b35a29c..ef55c10f6c71 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -179,6 +179,7 @@ enum cpuhp_state {
CPUHP_AP_HYPERV_TIMER_STARTING,
/* Must be the last timer callback */
CPUHP_AP_DUMMY_TIMER_STARTING,
+ CPUHP_AP_ARM_RAS_STARTING,
CPUHP_AP_ARM_XEN_STARTING,
CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
CPUHP_AP_ARM_CORESIGHT_STARTING,
--
2.51.2.612.gdc70283dfc