rasdaemon/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch

139 lines
3.8 KiB
Diff
Raw Normal View History

2022-03-28 16:19:44 +08:00
From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001
From: Shengwei Luo <luoshengwei@huawei.com>
Date: Wed, 23 Feb 2022 17:23:27 +0800
Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors
When a recoverable error occurs in a CPU core, try to offline
the affected CPU core.
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
---
ras-arm-handler.c | 21 ++++++++++++++++++---
ras-cpu-isolation.c | 17 +++++++++++++++++
ras-cpu-isolation.h | 4 +++-
3 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index c9ef2fd..dae5ad6 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s,
}
#ifdef HAVE_CPU_FAULT_ISOLATION
-static int count_errors(struct ras_arm_event *ev)
+static int is_core_failure(struct ras_arm_err_info *err_info)
+{
+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
+ /*
+ * Core failure: at least one of flag bits 0, 1 or 3 is set
+ * and bit 2 is clear. Bit 2 is rejected by the second test,
+ * so its presence in the 0xf mask does not change the result.
+ */
+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2));
+ }
+ return 0;
+}
+
+static int count_errors(struct ras_arm_event *ev, int sev)
{
struct ras_arm_err_info *err_info;
int num_pei;
@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev)
*/
error_count = err_info->multiple_error + 1;
}
+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
+ error_count = 0;
num += error_count;
err_info += 1;
@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s,
}
trace_seq_printf(s, "\n severity: %s", severity);
- if (val == GHES_SEV_CORRECTED) {
- nums = count_errors(&ev);
+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
+ nums = count_errors(&ev, val);
if (nums > 0) {
err_info.nums = nums;
err_info.time = now;
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
index 8c0cdf9..e650022 100644
--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus)
for (unsigned int i = 0; i < cpus; ++i) {
cpu_infos[i].ce_nums = 0;
+ cpu_infos[i].uce_nums = 0;
cpu_infos[i].state = get_cpu_status(i);
cpu_infos[i].ce_queue = init_queue();
@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu)
return HANDLE_NOTHING;
}
+static int do_uce_handler(unsigned int cpu)
+{
+ if (cpu_infos[cpu].uce_nums > 0) { /* any recorded UCE triggers an offline attempt */
+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu);
+ return do_cpu_offline(cpu);
+ }
+ return HANDLE_NOTHING;
+}
+
static int error_handler(unsigned int cpu, struct error_info *err_info)
{
int ret = HANDLE_NOTHING;
@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
case CE:
ret = do_ce_handler(cpu);
break;
+ case UCE:
+ ret = do_uce_handler(cpu);
+ break;
default:
break;
}
@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
cpu_infos[cpu].ce_nums += err_info->nums;
break;
}
+ case UCE:
+ cpu_infos[cpu].uce_nums++;
+ break;
default:
break;
}
@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
cpu, cpu_state[cpu_infos[cpu].state]);
clear_queue(cpu_infos[cpu].ce_queue);
cpu_infos[cpu].ce_nums = 0;
+ cpu_infos[cpu].uce_nums = 0;
} else
log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
cpu, cpu_state[cpu_infos[cpu].state]);
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
index 1159853..024a68b 100644
--- a/ras-cpu-isolation.h
+++ b/ras-cpu-isolation.h
@@ -46,10 +46,12 @@ enum error_handle_result {
};
enum error_type {
- CE = 1
+ CE = 1,
+ UCE
};
struct cpu_info {
+ unsigned long uce_nums;
unsigned long ce_nums;
struct link_queue *ce_queue;
enum cpu_state state;
--
2.27.0