2022-05-25 17:15:14 +01:00
|
|
|
From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001
|
2022-03-28 16:19:44 +08:00
|
|
|
From: Shengwei Luo <luoshengwei@huawei.com>
|
|
|
|
|
Date: Wed, 23 Feb 2022 17:23:27 +0800
|
2022-05-25 17:15:14 +01:00
|
|
|
Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable
|
|
|
|
|
errors
|
2022-03-28 16:19:44 +08:00
|
|
|
|
|
|
|
|
When recoverable errors occur in a cpu core, try to offline
|
|
|
|
|
the related cpu core.
|
|
|
|
|
|
|
|
|
|
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
|
2022-05-25 17:15:14 +01:00
|
|
|
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
|
|
|
|
|
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
|
|
|
|
|
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
2022-03-28 16:19:44 +08:00
|
|
|
---
|
2022-05-25 17:15:14 +01:00
|
|
|
ras-arm-handler.c | 22 +++++++++++++++++++---
|
2022-03-28 16:19:44 +08:00
|
|
|
ras-cpu-isolation.c | 17 +++++++++++++++++
|
|
|
|
|
ras-cpu-isolation.h | 4 +++-
|
2022-05-25 17:15:14 +01:00
|
|
|
3 files changed, 39 insertions(+), 4 deletions(-)
|
2022-03-28 16:19:44 +08:00
|
|
|
|
|
|
|
|
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
2022-05-25 17:15:14 +01:00
|
|
|
index 9c7a3c3..a0dfc51 100644
|
2022-03-28 16:19:44 +08:00
|
|
|
--- a/ras-arm-handler.c
|
|
|
|
|
+++ b/ras-arm-handler.c
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -26,6 +26,7 @@
|
|
|
|
|
|
|
|
|
|
#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
|
|
|
|
|
#define ARM_ERR_VALID_FLAGS BIT(1)
|
|
|
|
|
+#define BIT2 2
|
|
|
|
|
|
|
|
|
|
void display_raw_data(struct trace_seq *s,
|
|
|
|
|
const uint8_t *buf,
|
|
|
|
|
@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s,
|
2022-03-28 16:19:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifdef HAVE_CPU_FAULT_ISOLATION
|
|
|
|
|
-static int count_errors(struct ras_arm_event *ev)
|
|
|
|
|
+static int is_core_failure(struct ras_arm_err_info *err_info)
|
|
|
|
|
+{
|
|
|
|
|
+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
|
|
|
|
|
+ /*
|
|
|
|
|
+ * core failure:
|
|
|
|
|
+ * Bit 0\1\3: (at least 1)
|
|
|
|
|
+ * Bit 2: 0
|
|
|
|
|
+ */
|
2022-05-25 17:15:14 +01:00
|
|
|
+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2));
|
2022-03-28 16:19:44 +08:00
|
|
|
+ }
|
|
|
|
|
+ return 0;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+static int count_errors(struct ras_arm_event *ev, int sev)
|
|
|
|
|
{
|
|
|
|
|
struct ras_arm_err_info *err_info;
|
|
|
|
|
int num_pei;
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev)
|
2022-03-28 16:19:44 +08:00
|
|
|
*/
|
|
|
|
|
error_count = err_info->multiple_error + 1;
|
|
|
|
|
}
|
|
|
|
|
+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
|
|
|
|
|
+ error_count = 0;
|
|
|
|
|
|
|
|
|
|
num += error_count;
|
|
|
|
|
err_info += 1;
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s,
|
2022-03-28 16:19:44 +08:00
|
|
|
}
|
|
|
|
|
trace_seq_printf(s, "\n severity: %s", severity);
|
|
|
|
|
|
|
|
|
|
- if (val == GHES_SEV_CORRECTED) {
|
2022-05-25 17:15:14 +01:00
|
|
|
- int nums = count_errors(ev);
|
2022-03-28 16:19:44 +08:00
|
|
|
+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
|
2022-05-25 17:15:14 +01:00
|
|
|
+ int nums = count_errors(ev, val);
|
|
|
|
|
|
2022-03-28 16:19:44 +08:00
|
|
|
if (nums > 0) {
|
|
|
|
|
err_info.nums = nums;
|
|
|
|
|
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
2022-05-25 17:15:14 +01:00
|
|
|
index abcf451..fd23e4e 100644
|
2022-03-28 16:19:44 +08:00
|
|
|
--- a/ras-cpu-isolation.c
|
|
|
|
|
+++ b/ras-cpu-isolation.c
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus)
|
2022-03-28 16:19:44 +08:00
|
|
|
|
|
|
|
|
for (unsigned int i = 0; i < cpus; ++i) {
|
|
|
|
|
cpu_infos[i].ce_nums = 0;
|
|
|
|
|
+ cpu_infos[i].uce_nums = 0;
|
|
|
|
|
cpu_infos[i].state = get_cpu_status(i);
|
|
|
|
|
cpu_infos[i].ce_queue = init_queue();
|
|
|
|
|
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu)
|
2022-03-28 16:19:44 +08:00
|
|
|
return HANDLE_NOTHING;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
+static int do_uce_handler(unsigned int cpu)
|
|
|
|
|
+{
|
|
|
|
|
+ if (cpu_infos[cpu].uce_nums > 0) {
|
2022-05-25 17:15:14 +01:00
|
|
|
+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu);
|
2022-03-28 16:19:44 +08:00
|
|
|
+ return do_cpu_offline(cpu);
|
|
|
|
|
+ }
|
|
|
|
|
+ return HANDLE_NOTHING;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
static int error_handler(unsigned int cpu, struct error_info *err_info)
|
|
|
|
|
{
|
|
|
|
|
int ret = HANDLE_NOTHING;
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
|
2022-03-28 16:19:44 +08:00
|
|
|
case CE:
|
|
|
|
|
ret = do_ce_handler(cpu);
|
|
|
|
|
break;
|
|
|
|
|
+ case UCE:
|
|
|
|
|
+ ret = do_uce_handler(cpu);
|
|
|
|
|
+ break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
|
2022-03-28 16:19:44 +08:00
|
|
|
cpu_infos[cpu].ce_nums += err_info->nums;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
+ case UCE:
|
|
|
|
|
+ cpu_infos[cpu].uce_nums++;
|
|
|
|
|
+ break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
2022-05-25 17:15:14 +01:00
|
|
|
@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
2022-03-28 16:19:44 +08:00
|
|
|
cpu, cpu_state[cpu_infos[cpu].state]);
|
|
|
|
|
clear_queue(cpu_infos[cpu].ce_queue);
|
|
|
|
|
cpu_infos[cpu].ce_nums = 0;
|
|
|
|
|
+ cpu_infos[cpu].uce_nums = 0;
|
|
|
|
|
} else
|
|
|
|
|
log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
|
|
|
|
|
cpu, cpu_state[cpu_infos[cpu].state]);
|
|
|
|
|
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
|
|
|
|
index 1159853..024a68b 100644
|
|
|
|
|
--- a/ras-cpu-isolation.h
|
|
|
|
|
+++ b/ras-cpu-isolation.h
|
|
|
|
|
@@ -46,10 +46,12 @@ enum error_handle_result {
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
enum error_type {
|
|
|
|
|
- CE = 1
|
|
|
|
|
+ CE = 1,
|
|
|
|
|
+ UCE
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct cpu_info {
|
|
|
|
|
+ unsigned long uce_nums;
|
|
|
|
|
unsigned long ce_nums;
|
|
|
|
|
struct link_queue *ce_queue;
|
|
|
|
|
enum cpu_state state;
|
|
|
|
|
--
|
2022-05-25 17:15:14 +01:00
|
|
|
2.25.1
|
2022-03-28 16:19:44 +08:00
|
|
|
|