rasdaemon/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch
Shiju Jose f9eb9d8c57 rasdaemon: Update with the latest patches for the CPU fault isolation, Hisilicon Kunpeng9xx common error records and improvements in the ras-mc-ctl for the Hisilicon Kunpeng9xx errors
Update with the latest patches for the following:
1. CPU online fault isolation for arm event.
2. Modify recording Hisilicon common error data in the rasdaemon
3. In the ras-mc-ctl,
3.1. Improve Hisilicon common error statistics.
3.2. Add support to display the HiSilicon vendor-errors for a specified module.
3.3. Print usage if the necessary parameters are not passed for the HiSilicon vendor-errors options.
3.4. Reformat error info of the HiSilicon Kunpeng920.
3.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
3.6. Updated the HiSilicon platform name as KunPeng9xx.
4. Fixed a memory out-of-bounds issue in the rasdaemon.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
2022-05-30 09:30:03 +01:00

151 lines
4.1 KiB
Diff

From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001
From: Shengwei Luo <luoshengwei@huawei.com>
Date: Wed, 23 Feb 2022 17:23:27 +0800
Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable
errors
When a recoverable error occurs in a CPU core, try to offline
the affected CPU core.
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
ras-arm-handler.c | 22 +++++++++++++++++++---
ras-cpu-isolation.c | 17 +++++++++++++++++
ras-cpu-isolation.h | 4 +++-
3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 9c7a3c3..a0dfc51 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -26,6 +26,7 @@
#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
#define ARM_ERR_VALID_FLAGS BIT(1)
+#define BIT2 2
void display_raw_data(struct trace_seq *s,
const uint8_t *buf,
@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s,
}
#ifdef HAVE_CPU_FAULT_ISOLATION
-static int count_errors(struct ras_arm_event *ev)
+static int is_core_failure(struct ras_arm_err_info *err_info)
+{
+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
+ /*
+ * core failure:
+ * Bit 0\1\3: (at least 1)
+ * Bit 2: 0
+ */
+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2));
+ }
+ return 0;
+}
+
+static int count_errors(struct ras_arm_event *ev, int sev)
{
struct ras_arm_err_info *err_info;
int num_pei;
@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev)
*/
error_count = err_info->multiple_error + 1;
}
+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
+ error_count = 0;
num += error_count;
err_info += 1;
@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s,
}
trace_seq_printf(s, "\n severity: %s", severity);
- if (val == GHES_SEV_CORRECTED) {
- int nums = count_errors(ev);
+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
+ int nums = count_errors(ev, val);
if (nums > 0) {
err_info.nums = nums;
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
index abcf451..fd23e4e 100644
--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus)
for (unsigned int i = 0; i < cpus; ++i) {
cpu_infos[i].ce_nums = 0;
+ cpu_infos[i].uce_nums = 0;
cpu_infos[i].state = get_cpu_status(i);
cpu_infos[i].ce_queue = init_queue();
@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu)
return HANDLE_NOTHING;
}
+static int do_uce_handler(unsigned int cpu)
+{
+ if (cpu_infos[cpu].uce_nums > 0) {
+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu);
+ return do_cpu_offline(cpu);
+ }
+ return HANDLE_NOTHING;
+}
+
static int error_handler(unsigned int cpu, struct error_info *err_info)
{
int ret = HANDLE_NOTHING;
@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
case CE:
ret = do_ce_handler(cpu);
break;
+ case UCE:
+ ret = do_uce_handler(cpu);
+ break;
default:
break;
}
@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
cpu_infos[cpu].ce_nums += err_info->nums;
break;
}
+ case UCE:
+ cpu_infos[cpu].uce_nums++;
+ break;
default:
break;
}
@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
cpu, cpu_state[cpu_infos[cpu].state]);
clear_queue(cpu_infos[cpu].ce_queue);
cpu_infos[cpu].ce_nums = 0;
+ cpu_infos[cpu].uce_nums = 0;
} else
log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
cpu, cpu_state[cpu_infos[cpu].state]);
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
index 1159853..024a68b 100644
--- a/ras-cpu-isolation.h
+++ b/ras-cpu-isolation.h
@@ -46,10 +46,12 @@ enum error_handle_result {
};
enum error_type {
- CE = 1
+ CE = 1,
+ UCE
};
struct cpu_info {
+ unsigned long uce_nums;
unsigned long ce_nums;
struct link_queue *ce_queue;
enum cpu_state state;
--
2.25.1