From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 17:23:27 +0800 Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors When recoverable errors occur in a cpu core, try to offline the related cpu core. Signed-off-by: Shengwei Luo --- ras-arm-handler.c | 21 ++++++++++++++++++--- ras-cpu-isolation.c | 17 +++++++++++++++++ ras-cpu-isolation.h | 4 +++- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/ras-arm-handler.c b/ras-arm-handler.c index c9ef2fd..dae5ad6 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s, } #ifdef HAVE_CPU_FAULT_ISOLATION -static int count_errors(struct ras_arm_event *ev) +static int is_core_failure(struct ras_arm_err_info *err_info) +{ + if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { + /* + * core failure: + * Bit 0\1\3: (at least 1) + * Bit 2: 0 + */ + return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2)); + } + return 0; +} + +static int count_errors(struct ras_arm_event *ev, int sev) { struct ras_arm_err_info *err_info; int num_pei; @@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev) */ error_count = err_info->multiple_error + 1; } + if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) + error_count = 0; num += error_count; err_info += 1; @@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s, } trace_seq_printf(s, "\n severity: %s", severity); - if (val == GHES_SEV_CORRECTED) { - nums = count_errors(&ev); + if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { + nums = count_errors(&ev, val); if (nums > 0) { err_info.nums = nums; err_info.time = now; diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c index 8c0cdf9..e650022 100644 --- a/ras-cpu-isolation.c +++ b/ras-cpu-isolation.c @@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus) for (unsigned int i = 0; i < cpus; ++i) { cpu_infos[i].ce_nums = 0; 
+ cpu_infos[i].uce_nums = 0; cpu_infos[i].state = get_cpu_status(i); cpu_infos[i].ce_queue = init_queue(); @@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu) return HANDLE_NOTHING; } +static int do_uce_handler(unsigned int cpu) +{ + if (cpu_infos[cpu].uce_nums > 0) { + log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu); + return do_cpu_offline(cpu); + } + return HANDLE_NOTHING; +} + static int error_handler(unsigned int cpu, struct error_info *err_info) { int ret = HANDLE_NOTHING; @@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) case CE: ret = do_ce_handler(cpu); break; + case UCE: + ret = do_uce_handler(cpu); + break; default: break; } @@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) cpu_infos[cpu].ce_nums += err_info->nums; break; } + case UCE: + cpu_infos[cpu].uce_nums++; + break; default: break; } @@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) cpu, cpu_state[cpu_infos[cpu].state]); clear_queue(cpu_infos[cpu].ce_queue); cpu_infos[cpu].ce_nums = 0; + cpu_infos[cpu].uce_nums = 0; } else log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", cpu, cpu_state[cpu_infos[cpu].state]); diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h index 1159853..024a68b 100644 --- a/ras-cpu-isolation.h +++ b/ras-cpu-isolation.h @@ -46,10 +46,12 @@ enum error_handle_result { }; enum error_type { - CE = 1 + CE = 1, + UCE }; struct cpu_info { + unsigned long uce_nums; unsigned long ce_nums; struct link_queue *ce_queue; enum cpu_state state; -- 2.27.0