From 9e2d3f84c4f158dd58bce4a30eec568331749501 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 20:05:49 +0800 Subject: [PATCH] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline the error cpu according to threshold algorithm. Signed-off-by: Luo Shengwei --- Makefile.am | 6 +- configure.ac | 11 + misc/rasdaemon.env | 17 ++ queue.c | 126 +++++++++++ queue.h | 43 ++++ ras-arm-handler.c | 73 +++++++ ras-cpu-isolation.c | 499 ++++++++++++++++++++++++++++++++++++++++++++ ras-cpu-isolation.h | 76 +++++++ ras-events.c | 8 + ras-record.h | 5 + 10 files changed, 863 insertions(+), 1 deletion(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c create mode 100644 ras-cpu-isolation.h diff --git a/Makefile.am b/Makefile.am index fabca78..242ceb7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -63,13 +63,17 @@ endif if WITH_AMP_NS_DECODE rasdaemon_SOURCES += non-standard-ampere.c endif +if WITH_CPU_FAULT_ISOLATION + rasdaemon_SOURCES += ras-cpu-isolation.c queue.c +endif rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ - non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cpu-isolation.h queue.h # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that diff --git a/configure.ac b/configure.ac index 33b81fe..d098fcf 100644 --- a/configure.ac +++ b/configure.ac @@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test 
x$enable_amp_ns_decode = xyes || test x$enable_all == xyes]) AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) +AC_ARG_ENABLE([cpu_fault_isolation], + AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) + +AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [ + AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation") + AC_SUBST([WITH_CPU_FAULT_ISOLATION]) +]) +AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) +AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" @@ -201,4 +211,5 @@ compile time options summary Memory Failure : $USE_MEMORY_FAILURE Memory CE PFA : $USE_MEMORY_CE_PFA AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env index 12fd766..3191d03 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env @@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" # soft-then-hard First try to soft offline, then try hard offlining. # Note: default offline choice is "soft". PAGE_CE_ACTION="soft" + +# CPU Online Fault Isolation +# Whether to enable cpu online fault isolation (yes|no). +CPU_ISOLATION_ENABLE="no" +# Specify the threshold of CE numbers. +# +# Format: +# [0-9]+[unit] +# +# Supported units: +# CPU_CE_THRESHOLD: no unit +# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second +CPU_CE_THRESHOLD="18" +CPU_ISOLATION_CYCLE="24h" + +# Prevent excessive isolation from causing an avalanche effect +CPU_ISOLATION_LIMIT="10" diff --git a/queue.c b/queue.c new file mode 100644 index 0000000..92f3d3c --- /dev/null +++ b/queue.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 
2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "queue.h"
+#include "ras-logger.h"
+
+/* Return non-zero when the queue is NULL or holds no node. */
+int is_empty(struct link_queue *queue)
+{
+	return queue == NULL || queue->size == 0;
+}
+
+/* Allocate an empty queue; returns NULL (after logging) when out of memory. */
+struct link_queue *init_queue(void)
+{
+	struct link_queue *queue = malloc(sizeof(*queue));
+
+	if (queue == NULL) {
+		log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n");
+		return NULL;
+	}
+
+	queue->size = 0;
+	queue->head = NULL;
+	queue->tail = NULL;
+
+	return queue;
+}
+
+/* Free every node but keep the queue itself usable; NULL is ignored. */
+void clear_queue(struct link_queue *queue)
+{
+	struct queue_node *node;
+	struct queue_node *next;
+
+	if (queue == NULL) {
+		return;
+	}
+
+	for (node = queue->head; node != NULL; node = next) {
+		next = node->next;
+		free(node);
+	}
+
+	queue->head = NULL;
+	queue->tail = NULL;
+	queue->size = 0;
+}
+
+/* Free all nodes and the queue itself; NULL is ignored. */
+void free_queue(struct link_queue *queue)
+{
+	clear_queue(queue);
+	/* free(NULL) is a no-op, so no guard is needed here. */
+	free(queue);
+}
+
+/* Append node (owned by the queue afterwards); both params must be non-NULL. */
+void push(struct link_queue *queue, struct queue_node *node)
+{
+	/* The new node always becomes the last one. */
+	node->next = NULL;
+
+	if (queue->head == NULL) {
+		/* there is no element in the queue */
+		queue->head = node;
+	} else {
+		queue->tail->next = node;
+	}
+
+	queue->tail = node;
+	(queue->size)++;
+}
+
+/* Remove and free the head node; returns -1 if the queue is NULL or empty. */
+int pop(struct link_queue *queue)
+{
+	struct queue_node *tmp;
+
+	if (queue == NULL || is_empty(queue)) {
+		return -1;
+	}
+
+	tmp = queue->head;
+	queue->head = tmp->next;
+	if (queue->head == NULL) {
+		/* Last node removed: do not leave tail pointing at freed memory. */
+		queue->tail = NULL;
+	}
+	free(tmp);
+	(queue->size)--;
+
+	return 0;
+}
+
+/* Return the head node without removing it, or NULL if there is none. */
+struct queue_node *front(struct link_queue *queue)
+{
+	return queue ? queue->head : NULL;
+}
+
+/* Allocate a node holding (time, value); returns NULL when out of memory. */
+struct queue_node *node_create(time_t time, unsigned value)
+{
+	struct queue_node *node = malloc(sizeof(*node));
+
+	if (node != NULL) {
+		node->time = time;
+		node->value = value;
+		node->next = NULL;
+	}
+
+	return node;
+}
diff --git a/queue.h b/queue.h
new file mode 100644
index 0000000..9684c58
--- /dev/null
+++ b/queue.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_QUEUE_H
+#define __RAS_QUEUE_H
+
+#include <time.h>
+
+struct queue_node
+{
+	time_t time;			/* time of the recorded event */
+	unsigned value;			/* number of errors in that event */
+	struct queue_node *next;	/* next (newer) node, NULL for the tail */
+};
+
+struct link_queue
+{
+	struct queue_node *head;	/* oldest node, NULL when empty */
+	struct queue_node *tail;	/* newest node */
+	int size;			/* number of nodes */
+};
+
+int is_empty(struct link_queue *queue);
+struct link_queue *init_queue(void);
+void clear_queue(struct link_queue *queue);
+void free_queue(struct link_queue *queue);
+void push(struct link_queue *queue, struct queue_node *node);
+int pop(struct link_queue *queue);
+struct queue_node *front(struct link_queue *queue);
+struct queue_node *node_create(time_t time, unsigned value);
+
+#endif
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 1149dc6..a64f20b 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -22,6 +22,52 @@
 #include "ras-report.h"
 #include "ras-non-standard-handler.h"
 #include "non-standard-ampere.h"
+#include "ras-cpu-isolation.h"
+
+#ifdef HAVE_CPU_FAULT_ISOLATION
+/*
+ * Tell whether the Flags byte of a processor error information entry
+ * describes a core failure: at least one of bits 0, 1 and 3 is set
+ * while bit 2 is clear.
+ */
+static int is_core_failure(unsigned long value)
+{
+	return (value & 0xf) && !(value & (0x1 << 2));
+}
+
+/*
+ * Count the entries of the processor error information buffer whose
+ * Flags byte reports a core failure.
+ *
+ * According to UEFI_2_9_2021_03_18 specification chapter N2.4.4,
+ * the length of struct processor error information is 32, the byte
+ * length of the Flags field is 1, and the byte offset is 7 in the struct.
+ *
+ * Returns 0 when len is not a multiple of the entry size.
+ */
+static int count_errors(struct event_format *event, const uint8_t *data, int len)
+{
+	int cur_offset = 7;
+	unsigned long value;
+	int num = 0;
+
+	if (len % PEI_ERR_SIZE != 0) {
+		log(TERM, LOG_ERR, "the event data does not match to the ARM Processor Error Information Structure\n");
+		return num;
+	}
+
+	while (cur_offset < len) {
+		value = pevent_read_number(event->pevent, data+cur_offset, FLAGS_SIZE);
+		if (is_core_failure(value)) {
+			num++;
+			log(TERM, LOG_INFO, "Error in cpu core catched\n");
+		}
+		cur_offset += PEI_ERR_SIZE;
+	}
+
+	return num;
+}
+#endif
 
 void display_raw_data(struct trace_seq *s,
 		const uint8_t *buf,
@@ -139,6 +185,48 @@ int ras_arm_event_handler(struct trace_seq *s,
 	display_raw_data(s, ev.vsei_error, ev.oem_len);
 #endif
 
+#ifdef HAVE_CPU_FAULT_ISOLATION
+	/* record cpu error */
+	if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
+		return -1;
+	/* refer to UEFI_2_9_2021_03_18 specification chapter N2.2 Table N-5 */
+	switch (val) {
+	case GHES_SEV_NO:
+		ev.severity = "Informational";
+		break;
+	case GHES_SEV_CORRECTED:
+		ev.severity = "Corrected";
+		break;
+	case GHES_SEV_RECOVERABLE:
+		ev.severity = "Recoverable";
+		break;
+	default:
+	case GHES_SEV_PANIC:
+		ev.severity = "Fatal";
+		break;
+	}
+
+	if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
+		int len, nums;
+
+		ev.error_info = pevent_get_field_raw(s, event, "buf", record, &len, 1);
+		if (!ev.error_info)
+			return -1;
+		ev.length = len;
+		nums = count_errors(event, ev.error_info, len);
+		if (nums > 0) {
+			/* Map the severity explicitly instead of relying on matching enum values. */
+			struct error_info err_info = {
+				.nums = nums,
+				.time = now,
+				.err_type = (val == GHES_SEV_CORRECTED) ? CE : UCE,
+			};
+
+			ras_record_cpu_error(&err_info, ev.mpidr);
+		}
+	}
+#endif
+
 	/* Insert data into the SGBD */
 #ifdef HAVE_SQLITE3
 	ras_store_arm_record(ras, &ev);
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
new file mode 100644
index 0000000..6dcff70
--- /dev/null
+++ b/ras-cpu-isolation.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include "ras-logger.h"
+#include "ras-cpu-isolation.h"
+
+/* Per-cpu error bookkeeping, indexed by logical cpu number. */
+static struct cpu_info *cpu_infos = NULL;
+static unsigned int ncores, cores_per_socket, cores_per_die;
+static unsigned int sockets, dies = 1;
+static unsigned int enabled = 1;
+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%u/online";
+static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%u/topology/core_siblings_list";
+static const char *node_path = "/sys/devices/system/node/possible";
+
+/* Unit tables are terminated by an entry whose name is NULL. */
+static const struct param normal_units[] = {
+	{ "", 1 },
+	{ NULL, 0 }
+};
+
+static const struct param cycle_units[] = {
+	{ "d", 24 * 60 * 60 },
+	{ "h", 60 * 60 },
+	{ "m", 60 },
+	{ "s", 1 },
+	{ NULL, 0 }
+};
+
+static struct isolation_param threshold = {
+	.name = "CPU_CE_THRESHOLD",
+	.units = normal_units,
+	.value = 18,
+	.limit = 10000
+};
+
+static struct isolation_param cpu_limit = {
+	.name = "CPU_ISOLATION_LIMIT",
+	.units = normal_units
+};
+
+static struct isolation_param cycle = {
+	.name = "CPU_ISOLATION_CYCLE",
+	.units = cycle_units,
+	.value = 24 * 60 * 60,
+	.limit = 30 * 24 * 60 * 60
+};
+
+static const char *cpu_state[] = {
+	[CPU_OFFLINE] = "offline",
+	[CPU_ONLINE] = "online",
+	[CPU_OFFLINE_FAILED] = "offline-failed",
+	[CPU_UNKNOWN] = "unknown"
+};
+
+/*
+ * Open the per-cpu sysfs file whose path is built from format and cpu.
+ * Returns the file descriptor, or -1 (after logging) on failure.
+ */
+static int open_sys_file(unsigned cpu, int oflag, const char *format)
+{
+	int fd;
+	char path[MAX_PATH_LEN] = "";
+
+	snprintf(path, sizeof(path), format, cpu);
+	fd = open(path, oflag);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+	}
+
+	return fd;
+}
+
+/*
+ * Count the sockets as the number of distinct core_siblings_list strings
+ * found over all cpus, then derive cores_per_socket from it.
+ * Returns 0 on success, -1 when memory allocation fails.
+ */
+static int get_sockets(void)
+{
+	int fd;
+	unsigned int i, j;
+	ssize_t len;
+	char buf[MAX_BUF_LEN];
+	struct cpu_set *cpu_sets;
+
+	sockets = 0;
+	cores_per_socket = ncores;
+	cpu_sets = calloc(ncores, sizeof(*cpu_sets));
+	if (!cpu_sets) {
+		log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__);
+		return -1;
+	}
+
+	for (i = 0; i < ncores; ++i) {
+		fd = open_sys_file(i, O_RDONLY, core_siblings_list_path);
+		if (fd == -1) {
+			continue;
+		}
+		/* read() does not terminate the buffer, so keep one byte for '\0'. */
+		len = read(fd, buf, sizeof(buf) - 1);
+		close(fd);
+		if (len <= 0) {
+			continue;
+		}
+		buf[len] = '\0';
+
+		for (j = 0; j < sockets; ++j) {
+			if (strcmp(cpu_sets[j].buf, buf) == 0) {
+				break;
+			}
+		}
+		if (j == sockets) {
+			strcpy(cpu_sets[sockets].buf, buf);
+			sockets++;
+		}
+	}
+
+	free(cpu_sets);
+	cores_per_socket = sockets > 0 ? ncores / sockets : ncores;
+
+	return 0;
+}
+
+/*
+ * Derive the number of dies from the "first-last" range of possible NUMA
+ * nodes, then compute cores_per_die. The die count is left unchanged when
+ * the range cannot be read or parsed.
+ * Returns 0 on success, -1 when the node file cannot be opened.
+ */
+static int get_dies(void)
+{
+	int fd, begin, end;
+	ssize_t len;
+	char buf[20] = "";
+
+	cores_per_die = ncores;
+	fd = open(node_path, O_RDONLY);
+	if (fd == -1) {
+		return -1;
+	}
+
+	/* read() returns -1 on error and never terminates the buffer. */
+	len = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (len > 0) {
+		buf[len] = '\0';
+		if (sscanf(buf, "%d-%d", &begin, &end) == 2) {
+			dies = end > begin ? end - begin + 1 : 1;
+		}
+	}
+
+	cores_per_die = ncores / dies;
+
+	return 0;
+}
+
+/*
+ * Read the state of cpu from its sysfs "online" file.
+ * Returns the value read when it is a valid enum cpu_state, otherwise
+ * CPU_UNKNOWN (also when the file cannot be opened, read or parsed).
+ */
+static int get_cpu_status(unsigned cpu)
+{
+	int fd, num;
+	char buf[2] = "";
+
+	fd = open_sys_file(cpu, O_RDONLY, cpu_path_format);
+	if (fd == -1) {
+		return CPU_UNKNOWN;
+	}
+
+	if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) {
+		num = CPU_UNKNOWN;
+	}
+
+	close(fd);
+
+	return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num;
+}
+
+/*
+ * Allocate the per-cpu bookkeeping for cpus cores, read each cpu's state
+ * and discover the socket/die topology.
+ * Returns 0 on success, -1 on allocation or topology discovery failure.
+ */
+static int init_cpu_info(unsigned cpus)
+{
+	ncores = cpus;
+	/*
+	 * calloc() so that the error counters start at zero and so that
+	 * cpu_infos_free() only sees NULL or valid queues after a partial failure.
+	 */
+	cpu_infos = calloc(cpus, sizeof(*cpu_infos));
+	if (!cpu_infos) {
+		log(TERM, LOG_ERR, "Failed to allocate memory for cpu infos in %s.\n", __func__);
+		return -1;
+	}
+
+	for (unsigned int i = 0; i < cpus; ++i) {
+		cpu_infos[i].state = get_cpu_status(i);
+		cpu_infos[i].ce_queue = init_queue();
+		if (cpu_infos[i].ce_queue == NULL) {
+			log(TERM, LOG_ERR, "Failed to allocate memory for cpu ce queue in %s.\n", __func__);
+			return -1;
+		}
+	}
+	/* set limit of offlined cpu limit according to number of cpu */
+	cpu_limit.limit = cpus > 0 ? cpus - 1 : 0;
+	cpu_limit.value = 0;
+
+	if (get_sockets() < 0 || get_dies() < 0) {
+		log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Clamp config->value to config->limit, warning when it had to be reduced. */
+static void check_config(struct isolation_param *config)
+{
+	if (config->value > config->limit) {
+		log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n",
+		    config->value, config->limit);
+		config->value = config->limit;
+	}
+}
+
+/*
+ * Parse env as "[0-9]+[unit]" and accumulate the result into *value, which
+ * the caller must have set to 0. A trailing letter is looked up, ignoring
+ * case, in config->units and multiplies the number.
+ * Returns 0 on success, -1 on empty/invalid input, unknown unit or overflow.
+ */
+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value)
+{
+	size_t env_size;
+	int has_unit = 0;
+	const char *unit;
+
+	if (!env || strlen(env) == 0) {
+		return -1;
+	}
+
+	env_size = strlen(env);
+	unit = env + env_size - 1;
+
+	/* <ctype.h> functions need an unsigned char value. */
+	if (isalpha((unsigned char)*unit)) {
+		has_unit = 1;
+		env_size--;
+		if (env_size == 0) {
+			return -1;
+		}
+	}
+
+	for (size_t i = 0; i < env_size; ++i) {
+		unsigned long digit;
+
+		if (!isdigit((unsigned char)env[i])) {
+			return -1;
+		}
+		digit = env[i] - '0';
+		/* 10 * value + digit must not exceed ULONG_MAX */
+		if (*value > (ULONG_MAX - digit) / 10) {
+			log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX);
+			return -1;
+		}
+		*value = 10 * (*value) + digit;
+	}
+
+	if (has_unit) {
+		for (const struct param *units = config->units; units->name; units++) {
+			/* value character and unit character are both valid */
+			if (!strcasecmp(unit, units->name)) {
+				if (*value > (ULONG_MAX / units->value)) {
+					log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX);
+					return -1;
+				}
+				*value = (*value) * units->value;
+				return 0;
+			}
+		}
+		log(TERM, LOG_ERR, "Invalid unit %s\n", unit);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Load config->value from the environment variable config->name, keeping
+ * the built-in default when the variable is unset or invalid.
+ */
+static void init_config(struct isolation_param *config)
+{
+	char *env = getenv(config->name);
+	unsigned long value = 0;
+
+	if (parse_ul_config(config, env, &value) < 0) {
+		/* env may be NULL here; passing NULL to %s is undefined. */
+		log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n",
+		    config->name, env ? env : "(null)", config->value);
+		return;
+	}
+
+	config->value = value;
+	check_config(config);
+}
+
+/* Return 0 when CPU_ISOLATION_ENABLE is set to "yes" (any case), -1 otherwise. */
+static int check_config_status(void)
+{
+	char *env = getenv("CPU_ISOLATION_ENABLE");
+
+	if (env == NULL || strcasecmp(env, "yes")) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up cpu fault isolation for cpus cores and load its configuration from
+ * the environment. Isolation is disabled when initialization fails or when
+ * CPU_ISOLATION_ENABLE is not "yes".
+ */
+void ras_error_count_init(unsigned cpus)
+{
+	if (init_cpu_info(cpus) < 0 || check_config_status() < 0) {
+		enabled = 0;
+		log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n");
+		return;
+	}
+
+	log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n");
+	init_config(&threshold);
+	init_config(&cpu_limit);
+	init_config(&cycle);
+}
+
+/* Release the per-cpu bookkeeping; safe to call when nothing was allocated. */
+void cpu_infos_free(void)
+{
+	if (!cpu_infos) {
+		return;
+	}
+
+	for (unsigned int i = 0; i < ncores; ++i) {
+		free_queue(cpu_infos[i].ce_queue);
+	}
+	free(cpu_infos);
+	/* Guard against a second call freeing the array again. */
+	cpu_infos = NULL;
+}
+
+/*
+ * Try to offline cpu by writing "0" to its sysfs "online" file, then
+ * re-read the state to confirm. cpu_infos[cpu].state is updated.
+ * Returns HANDLE_SUCCEED when the cpu is offline afterwards, else HANDLE_FAILED.
+ */
+static int do_cpu_offline(unsigned cpu)
+{
+	int fd, rc;
+
+	cpu_infos[cpu].state = CPU_OFFLINE_FAILED;
+	fd = open_sys_file(cpu, O_RDWR, cpu_path_format);
+	if (fd == -1) {
+		return HANDLE_FAILED;
+	}
+
+	rc = write(fd, "0", 1);
+	if (rc < 0) {
+		log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno);
+		close(fd);
+		return HANDLE_FAILED;
+	}
+
+	close(fd);
+	/* check whether the cpu is isolated successfully */
+	cpu_infos[cpu].state = get_cpu_status(cpu);
+
+	if (cpu_infos[cpu].state == CPU_OFFLINE) {
+		return HANDLE_SUCCEED;
+	}
+
+	return HANDLE_FAILED;
+}
+
+/*
+ * Drop the corrected-error records that fell out of the isolation cycle and
+ * offline cpu when the remaining error count reaches the threshold.
+ * Returns HANDLE_NOTHING when below the threshold, else do_cpu_offline()'s result.
+ */
+static int do_ce_handler(unsigned cpu)
+{
+	struct link_queue *queue = cpu_infos[cpu].ce_queue;
+	unsigned tmp;
+	/*
+	 * Only the errors of the configured cycle are counted. Every event is
+	 * queued with its time and error count; whenever the span between the
+	 * oldest and the newest record exceeds the cycle, the oldest records
+	 * are popped, and their errors uncounted, until the span fits within
+	 * the cycle again.
+	 */
+	while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) {
+		tmp = queue->head->value;
+		if (pop(queue) == 0) {
+			cpu_infos[cpu].ce_nums -= tmp;
+		}
+	}
+
+	if (cpu_infos[cpu].ce_nums >= threshold.value) {
+		log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n",
+		    threshold.value, cpu);
+		return do_cpu_offline(cpu);
+	}
+	return HANDLE_NOTHING;
+}
+
+/* Offline cpu as soon as it has recorded at least one uncorrected error. */
+static int do_uce_handler(unsigned cpu)
+{
+	if (cpu_infos[cpu].uce_nums > 0) {
+		log(TERM, LOG_INFO, "Uncorrected Errors occured, try to offline cpu%u\n", cpu);
+		return do_cpu_offline(cpu);
+	}
+	return HANDLE_NOTHING;
+}
+
+/* Dispatch to the CE or UCE handler; unknown error types are ignored. */
+static int error_handler(unsigned cpu, struct error_info *err_info)
+{
+	int ret = HANDLE_NOTHING;
+
+	switch (err_info->err_type) {
+	case CE:
+		ret = do_ce_handler(cpu);
+		break;
+	case UCE:
+		ret = do_uce_handler(cpu);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Account err_info on cpu: corrected errors are queued with their time and
+ * added to ce_nums, uncorrected errors only increment uce_nums.
+ */
+static void record_error_info(unsigned cpu, struct error_info *err_info)
+{
+	switch (err_info->err_type) {
+	case CE:
+	{
+		struct queue_node *node = node_create(err_info->time, err_info->nums);
+
+		if (node == NULL) {
+			log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n");
+			return;
+		}
+		push(cpu_infos[cpu].ce_queue, node);
+		cpu_infos[cpu].ce_nums += err_info->nums;
+		break;
+	}
+	case UCE:
+		cpu_infos[cpu].uce_nums++;
+		break;
+	default:
+		break;
+	}
+}
+
+/* Extract the size-bit wide field starting at bit offset of value. */
+static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size)
+{
+	uint64_t bits = (uint64_t)value;
+
+	if (offset >= 64 || size == 0) {
+		return 0;
+	}
+	bits >>= offset;
+	if (size < 64) {
+		/* Unsigned 64-bit mask: shifting a plain int by >= 31 bits would be undefined. */
+		bits &= (UINT64_C(1) << size) - 1;
+	}
+
+	return bits;
+}
+
+/* Map an MPIDR value to the logical cpu index used by cpu_infos. */
+static unsigned get_cpu_index(int64_t mpidr)
+{
+	unsigned core_id, socket_id, die_id, cpu;
+	/*
+	 * Adapt to certain BIOS
+	 * In the MPIDR:
+	 * bit 8:15: core id
+	 * bit 19:20: die_id
+	 * bit 21:22: socket_id
+	 */
+	core_id = get_bit_value(mpidr, 8, 8);
+	socket_id = get_bit_value(mpidr, 21, 2);
+	die_id = get_bit_value(mpidr, 19, 2);
+	cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die;
+
+	return cpu;
+}
+
+/*
+ * Record the errors described by err_info on the cpu identified by mpidr
+ * and offline that cpu when its handler asks for it. Nothing is done when
+ * isolation is disabled, the cpu index is out of range or the cpu is not
+ * online; no cpu is offlined once CPU_ISOLATION_LIMIT cpus are offline.
+ */
+void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr)
+{
+	unsigned cpu;
+	unsigned long offlined;
+	long online;
+	int ret;
+
+	if (enabled == 0) {
+		return;
+	}
+
+	cpu = get_cpu_index(mpidr);
+
+	if (cpu >= ncores) {
+		log(TERM, LOG_ERR, "The current cpu %u has exceed the total number of cpu:%u\n", cpu, ncores);
+		return;
+	}
+
+	log(TERM, LOG_INFO, "Handling error on cpu%u\n", cpu);
+	cpu_infos[cpu].state = get_cpu_status(cpu);
+
+	if (cpu_infos[cpu].state != CPU_ONLINE) {
+		log(TERM, LOG_INFO, "Cpu%u is not online or unknown, ignore\n", cpu);
+		return;
+	}
+
+	record_error_info(cpu, err_info);
+	/* Since user may change cpu state, we get current offlined cpu numbers every recording time. */
+	online = sysconf(_SC_NPROCESSORS_ONLN);
+	/*
+	 * Compute the count explicitly instead of relying on unsigned wrap-around:
+	 * when the number of online cpus is unknown or inconsistent, behave as if
+	 * the limit was reached and leave the cpu alone.
+	 */
+	if (online < 0 || (unsigned long)online > ncores) {
+		offlined = ULONG_MAX;
+	} else {
+		offlined = ncores - (unsigned long)online;
+	}
+	if (offlined >= cpu_limit.value) {
+		log(TERM, LOG_WARNING, "Offlined cpus have exceeded limit: %lu, choose to do nothing\n",
+		    cpu_limit.value);
+		return;
+	}
+
+	ret = error_handler(cpu, err_info);
+
+	if (ret == HANDLE_NOTHING) {
+		log(TERM, LOG_WARNING, "Doing nothing in the cpu%u\n", cpu);
+	} else if (ret == HANDLE_SUCCEED) {
+		log(TERM, LOG_INFO, "Offline cpu%u succeed, the state is %s\n",
+		    cpu, cpu_state[cpu_infos[cpu].state]);
+		clear_queue(cpu_infos[cpu].ce_queue);
+		/* The queue is empty now, so the counters must restart from zero too. */
+		cpu_infos[cpu].ce_nums = 0;
+		cpu_infos[cpu].uce_nums = 0;
+	} else {
+		log(TERM, LOG_INFO, "Offline cpu%u fail, the state is %s\n",
+		    cpu, cpu_state[cpu_infos[cpu].state]);
+	}
+}
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
new file mode 100644
index 0000000..a7d3fdb
--- /dev/null
+++ b/ras-cpu-isolation.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_CPU_ISOLATION_H
+#define __RAS_CPU_ISOLATION_H
+#include <stdint.h>
+#include <time.h>
+#include "queue.h"
+
+#define MAX_PATH_LEN 100	/* size of the buffer holding a sysfs path */
+#define MAX_BUF_LEN 1024	/* size of one struct cpu_set string */
+#define PEI_ERR_SIZE 32		/* bytes per processor error information entry */
+#define FLAGS_SIZE 1		/* bytes of the Flags field of such an entry */
+
+struct param {
+	const char *name;	/* unit suffix; NULL terminates a table */
+	unsigned long value;	/* multiplier applied for that unit */
+};
+
+struct isolation_param {
+	const char *name;		/* environment variable holding the setting */
+	const struct param *units;	/* units accepted for this setting */
+	unsigned long value;		/* current value */
+	unsigned long limit;		/* largest accepted value */
+};
+
+enum cpu_state {
+	CPU_OFFLINE,
+	CPU_ONLINE,
+	CPU_OFFLINE_FAILED,
+	CPU_UNKNOWN,
+};
+
+enum error_handle_result {
+	HANDLE_FAILED = -1,
+	HANDLE_SUCCEED,
+	HANDLE_NOTHING,
+};
+
+enum error_type {
+	CE = 1,		/* corrected error */
+	UCE		/* uncorrected error */
+};
+
+struct cpu_info {
+	unsigned long uce_nums;		/* uncorrected errors recorded */
+	unsigned long ce_nums;		/* corrected errors currently counted */
+	struct link_queue *ce_queue;	/* corrected error records */
+	enum cpu_state state;
+};
+
+struct error_info {
+	unsigned long nums;		/* number of errors in the event */
+	time_t time;			/* time of the event */
+	enum error_type err_type;
+};
+
+struct cpu_set {
+	char buf[MAX_BUF_LEN];
+};
+
+void ras_error_count_init(unsigned cpus);
+void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr);
+void cpu_infos_free(void);
+#endif
diff --git a/ras-events.c b/ras-events.c
index ba769d1..00938e6 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -41,6 +41,7 @@
 #include "ras-record.h"
 #include "ras-logger.h"
 #include "ras-page-isolation.h"
+#include "ras-cpu-isolation.h"
 
 /*
  * Polling time, if read() doesn't block.
Currently, trace_pipe_raw never @@ -879,6 +880,10 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION + ras_error_count_init(cpus); +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) @@ -1005,6 +1010,9 @@ err: } free(ras); } +#ifdef HAVE_CPU_FAULT_ISOLATION + cpu_infos_free(); +#endif return rc; } diff --git a/ras-record.h b/ras-record.h index d9f7733..efaffa5 100644 --- a/ras-record.h +++ b/ras-record.h @@ -83,6 +83,11 @@ struct ras_arm_event { uint32_t ctx_len; const uint8_t *vsei_error; uint32_t oem_len; +#ifdef HAVE_CPU_FAULT_ISOLATION + const char *severity; + const uint8_t *error_info; + uint32_t length; +#endif }; struct devlink_event { -- 2.27.0