rasdaemon/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch
Shiju Jose f9eb9d8c57 rasdaemon: Update with the latest patches for the CPU fault isolation, Hisilicon Kunpeng9xx common error records and improvements in the ras-mc-ctl for the Hisilicon Kunpeng9xx errors
Update with the latest patches for the
1. CPU online fault isolation for arm event.
2. Modify recording Hisilicon common error data in the rasdaemon
3. In the ras-mc-ctl,
3.1. Improve Hisilicon common error statistics.
3.2. Add support to display the HiSilicon vendor-errors for a specified module.
3.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options.
3.4. Reformat error info of the HiSilicon Kunpeng920.
3.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
3.6. Updated the HiSilicon platform name as KunPeng9xx.
4. Fixed a memory out-of-bounds issue in the rasdaemon.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
2022-05-30 09:30:03 +01:00

939 lines
24 KiB
Diff

From b9999d40d73dfff8b1cfb515f3b81b2c2891f6a7 Mon Sep 17 00:00:00 2001
From: Shengwei Luo <luoshengwei@huawei.com>
Date: Wed, 23 Feb 2022 17:21:58 +0800
Subject: [PATCH 01/10] rasdaemon: Support cpu fault isolation for corrected
errors
When the corrected errors exceed the set limit in cycle, try to
offline the related cpu core.
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
Makefile.am | 6 +-
configure.ac | 11 ++
misc/rasdaemon.env | 17 ++
queue.c | 119 ++++++++++++++
queue.h | 39 +++++
ras-arm-handler.c | 97 +++++++++++
ras-arm-handler.h | 18 ++
ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++
ras-cpu-isolation.h | 68 ++++++++
ras-events.c | 9 +-
10 files changed, 770 insertions(+), 2 deletions(-)
create mode 100644 queue.c
create mode 100644 queue.h
create mode 100644 ras-cpu-isolation.c
create mode 100644 ras-cpu-isolation.h
diff --git a/Makefile.am b/Makefile.am
index a322b9a..36e7d4e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -69,13 +69,17 @@ endif
if WITH_AMP_NS_DECODE
rasdaemon_SOURCES += non-standard-ampere.c
endif
+if WITH_CPU_FAULT_ISOLATION
+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
+endif
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h
+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ ras-cpu-isolation.h queue.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index a77991f..e0ed751 100644
--- a/configure.ac
+++ b/configure.ac
@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"],
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
+AC_ARG_ENABLE([cpu_fault_isolation],
+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation]))
+
+AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [
+ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation")
+ AC_SUBST([WITH_CPU_FAULT_ISOLATION])
+])
+AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
+AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
+
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
@@ -201,4 +211,5 @@ compile time options summary
Memory Failure : $USE_MEMORY_FAILURE
Memory CE PFA : $USE_MEMORY_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION
EOF
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index 12fd766..7cb18e8 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50"
# soft-then-hard First try to soft offline, then try hard offlining.
# Note: default offline choice is "soft".
PAGE_CE_ACTION="soft"
+
+# CPU Online Fault Isolation
+# Whether to enable cpu online fault isolation (yes|no).
+CPU_ISOLATION_ENABLE="no"
+# Specify the threshold of CE numbers.
+#
+# Format:
+# [0-9]+[unit]
+#
+# Supported units:
+# CPU_CE_THRESHOLD: no unit
+# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second); the default unit is seconds
+CPU_CE_THRESHOLD="18"
+CPU_ISOLATION_CYCLE="24h"
+
+# Prevent excessive isolation from causing an avalanche effect
+CPU_ISOLATION_LIMIT="10"
\ No newline at end of file
diff --git a/queue.c b/queue.c
new file mode 100644
index 0000000..65b6fb8
--- /dev/null
+++ b/queue.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "queue.h"
+#include "ras-logger.h"
+
+int is_empty(struct link_queue *queue)
+{
+ if (queue)
+ return queue->size == 0;
+
+ return 1;
+}
+
+struct link_queue *init_queue(void)
+{
+ struct link_queue *queue = NULL;
+
+ queue = (struct link_queue *)malloc(sizeof(struct link_queue));
+ if (queue == NULL) {
+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n");
+ return NULL;
+ }
+
+ queue->size = 0;
+ queue->head = NULL;
+ queue->tail = NULL;
+
+ return queue;
+}
+
+void clear_queue(struct link_queue *queue)
+{
+ if (queue == NULL)
+ return;
+
+ struct queue_node *node = queue->head;
+ struct queue_node *tmp = NULL;
+
+ while (node != NULL) {
+ tmp = node;
+ node = node->next;
+ free(tmp);
+ }
+
+ queue->head = NULL;
+ queue->tail = NULL;
+ queue->size = 0;
+}
+
+void free_queue(struct link_queue *queue)
+{
+ clear_queue(queue);
+
+ if (queue)
+ free(queue);
+}
+
+/* It should be guaranteed by the caller that the params are not NULL */
+void push(struct link_queue *queue, struct queue_node *node)
+{
+ /* there is no element in the queue */
+ if (queue->head == NULL)
+ queue->head = node;
+ else
+ queue->tail->next = node;
+
+ queue->tail = node;
+ (queue->size)++;
+}
+
+int pop(struct link_queue *queue)
+{
+ struct queue_node *tmp = NULL;
+
+ if (queue == NULL || is_empty(queue))
+ return -1;
+ tmp = queue->head;
+ queue->head = tmp->next;
+ if (queue->head == NULL) /* queue drained: do not leave tail pointing at freed node */
+ queue->tail = NULL;
+ free(tmp);
+ (queue->size)--;
+ return 0;
+}
+
+struct queue_node *front(struct link_queue *queue)
+{
+ if (queue == NULL)
+ return NULL;
+
+ return queue->head;
+}
+
+struct queue_node *node_create(time_t time, unsigned int value)
+{
+ struct queue_node *node = NULL;
+
+ node = (struct queue_node *)malloc(sizeof(struct queue_node));
+ if (node != NULL) {
+ node->time = time;
+ node->value = value;
+ node->next = NULL;
+ }
+
+ return node;
+}
diff --git a/queue.h b/queue.h
new file mode 100644
index 0000000..5459f40
--- /dev/null
+++ b/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAS_QUEUE_H
+#define __RAS_QUEUE_H
+
+struct queue_node {
+ time_t time;
+ unsigned int value;
+ struct queue_node *next;
+};
+
+struct link_queue {
+ struct queue_node *head;
+ struct queue_node *tail;
+ int size;
+};
+
+int is_empty(struct link_queue *queue);
+struct link_queue *init_queue(void);
+void clear_queue(struct link_queue *queue);
+void free_queue(struct link_queue *queue);
+void push(struct link_queue *queue, struct queue_node *node);
+int pop(struct link_queue *queue);
+struct queue_node *front(struct link_queue *queue);
+struct queue_node *node_create(time_t time, unsigned int value);
+
+#endif
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 1149dc6..9c7a3c3 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -22,6 +22,10 @@
#include "ras-report.h"
#include "ras-non-standard-handler.h"
#include "non-standard-ampere.h"
+#include "ras-cpu-isolation.h"
+
+#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
+#define ARM_ERR_VALID_FLAGS BIT(1)
void display_raw_data(struct trace_seq *s,
const uint8_t *buf,
@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s,
}
}
+#ifdef HAVE_CPU_FAULT_ISOLATION
+static int count_errors(struct ras_arm_event *ev)
+{
+ struct ras_arm_err_info *err_info;
+ int num_pei;
+ int err_info_size = sizeof(struct ras_arm_err_info);
+ int num = 0;
+ int i;
+ int error_count;
+
+ if (ev->pei_len % err_info_size != 0) {
+ log(TERM, LOG_ERR,
+ "The event data does not match to the ARM Processor Error Information Structure\n");
+ return num;
+ }
+ num_pei = ev->pei_len / err_info_size;
+ err_info = (struct ras_arm_err_info *)(ev->pei_error);
+
+ for (i = 0; i < num_pei; ++i) {
+ error_count = 1;
+ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) {
+ /*
+ * The value of this field is defined as follows:
+ * 0: Single Error
+ * 1: Multiple Errors
+ * 2-65535: Error Count
+ */
+ error_count = err_info->multiple_error + 1;
+ }
+
+ num += error_count;
+ err_info += 1;
+ }
+ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num);
+ return num;
+}
+
+static int ras_handle_cpu_error(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event,
+ struct ras_arm_event *ev, time_t now)
+{
+ unsigned long long val;
+ int cpu;
+ char *severity;
+ struct error_info err_info;
+
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
+ return -1;
+ cpu = val;
+ trace_seq_printf(s, "\n cpu: %d", cpu);
+
+ /* record cpu error */
+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
+ return -1;
+ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */
+ switch (val) {
+ case GHES_SEV_NO:
+ severity = "Informational";
+ break;
+ case GHES_SEV_CORRECTED:
+ severity = "Corrected";
+ break;
+ case GHES_SEV_RECOVERABLE:
+ severity = "Recoverable";
+ break;
+ default:
+ case GHES_SEV_PANIC:
+ severity = "Fatal";
+ }
+ trace_seq_printf(s, "\n severity: %s", severity);
+
+ if (val == GHES_SEV_CORRECTED) {
+ int nums = count_errors(ev);
+
+ if (nums > 0) {
+ err_info.nums = nums;
+ err_info.time = now;
+ err_info.err_type = val;
+ ras_record_cpu_error(&err_info, cpu);
+ }
+ }
+
+ return 0;
+}
+#endif
+
int ras_arm_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context)
@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s,
struct tm *tm;
struct ras_arm_event ev;
int len = 0;
+
memset(&ev, 0, sizeof(ev));
/*
@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s,
display_raw_data(s, ev.vsei_error, ev.oem_len);
#endif
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0)
+ return -1;
+#endif
+
/* Insert data into the SGBD */
#ifdef HAVE_SQLITE3
ras_store_arm_record(ras, &ev);
diff --git a/ras-arm-handler.h b/ras-arm-handler.h
index 563a2d3..52813e7 100644
--- a/ras-arm-handler.h
+++ b/ras-arm-handler.h
@@ -17,6 +17,24 @@
#include "ras-events.h"
#include "libtrace/event-parse.h"
+/*
+ * ARM Processor Error Information Structure, According to
+ * UEFI_2_9 specification chapter N2.4.4.
+ */
+#pragma pack(1)
+struct ras_arm_err_info {
+ uint8_t version;
+ uint8_t length;
+ uint16_t validation_bits;
+ uint8_t type;
+ uint16_t multiple_error;
+ uint8_t flags;
+ uint64_t error_info;
+ uint64_t virt_fault_addr;
+ uint64_t physical_fault_addr;
+};
+#pragma pack()
+
int ras_arm_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context);
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
new file mode 100644
index 0000000..abcf451
--- /dev/null
+++ b/ras-cpu-isolation.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <limits.h>
+#include <ctype.h>
+#include "ras-logger.h"
+#include "ras-cpu-isolation.h"
+
+#define SECOND_OF_MON (30 * 24 * 60 * 60)
+#define SECOND_OF_DAY (24 * 60 * 60)
+#define SECOND_OF_HOU (60 * 60)
+#define SECOND_OF_MIN (60)
+
+#define LIMIT_OF_CPU_THRESHOLD 10000
+#define INIT_OF_CPU_THRESHOLD 18
+#define DEC_CHECK 10
+#define LAST_BIT_OF_UL 5
+
+static struct cpu_info *cpu_infos;
+static unsigned int ncores;
+static unsigned int enabled = 1;
+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
+
+static const struct param normal_units[] = {
+ {"", 1},
+ {}
+};
+
+static const struct param cycle_units[] = {
+ {"d", SECOND_OF_DAY},
+ {"h", SECOND_OF_HOU},
+ {"m", SECOND_OF_MIN},
+ {"s", 1},
+ {}
+};
+
+static struct isolation_param threshold = {
+ .name = "CPU_CE_THRESHOLD",
+ .units = normal_units,
+ .value = INIT_OF_CPU_THRESHOLD,
+ .limit = LIMIT_OF_CPU_THRESHOLD
+};
+
+static struct isolation_param cpu_limit = {
+ .name = "CPU_ISOLATION_LIMIT",
+ .units = normal_units
+};
+
+static struct isolation_param cycle = {
+ .name = "CPU_ISOLATION_CYCLE",
+ .units = cycle_units,
+ .value = SECOND_OF_DAY,
+ .limit = SECOND_OF_MON
+};
+
+static const char * const cpu_state[] = {
+ [CPU_OFFLINE] = "offline",
+ [CPU_ONLINE] = "online",
+ [CPU_OFFLINE_FAILED] = "offline-failed",
+ [CPU_UNKNOWN] = "unknown"
+};
+
+static int open_sys_file(unsigned int cpu, int __oflag, const char *format)
+{
+ int fd, len;
+ char path[MAX_PATH_LEN + 1] = "";
+ char real_path[PATH_MAX] = ""; /* realpath() may store up to PATH_MAX bytes */
+
+ len = snprintf(path, sizeof(path), format, cpu); /* len > MAX_PATH_LEN means truncated */
+ if (len < 0 || len > MAX_PATH_LEN || realpath(path, real_path) == NULL) {
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+ return -1;
+ }
+ fd = open(real_path, __oflag);
+ if (fd == -1) {
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path);
+ return -1;
+ }
+
+ return fd;
+}
+
+static int get_cpu_status(unsigned int cpu)
+{
+ int fd, num;
+ char buf[2] = "";
+
+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format);
+ if (fd == -1)
+ return CPU_UNKNOWN;
+
+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1)
+ num = CPU_UNKNOWN;
+
+ close(fd);
+
+ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num;
+}
+
+static int init_cpu_info(unsigned int cpus)
+{
+ ncores = cpus;
+ cpu_infos = calloc(cpus, sizeof(*cpu_infos)); /* zeroed: unset ce_queue stays NULL for cpu_infos_free() */
+ if (!cpu_infos) {
+ log(TERM, LOG_ERR,
+ "Failed to allocate memory for cpu infos in %s.\n", __func__);
+ return -1;
+ }
+
+ for (unsigned int i = 0; i < cpus; ++i) {
+ cpu_infos[i].ce_nums = 0;
+ cpu_infos[i].state = get_cpu_status(i);
+ cpu_infos[i].ce_queue = init_queue();
+
+ if (cpu_infos[i].ce_queue == NULL) {
+ log(TERM, LOG_ERR,
+ "Failed to allocate memory for cpu ce queue in %s.\n", __func__);
+ return -1;
+ }
+ }
+ /* set limit of offlined cpu limit according to number of cpu */
+ cpu_limit.limit = cpus - 1;
+ cpu_limit.value = 0;
+
+ return 0;
+}
+
+static void check_config(struct isolation_param *config)
+{
+ if (config->value > config->limit) {
+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n",
+ config->value, config->limit);
+ config->value = config->limit;
+ }
+}
+
+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value)
+{
+ char *unit = NULL;
+ int env_size, has_unit = 0;
+
+ if (!env || strlen(env) == 0)
+ return -1;
+
+ env_size = strlen(env);
+ unit = env + env_size - 1;
+
+ if (isalpha((unsigned char)*unit)) { /* ctype needs unsigned char values */
+ has_unit = 1;
+ env_size--;
+ if (env_size <= 0)
+ return -1;
+ }
+
+ for (int i = 0; i < env_size; ++i) {
+ if (isdigit((unsigned char)env[i])) {
+ if (*value > ULONG_MAX / DEC_CHECK ||
+ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) {
+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX);
+ return -1;
+ }
+ *value = DEC_CHECK * (*value) + (env[i] - '0');
+ } else
+ return -1;
+ }
+
+ if (!has_unit)
+ return 0;
+
+ for (const struct param *units = config->units; units->name; units++) {
+ /* value character and unit character are both valid */
+ if (!strcasecmp(unit, units->name)) {
+ if (*value > (ULONG_MAX / units->value)) {
+ log(TERM, LOG_ERR,
+ "%s is out of range: %lu\n", env, ULONG_MAX);
+ return -1;
+ }
+ *value = (*value) * units->value;
+ return 0;
+ }
+ }
+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit);
+ return -1;
+}
+
+static void init_config(struct isolation_param *config)
+{
+ char *env = getenv(config->name);
+ unsigned long value = 0;
+
+ if (parse_ul_config(config, env, &value) < 0) {
+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n",
+ config->name, env ? env : "(null)", config->value); /* env is NULL when unset */
+ return;
+ }
+
+ config->value = value;
+ check_config(config);
+}
+
+static int check_config_status(void)
+{
+ char *env = getenv("CPU_ISOLATION_ENABLE");
+
+ if (env == NULL || strcasecmp(env, "yes"))
+ return -1;
+
+ return 0;
+}
+
+void ras_cpu_isolation_init(unsigned int cpus)
+{
+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) {
+ enabled = 0;
+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n");
+ return;
+ }
+
+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n");
+ init_config(&threshold);
+ init_config(&cpu_limit);
+ init_config(&cycle);
+}
+
+void cpu_infos_free(void)
+{
+ if (cpu_infos) {
+ for (int i = 0; i < ncores; ++i)
+ free_queue(cpu_infos[i].ce_queue);
+
+ free(cpu_infos);
+ }
+}
+
+static int do_cpu_offline(unsigned int cpu)
+{
+ int fd, rc;
+ char buf[2] = "";
+
+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED;
+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format);
+ if (fd == -1)
+ return HANDLE_FAILED;
+
+ strcpy(buf, "0");
+ rc = write(fd, buf, strlen(buf));
+ if (rc < 0) {
+ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno);
+ close(fd);
+ return HANDLE_FAILED;
+ }
+
+ close(fd);
+ /* check whether the cpu is isolated successfully */
+ cpu_infos[cpu].state = get_cpu_status(cpu);
+
+ if (cpu_infos[cpu].state == CPU_OFFLINE)
+ return HANDLE_SUCCEED;
+
+ return HANDLE_FAILED;
+}
+
+static int do_ce_handler(unsigned int cpu)
+{
+ struct link_queue *queue = cpu_infos[cpu].ce_queue;
+ unsigned int tmp;
+ /*
+ * Only errors inside the configured cycle are counted. Each event's time
+ * and error count are queued before this call; here the span between the
+ * oldest (head) and newest (tail) entries is checked, and while it exceeds
+ * the cycle the oldest entry is popped and its count subtracted, until the
+ * remaining span no longer exceeds the cycle.
+ */
+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) {
+ tmp = queue->head->value;
+ if (pop(queue) == 0)
+ cpu_infos[cpu].ce_nums -= tmp;
+ }
+ log(TERM, LOG_INFO,
+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n",
+ cpu, cpu_infos[cpu].ce_nums);
+
+ if (cpu_infos[cpu].ce_nums >= threshold.value) {
+ log(TERM, LOG_INFO,
+ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n",
+ threshold.value, cpu);
+ return do_cpu_offline(cpu);
+ }
+ return HANDLE_NOTHING;
+}
+
+static int error_handler(unsigned int cpu, struct error_info *err_info)
+{
+ int ret = HANDLE_NOTHING;
+
+ switch (err_info->err_type) {
+ case CE:
+ ret = do_ce_handler(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static void record_error_info(unsigned int cpu, struct error_info *err_info)
+{
+ switch (err_info->err_type) {
+ case CE:
+ {
+ struct queue_node *node = node_create(err_info->time, err_info->nums);
+
+ if (node == NULL) {
+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n");
+ return;
+ }
+ push(cpu_infos[cpu].ce_queue, node);
+ cpu_infos[cpu].ce_nums += err_info->nums;
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
+{
+ int ret;
+
+ if (enabled == 0)
+ return;
+
+ if (cpu >= ncores || cpu < 0) {
+ log(TERM, LOG_ERR,
+ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores);
+ return;
+ }
+
+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu);
+ cpu_infos[cpu].state = get_cpu_status(cpu);
+
+ if (cpu_infos[cpu].state != CPU_ONLINE) {
+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu);
+ return;
+ }
+
+ record_error_info(cpu, err_info);
+ /*
+ * Since user may change cpu state, we get current offlined
+ * cpu numbers every recording time.
+ */
+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) {
+ log(TERM, LOG_WARNING,
+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n",
+ cpu_limit.value);
+ return;
+ }
+
+ ret = error_handler(cpu, err_info);
+ if (ret == HANDLE_NOTHING)
+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu);
+ else if (ret == HANDLE_SUCCEED) {
+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n",
+ cpu, cpu_state[cpu_infos[cpu].state]);
+ clear_queue(cpu_infos[cpu].ce_queue);
+ cpu_infos[cpu].ce_nums = 0;
+ } else
+ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
+ cpu, cpu_state[cpu_infos[cpu].state]);
+}
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
new file mode 100644
index 0000000..1159853
--- /dev/null
+++ b/ras-cpu-isolation.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAS_CPU_ISOLATION_H
+#define __RAS_CPU_ISOLATION_H
+
+#include "queue.h"
+
+#define MAX_PATH_LEN 100
+#define MAX_BUF_LEN 1024
+
+struct param {
+ char *name;
+ unsigned long value;
+};
+
+struct isolation_param {
+ char *name;
+ const struct param *units;
+ unsigned long value;
+ unsigned long limit;
+};
+
+enum cpu_state {
+ CPU_OFFLINE,
+ CPU_ONLINE,
+ CPU_OFFLINE_FAILED,
+ CPU_UNKNOWN,
+};
+
+enum error_handle_result {
+ HANDLE_FAILED = -1,
+ HANDLE_SUCCEED,
+ HANDLE_NOTHING,
+};
+
+enum error_type {
+ CE = 1
+};
+
+struct cpu_info {
+ unsigned long ce_nums;
+ struct link_queue *ce_queue;
+ enum cpu_state state;
+};
+
+struct error_info {
+ unsigned long nums;
+ time_t time;
+ enum error_type err_type;
+};
+
+void ras_cpu_isolation_init(unsigned int cpus);
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
+void cpu_infos_free(void);
+
+#endif
diff --git a/ras-events.c b/ras-events.c
index 39cab20..beda655 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -42,6 +42,7 @@
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
+#include "ras-cpu-isolation.h"
/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
@@ -856,6 +857,10 @@ int handle_ras_events(int record_events)
cpus = get_num_cpus(ras);
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ ras_cpu_isolation_init(cpus);
+#endif
+
#ifdef HAVE_MCE
rc = register_mce_handler(ras, cpus);
if (rc)
@@ -982,6 +987,8 @@ err:
}
free(ras);
}
-
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ cpu_infos_free();
+#endif
return rc;
}
--
2.25.1