!52 22.03为最新代码,master上补丁回合有问题,因此进行同步

From: @Lostwayzxc 
Reviewed-by: @overweight 
Signed-off-by: @overweight
This commit is contained in:
openeuler-ci-bot 2022-03-28 09:20:25 +00:00 committed by Gitee
commit 04297083dc
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
12 changed files with 1843 additions and 1185 deletions

View File

@ -0,0 +1,906 @@
From a8e02e7d3d910eb7d049fd4126d53b8d3121d798 Mon Sep 17 00:00:00 2001
From: Shengwei Luo <luoshengwei@huawei.com>
Date: Wed, 23 Feb 2022 17:21:58 +0800
Subject: [PATCH 1/2] Support cpu fault isolation for corrected errors
When the number of corrected errors within a cycle exceeds the configured
threshold, try to offline the related cpu core.
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
---
Makefile.am | 6 +-
configure.ac | 11 ++
misc/rasdaemon.env | 17 ++
queue.c | 121 ++++++++++++++
queue.h | 39 +++++
ras-arm-handler.c | 84 ++++++++++
ras-arm-handler.h | 18 +++
ras-cpu-isolation.c | 378 ++++++++++++++++++++++++++++++++++++++++++++
ras-cpu-isolation.h | 68 ++++++++
ras-events.c | 9 +-
10 files changed, 749 insertions(+), 2 deletions(-)
create mode 100644 queue.c
create mode 100644 queue.h
create mode 100644 ras-cpu-isolation.c
create mode 100644 ras-cpu-isolation.h
diff --git a/Makefile.am b/Makefile.am
index fabca78..242ceb7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -63,13 +63,17 @@ endif
if WITH_AMP_NS_DECODE
rasdaemon_SOURCES += non-standard-ampere.c
endif
+if WITH_CPU_FAULT_ISOLATION
+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
+endif
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h
+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ ras-cpu-isolation.h queue.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index 33b81fe..d098fcf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"],
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
+AC_ARG_ENABLE([cpu_fault_isolation],
+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation]))
+
+AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [
+ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation")
+ AC_SUBST([WITH_CPU_FAULT_ISOLATION])
+])
+AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
+AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
+
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
@@ -201,4 +211,5 @@ compile time options summary
Memory Failure : $USE_MEMORY_FAILURE
Memory CE PFA : $USE_MEMORY_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION
EOF
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index 12fd766..7cb18e8 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50"
# soft-then-hard First try to soft offline, then try hard offlining.
# Note: default offline choice is "soft".
PAGE_CE_ACTION="soft"
+
+# CPU Online Fault Isolation
+# Whether to enable cpu online fault isolation (yes|no).
+CPU_ISOLATION_ENABLE="no"
+# Specify the threshold of CE numbers.
+#
+# Format:
+# [0-9]+[unit]
+#
+# Supported units:
+# CPU_CE_THRESHOLD: no unit
+# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second); without a unit the value is in seconds
+CPU_CE_THRESHOLD="18"
+CPU_ISOLATION_CYCLE="24h"
+
+# Limit the number of offlined CPUs, to prevent excessive isolation from causing an avalanche effect
+CPU_ISOLATION_LIMIT="10"
\ No newline at end of file
diff --git a/queue.c b/queue.c
new file mode 100644
index 0000000..ed66798
--- /dev/null
+++ b/queue.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "queue.h"
+#include "ras-logger.h"
+
+/* Report whether the queue holds no nodes; a NULL queue counts as empty. */
+int is_empty(struct link_queue *queue)
+{
+	if (!queue)
+		return 1;
+
+	return queue->size == 0;
+}
+
+/*
+ * Allocate a queue with no nodes.
+ * Returns the new queue, or NULL (after logging an error) when the
+ * allocation fails.
+ */
+struct link_queue *init_queue(void)
+{
+	struct link_queue *q = (struct link_queue *)malloc(sizeof(*q));
+
+	if (!q) {
+		log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n");
+		return NULL;
+	}
+
+	q->head = NULL;
+	q->tail = NULL;
+	q->size = 0;
+
+	return q;
+}
+
+/* Free every node of the queue and reset it to the empty state; NULL is a no-op. */
+void clear_queue(struct link_queue *queue)
+{
+	struct queue_node *cur;
+	struct queue_node *following;
+
+	if (!queue)
+		return;
+
+	/* remember the successor before the current node is released */
+	for (cur = queue->head; cur != NULL; cur = following) {
+		following = cur->next;
+		free(cur);
+	}
+
+	queue->size = 0;
+	queue->tail = NULL;
+	queue->head = NULL;
+}
+
+/* Release all nodes of the queue and then the queue itself; NULL is a no-op. */
+void free_queue(struct link_queue *queue)
+{
+	if (!queue)
+		return;
+
+	clear_queue(queue);
+	free(queue);
+}
+
+/*
+ * Append node at the tail of the queue.
+ * No argument is checked here: the caller must guarantee that they are
+ * not NULL.
+ */
+void push(struct link_queue *queue, struct queue_node *node)
+{
+	if (queue->head != NULL)
+		queue->tail->next = node;	/* link behind the current last node */
+	else
+		queue->head = node;		/* first element of an empty queue */
+
+	queue->tail = node;
+	queue->size++;
+}
+
+/*
+ * Remove the node at the head of the queue and free it.
+ *
+ * Returns 0 when a node was removed, -1 when queue is NULL or holds no node.
+ */
+int pop(struct link_queue *queue)
+{
+	struct queue_node *tmp = NULL;
+
+	if (queue == NULL || is_empty(queue) || queue->head == NULL)
+		return -1;
+
+	tmp = queue->head;
+	queue->head = tmp->next;
+	/* the last node is gone: do not leave tail pointing at freed memory */
+	if (queue->head == NULL)
+		queue->tail = NULL;
+	free(tmp);
+	(queue->size)--;
+
+	return 0;
+}
+
+/* Return the head node without removing it; NULL for a NULL or empty queue. */
+struct queue_node *front(struct link_queue *queue)
+{
+	return queue ? queue->head : NULL;
+}
+
+/*
+ * Allocate an unlinked node carrying the given time and value.
+ * Returns NULL when the allocation fails.
+ */
+struct queue_node *node_create(time_t time, unsigned int value)
+{
+	struct queue_node *fresh = (struct queue_node *)malloc(sizeof(*fresh));
+
+	if (fresh == NULL)
+		return NULL;
+
+	fresh->next = NULL;
+	fresh->value = value;
+	fresh->time = time;
+
+	return fresh;
+}
diff --git a/queue.h b/queue.h
new file mode 100644
index 0000000..5459f40
--- /dev/null
+++ b/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAS_QUEUE_H
+#define __RAS_QUEUE_H
+
+struct queue_node { /* one entry of a singly linked queue */
+ time_t time; /* timestamp stored with this entry */
+ unsigned int value; /* number stored with this entry */
+ struct queue_node *next; /* following node; NULL on a freshly created node */
+};
+
+struct link_queue { /* FIFO of queue_node entries */
+ struct queue_node *head; /* oldest node, the one pop() removes */
+ struct queue_node *tail; /* newest node, push() links new nodes behind it */
+ int size; /* number of nodes currently in the queue */
+};
+
+int is_empty(struct link_queue *queue); /* non-zero when queue is NULL or has no nodes */
+struct link_queue *init_queue(void); /* new empty queue, NULL on allocation failure */
+void clear_queue(struct link_queue *queue); /* free all nodes, keep the queue itself */
+void free_queue(struct link_queue *queue); /* free all nodes and the queue */
+void push(struct link_queue *queue, struct queue_node *node); /* append at the tail; arguments are not NULL-checked */
+int pop(struct link_queue *queue); /* remove and free the head node; 0 on success, -1 if nothing to remove */
+struct queue_node *front(struct link_queue *queue); /* head node without removing it, NULL if none */
+struct queue_node *node_create(time_t time, unsigned int value); /* new unlinked node, NULL on allocation failure */
+
+#endif
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 1149dc6..c9ef2fd 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -22,6 +22,10 @@
#include "ras-report.h"
#include "ras-non-standard-handler.h"
#include "non-standard-ampere.h"
+#include "ras-cpu-isolation.h"
+
+#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
+#define ARM_ERR_VALID_FLAGS BIT(1)
void display_raw_data(struct trace_seq *s,
const uint8_t *buf,
@@ -42,6 +46,44 @@ void display_raw_data(struct trace_seq *s,
}
}
+#ifdef HAVE_CPU_FAULT_ISOLATION
+static int count_errors(struct ras_arm_event *ev)
+{
+ struct ras_arm_err_info *err_info;
+ int num_pei;
+ int err_info_size = sizeof(struct ras_arm_err_info); /* size of one error information entry */
+ int num = 0; /* errors summed over all entries; also the return value */
+ int i;
+ int error_count;
+
+ if (ev->pei_len % err_info_size != 0) { /* length must be a whole number of entries */
+ log(TERM, LOG_ERR,
+ "The event data does not match to the ARM Processor Error Information Structure\n");
+ return num; /* still 0 here */
+ }
+ num_pei = ev->pei_len / err_info_size; /* number of entries in the event */
+ err_info = (struct ras_arm_err_info *)(ev->pei_error); /* first entry */
+
+ for (i = 0; i < num_pei; ++i) {
+ error_count = 1; /* used when the entry has no valid count field */
+ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) { /* NOTE(review): 1 is added to the raw field for every encoding below, so a field value N >= 2 counts as N + 1 - confirm intended */
+ /*
+ * The value of this field is defined as follows:
+ * 0: Single Error
+ * 1: Multiple Errors
+ * 2-65535: Error Count
+ */
+ error_count = err_info->multiple_error + 1;
+ }
+
+ num += error_count;
+ err_info += 1;
+ }
+ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num);
+ return num;
+}
+#endif
+
int ras_arm_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context)
@@ -139,6 +181,48 @@ int ras_arm_event_handler(struct trace_seq *s,
display_raw_data(s, ev.vsei_error, ev.oem_len);
#endif
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ int cpu;
+ int nums;
+ char *severity;
+ struct error_info err_info;
+
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
+ return -1;
+ cpu = val;
+ trace_seq_printf(s, "\n cpu: %d", cpu);
+
+ /* record cpu error */
+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
+ return -1;
+ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */
+ switch (val) {
+ case GHES_SEV_NO:
+ severity = "Informational";
+ break;
+ case GHES_SEV_CORRECTED:
+ severity = "Corrected";
+ break;
+ case GHES_SEV_RECOVERABLE:
+ severity = "Recoverable";
+ break;
+ default:
+ case GHES_SEV_PANIC:
+ severity = "Fatal";
+ }
+ trace_seq_printf(s, "\n severity: %s", severity);
+
+ if (val == GHES_SEV_CORRECTED) {
+ nums = count_errors(&ev);
+ if (nums > 0) {
+ err_info.nums = nums;
+ err_info.time = now;
+ err_info.err_type = val;
+ ras_record_cpu_error(&err_info, cpu);
+ }
+ }
+#endif
+
/* Insert data into the SGBD */
#ifdef HAVE_SQLITE3
ras_store_arm_record(ras, &ev);
diff --git a/ras-arm-handler.h b/ras-arm-handler.h
index 563a2d3..52813e7 100644
--- a/ras-arm-handler.h
+++ b/ras-arm-handler.h
@@ -17,6 +17,24 @@
#include "ras-events.h"
#include "libtrace/event-parse.h"
+/*
+ * ARM Processor Error Information Structure, According to
+ * UEFI_2_9 specification chapter N2.4.4.
+ */
+#pragma pack(1) /* byte-packed: no padding between the fields below */
+struct ras_arm_err_info {
+ uint8_t version;
+ uint8_t length;
+ uint16_t validation_bits; /* bit 0: multiple_error is valid, bit 1: flags is valid */
+ uint8_t type;
+ uint16_t multiple_error; /* 0: single error, 1: multiple errors, 2-65535: error count */
+ uint8_t flags;
+ uint64_t error_info;
+ uint64_t virt_fault_addr;
+ uint64_t physical_fault_addr;
+};
+#pragma pack() /* back to the default packing */
+
int ras_arm_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context);
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
new file mode 100644
index 0000000..8c0cdf9
--- /dev/null
+++ b/ras-cpu-isolation.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <limits.h>
+#include <ctype.h>
+#include "ras-logger.h"
+#include "ras-cpu-isolation.h"
+
+static struct cpu_info *cpu_infos; /* one entry per cpu, allocated by init_cpu_info() */
+static unsigned int ncores; /* number of entries in cpu_infos */
+static unsigned int enabled = 1; /* cleared by ras_cpu_isolation_init() when isolation is off */
+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; /* read for the state, written to offline a cpu */
+
+static const struct param normal_units[] = { /* for settings that take no unit */
+ {"", 1},
+ {} /* terminator: name is NULL */
+};
+
+static const struct param cycle_units[] = { /* unit suffix -> number of seconds */
+ {"d", 24 * 60 * 60},
+ {"h", 60 * 60},
+ {"m", 60},
+ {"s", 1},
+ {} /* terminator: name is NULL */
+};
+
+static struct isolation_param threshold = { /* corrected-error count at which a cpu is offlined */
+ .name = "CPU_CE_THRESHOLD",
+ .units = normal_units,
+ .value = 18,
+ .limit = 10000
+};
+
+static struct isolation_param cpu_limit = { /* value and limit are filled in by init_cpu_info() */
+ .name = "CPU_ISOLATION_LIMIT",
+ .units = normal_units
+};
+
+static struct isolation_param cycle = { /* counting window in seconds: default one day, at most 30 days */
+ .name = "CPU_ISOLATION_CYCLE",
+ .units = cycle_units,
+ .value = 24 * 60 * 60,
+ .limit = 30 * 24 * 60 * 60
+};
+
+static const char * const cpu_state[] = { /* printable names, indexed by enum cpu_state */
+ [CPU_OFFLINE] = "offline",
+ [CPU_ONLINE] = "online",
+ [CPU_OFFLINE_FAILED] = "offline-failed",
+ [CPU_UNKNOWN] = "unknown"
+};
+
+/*
+ * Build a path from format and the cpu number, then open it with the given
+ * flags. Returns the file descriptor, or -1 (after logging) when open fails.
+ */
+static int open_sys_file(unsigned int cpu, int __oflag, const char *format)
+{
+	char path[MAX_PATH_LEN] = "";
+	int fd;
+
+	snprintf(path, sizeof(path), format, cpu);
+
+	fd = open(path, __oflag);
+	if (fd != -1)
+		return fd;
+
+	log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+	return -1;
+}
+
+/*
+ * Read the first character of the cpu's online file and return it as a
+ * cpu_state value. CPU_UNKNOWN is returned when the file cannot be opened
+ * or read, when the character is not a number, or when the number lies
+ * outside the range 0..CPU_UNKNOWN.
+ */
+static int get_cpu_status(unsigned int cpu)
+{
+	char digit[2] = "";
+	int state = CPU_UNKNOWN;
+	int parsed;
+	int fd;
+
+	fd = open_sys_file(cpu, O_RDONLY, cpu_path_format);
+	if (fd == -1)
+		return CPU_UNKNOWN;
+
+	if (read(fd, digit, 1) > 0 && sscanf(digit, "%d", &parsed) == 1)
+		state = parsed;
+
+	close(fd);
+
+	if (state < 0 || state > CPU_UNKNOWN)
+		return CPU_UNKNOWN;
+
+	return state;
+}
+
+/*
+ * Allocate and initialise the per-cpu bookkeeping for cpus cores, and derive
+ * the bounds of the offline limit from the number of cores.
+ *
+ * Returns 0 on success, -1 when an allocation fails. After a failure the
+ * array may be partly initialised; entries that were not reached keep a NULL
+ * ce_queue, so cpu_infos_free() can still be called safely.
+ */
+static int init_cpu_info(unsigned int cpus)
+{
+	ncores = cpus;
+	/* zero-filled so that every ce_queue is NULL until init_queue() succeeds */
+	cpu_infos = (struct cpu_info *)calloc(cpus, sizeof(*cpu_infos));
+
+	if (!cpu_infos) {
+		log(TERM, LOG_ERR,
+		    "Failed to allocate memory for cpu infos in %s.\n", __func__);
+		return -1;
+	}
+
+	for (unsigned int i = 0; i < cpus; ++i) {
+		cpu_infos[i].ce_nums = 0;
+		cpu_infos[i].state = get_cpu_status(i);
+		cpu_infos[i].ce_queue = init_queue();
+
+		if (cpu_infos[i].ce_queue == NULL) {
+			log(TERM, LOG_ERR,
+			    "Failed to allocate memory for cpu ce queue in %s.\n", __func__);
+			return -1;
+		}
+	}
+	/* set limit of offlined cpu limit according to number of cpu */
+	cpu_limit.limit = cpus - 1;
+	cpu_limit.value = 0;
+
+	return 0;
+}
+
+/* Clamp the configured value to its limit, logging a warning when it was too large. */
+static void check_config(struct isolation_param *config)
+{
+	if (config->value <= config->limit)
+		return;
+
+	log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n",
+	    config->value, config->limit);
+	config->value = config->limit;
+}
+
+/*
+ * Parse env as "<digits>[unit]" and store the result in *value.
+ *
+ * config supplies the table of accepted unit suffixes: a trailing letter is
+ * looked up there case-insensitively and the number is multiplied by the
+ * matching factor. The digits are accumulated onto the incoming *value, so
+ * the caller must pass it in as 0.
+ *
+ * Returns 0 on success, -1 for a NULL or empty string, a non-digit
+ * character, an unknown unit, or a result that does not fit in an
+ * unsigned long.
+ */
+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value)
+{
+	char *unit = NULL;
+	int env_size, has_unit = 0;
+
+	if (!env || strlen(env) == 0)
+		return -1;
+
+	env_size = strlen(env);
+	unit = env + env_size - 1;
+
+	/* <ctype.h> functions need an unsigned char value; plain char may be negative */
+	if (isalpha((unsigned char)*unit)) {
+		has_unit = 1;
+		env_size--;
+		if (env_size <= 0)
+			return -1;
+	}
+
+	for (int i = 0; i < env_size; ++i) {
+		unsigned long digit;
+
+		if (!isdigit((unsigned char)env[i]))
+			return -1;
+
+		digit = env[i] - '0';
+		/* reject before the multiply-add below could wrap around */
+		if (*value > ULONG_MAX / 10 ||
+		    (*value == ULONG_MAX / 10 && digit > ULONG_MAX % 10)) {
+			log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX);
+			return -1;
+		}
+		*value = 10 * (*value) + digit;
+	}
+
+	if (has_unit) {
+		for (const struct param *units = config->units; units->name; units++) {
+			/* value character and unit character are both valid */
+			if (!strcasecmp(unit, units->name)) {
+				if (*value > (ULONG_MAX / units->value)) {
+					log(TERM, LOG_ERR,
+					    "%s is out of range: %lu\n", env, ULONG_MAX);
+					return -1;
+				}
+				*value = (*value) * units->value;
+				return 0;
+			}
+		}
+		log(TERM, LOG_ERR, "Invalid unit %s\n", unit);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Load one setting from the environment variable named by config->name.
+ * When the variable is unset or cannot be parsed, the value already stored
+ * in config is kept; otherwise the parsed value is stored and clamped to
+ * config->limit.
+ */
+static void init_config(struct isolation_param *config)
+{
+	char *env = getenv(config->name);
+	unsigned long value = 0;
+
+	if (parse_ul_config(config, env, &value) < 0) {
+		/* env is NULL when the variable is unset: never hand NULL to %s */
+		log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n",
+		    config->name, env ? env : "(unset)", config->value);
+		return;
+	}
+
+	config->value = value;
+	check_config(config);
+}
+
+/*
+ * Return 0 when CPU_ISOLATION_ENABLE is set to "yes" (any letter case),
+ * -1 otherwise.
+ */
+static int check_config_status(void)
+{
+	const char *setting = getenv("CPU_ISOLATION_ENABLE");
+
+	if (setting != NULL && strcasecmp(setting, "yes") == 0)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Set up the per-cpu bookkeeping for cpus cores and read the isolation
+ * settings from the environment. Isolation is switched off when the
+ * bookkeeping cannot be set up or CPU_ISOLATION_ENABLE is not "yes".
+ */
+void ras_cpu_isolation_init(unsigned int cpus)
+{
+	/* the enable switch is only consulted once the bookkeeping exists */
+	int ready = init_cpu_info(cpus) >= 0 && check_config_status() >= 0;
+
+	if (!ready) {
+		enabled = 0;
+		log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n");
+		return;
+	}
+
+	log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n");
+	init_config(&threshold);
+	init_config(&cpu_limit);
+	init_config(&cycle);
+}
+
+/*
+ * Release the per-cpu bookkeeping: every cpu's error queue and the array
+ * itself. Safe to call when nothing was allocated, and safe to call twice.
+ */
+void cpu_infos_free(void)
+{
+	if (!cpu_infos)
+		return;
+
+	/* ncores is unsigned, so the index is unsigned as well */
+	for (unsigned int i = 0; i < ncores; ++i)
+		free_queue(cpu_infos[i].ce_queue);
+
+	free(cpu_infos);
+	/* drop the stale pointer so that later calls find nothing to use or free */
+	cpu_infos = NULL;
+	ncores = 0;
+}
+
+/*
+ * Try to take the cpu offline by writing "0" to its online file, then read
+ * the state back. Returns HANDLE_SUCCEED when the cpu reads back as offline,
+ * HANDLE_FAILED otherwise.
+ */
+static int do_cpu_offline(unsigned int cpu)
+{
+	static const char offline_cmd[] = "0";
+	int fd, rc;
+
+	/* assume failure until the state has been read back after the write */
+	cpu_infos[cpu].state = CPU_OFFLINE_FAILED;
+
+	fd = open_sys_file(cpu, O_RDWR, cpu_path_format);
+	if (fd == -1)
+		return HANDLE_FAILED;
+
+	rc = write(fd, offline_cmd, sizeof(offline_cmd) - 1);
+	if (rc < 0) {
+		log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno);
+		close(fd);
+		return HANDLE_FAILED;
+	}
+	close(fd);
+
+	/* check whether the cpu is isolated successfully */
+	cpu_infos[cpu].state = get_cpu_status(cpu);
+
+	return cpu_infos[cpu].state == CPU_OFFLINE ? HANDLE_SUCCEED : HANDLE_FAILED;
+}
+
+static int do_ce_handler(unsigned int cpu)
+{
+ struct link_queue *queue = cpu_infos[cpu].ce_queue; /* this cpu's recorded corrected-error events */
+ unsigned int tmp; /* count carried by the node about to be popped */
+ /*
+ * Only errors inside the configured cycle are counted. Every event was
+ * queued with its time and error count, so compare the oldest entry
+ * (head) with the newest (tail): while they are more than one cycle
+ * apart, pop the head and subtract its count from the running total,
+ * until the remaining entries span no more than one cycle.
+ */
+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) {
+ tmp = queue->head->value; /* saved first: pop() frees the node */
+ if (pop(queue) == 0)
+ cpu_infos[cpu].ce_nums -= tmp;
+ }
+ log(TERM, LOG_INFO,
+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n",
+ cpu, cpu_infos[cpu].ce_nums);
+
+ if (cpu_infos[cpu].ce_nums >= threshold.value) { /* running total reached the threshold */
+ log(TERM, LOG_INFO,
+ "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n",
+ threshold.value, cpu);
+ return do_cpu_offline(cpu);
+ }
+ return HANDLE_NOTHING;
+}
+
+static int error_handler(unsigned int cpu, struct error_info *err_info)
+{
+ int ret = HANDLE_NOTHING;
+
+ switch (err_info->err_type) { /* dispatch on the error type; unlisted types leave ret unchanged */
+ case CE:
+ ret = do_ce_handler(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return ret; /* an error_handle_result value */
+}
+
+static void record_error_info(unsigned int cpu, struct error_info *err_info)
+{
+ switch (err_info->err_type) {
+ case CE:
+ {
+ struct queue_node *node = node_create(err_info->time, err_info->nums); /* one queue entry per event */
+
+ if (node == NULL) {
+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n");
+ return; /* the event is dropped: neither queued nor counted */
+ }
+ push(cpu_infos[cpu].ce_queue, node);
+ cpu_infos[cpu].ce_nums += err_info->nums; /* keep the running total in step with the queue */
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
+{
+ int ret;
+
+ if (enabled == 0) /* isolation was switched off during initialisation */
+ return;
+
+ if (cpu >= ncores || cpu < 0) {
+ log(TERM, LOG_ERR,
+ "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
+ return;
+ }
+
+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu);
+ cpu_infos[cpu].state = get_cpu_status(cpu); /* re-read the current state from the online file */
+
+ if (cpu_infos[cpu].state != CPU_ONLINE) {
+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu);
+ return;
+ }
+
+ record_error_info(cpu, err_info); /* recorded even if the limit check below stops the handling */
+ /*
+ * Since the user may change cpu states at any time, the number of
+ * offlined cpus (total minus currently online) is recomputed every call.
+ */
+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) {
+ log(TERM, LOG_WARNING,
+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n",
+ cpu_limit.value);
+ return;
+ }
+
+ ret = error_handler(cpu, err_info);
+
+ if (ret == HANDLE_NOTHING)
+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu);
+ else if (ret == HANDLE_SUCCEED) {
+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n",
+ cpu, cpu_state[cpu_infos[cpu].state]);
+ clear_queue(cpu_infos[cpu].ce_queue); /* the cpu is offline: discard its error history */
+ cpu_infos[cpu].ce_nums = 0;
+ } else
+ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
+ cpu, cpu_state[cpu_infos[cpu].state]);
+}
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
new file mode 100644
index 0000000..1159853
--- /dev/null
+++ b/ras-cpu-isolation.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAS_CPU_ISOLATION_H
+#define __RAS_CPU_ISOLATION_H
+
+#include "queue.h"
+
+#define MAX_PATH_LEN 100
+#define MAX_BUF_LEN 1024
+
+struct param { /* one accepted unit suffix of a setting */
+ char *name; /* unit suffix, compared case-insensitively; NULL ends a table */
+ unsigned long value; /* factor the parsed number is multiplied by */
+};
+
+struct isolation_param { /* one setting read from the environment */
+ char *name; /* name of the environment variable holding the setting */
+ const struct param *units; /* table of accepted unit suffixes */
+ unsigned long value; /* current value; keeps its initial value when the variable is unset or invalid */
+ unsigned long limit; /* largest accepted value; larger input is clamped to it */
+};
+
+enum cpu_state { /* the number read from a cpu's online file is used directly as one of these values */
+ CPU_OFFLINE,
+ CPU_ONLINE,
+ CPU_OFFLINE_FAILED, /* an offline attempt was made and did not succeed */
+ CPU_UNKNOWN, /* the state could not be read, or was out of range */
+};
+
+enum error_handle_result { /* outcome of handling one recorded error */
+ HANDLE_FAILED = -1, /* an offline attempt was made and failed */
+ HANDLE_SUCCEED, /* the cpu was taken offline */
+ HANDLE_NOTHING, /* no offline attempt was made */
+};
+
+enum error_type { /* kind of error carried by struct error_info */
+ CE = 1 /* corrected error */
+};
+
+struct cpu_info { /* bookkeeping kept for one cpu */
+ unsigned long ce_nums; /* corrected errors currently counted for this cpu */
+ struct link_queue *ce_queue; /* the recorded events (time, count) that ce_nums sums up */
+ enum cpu_state state; /* state as last read or set */
+};
+
+struct error_info { /* one error event reported for a cpu */
+ unsigned long nums; /* number of errors in the event */
+ time_t time; /* time of the event */
+ enum error_type err_type; /* selects how the event is recorded and handled */
+};
+
+void ras_cpu_isolation_init(unsigned int cpus); /* allocate per-cpu state for cpus cores and read the settings */
+void ras_record_cpu_error(struct error_info *err_info, int cpu); /* record an error for cpu and offline it if warranted */
+void cpu_infos_free(void); /* release everything ras_cpu_isolation_init() allocated */
+
+#endif
diff --git a/ras-events.c b/ras-events.c
index ba769d1..491c17a 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -41,6 +41,7 @@
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
+#include "ras-cpu-isolation.h"
/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
@@ -879,6 +880,10 @@ int handle_ras_events(int record_events)
cpus = get_num_cpus(ras);
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ ras_cpu_isolation_init(cpus);
+#endif
+
#ifdef HAVE_MCE
rc = register_mce_handler(ras, cpus);
if (rc)
@@ -1005,6 +1010,8 @@ err:
}
free(ras);
}
-
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ cpu_infos_free();
+#endif
return rc;
}
--
2.27.0

View File

@ -0,0 +1,224 @@
From 62218a9c3aec44330ce3b77f3634c788b6e6f60c Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Wed, 2 Mar 2022 12:20:40 +0000
Subject: [PATCH 1/6] rasdaemon: Modify recording Hisilicon common error data
The error statistics for the Hisilicon common
error need to be computed per module, error severity, etc.
Record the Hisilicon common error data as separate fields
in the sql db table instead of a single combined field.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
non-standard-hisilicon.c | 122 ++++++++++++++++++++++++++++++++-------
1 file changed, 102 insertions(+), 20 deletions(-)
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
index 1432163..dc69d46 100644
--- a/non-standard-hisilicon.c
+++ b/non-standard-hisilicon.c
@@ -17,6 +17,7 @@
#include "non-standard-hisilicon.h"
#define HISI_BUF_LEN 2048
+#define HISI_PCIE_INFO_BUF_LEN 256
struct hisi_common_error_section {
uint32_t val_bits;
@@ -63,12 +64,25 @@ enum {
enum {
HISI_COMMON_FIELD_ID,
HISI_COMMON_FIELD_TIMESTAMP,
- HISI_COMMON_FIELD_ERR_INFO,
+ HISI_COMMON_FIELD_VERSION,
+ HISI_COMMON_FIELD_SOC_ID,
+ HISI_COMMON_FIELD_SOCKET_ID,
+ HISI_COMMON_FIELD_TOTEM_ID,
+ HISI_COMMON_FIELD_NIMBUS_ID,
+ HISI_COMMON_FIELD_SUB_SYSTEM_ID,
+ HISI_COMMON_FIELD_MODULE_ID,
+ HISI_COMMON_FIELD_SUB_MODULE_ID,
+ HISI_COMMON_FIELD_CORE_ID,
+ HISI_COMMON_FIELD_PORT_ID,
+ HISI_COMMON_FIELD_ERR_TYPE,
+ HISI_COMMON_FIELD_PCIE_INFO,
+ HISI_COMMON_FIELD_ERR_SEVERITY,
HISI_COMMON_FIELD_REGS_DUMP,
};
struct hisi_event {
char error_msg[HISI_BUF_LEN];
+ char pcie_info[HISI_PCIE_INFO_BUF_LEN];
char reg_msg[HISI_BUF_LEN];
};
@@ -134,12 +148,24 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
static const struct db_fields hisi_common_section_fields[] = {
{ .name = "id", .type = "INTEGER PRIMARY KEY" },
{ .name = "timestamp", .type = "TEXT" },
- { .name = "err_info", .type = "TEXT" },
+ { .name = "version", .type = "INTEGER" },
+ { .name = "soc_id", .type = "INTEGER" },
+ { .name = "socket_id", .type = "INTEGER" },
+ { .name = "totem_id", .type = "INTEGER" },
+ { .name = "nimbus_id", .type = "INTEGER" },
+ { .name = "sub_system_id", .type = "INTEGER" },
+ { .name = "module_id", .type = "TEXT" },
+ { .name = "sub_module_id", .type = "INTEGER" },
+ { .name = "core_id", .type = "INTEGER" },
+ { .name = "port_id", .type = "INTEGER" },
+ { .name = "err_type", .type = "INTEGER" },
+ { .name = "pcie_info", .type = "TEXT" },
+ { .name = "err_severity", .type = "TEXT" },
{ .name = "regs_dump", .type = "TEXT" },
};
static const struct db_table_descriptor hisi_common_section_tab = {
- .name = "hisi_common_section",
+ .name = "hisi_common_section_v2",
.fields = hisi_common_section_fields,
.num_fields = ARRAY_SIZE(hisi_common_section_fields),
};
@@ -199,12 +225,20 @@ static const char* get_soc_desc(uint8_t soc_id)
return soc_desc[soc_id];
}
-static void decode_module(struct hisi_event *event, uint8_t module_id)
+static void decode_module(struct ras_ns_ev_decoder *ev_decoder,
+ struct hisi_event *event, uint8_t module_id)
{
- if (module_id >= sizeof(module_name)/sizeof(char *))
+ if (module_id >= sizeof(module_name)/sizeof(char *)) {
HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id);
- else
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_MODULE_ID,
+ 0, "unknown");
+ } else {
HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_MODULE_ID,
+ 0, module_name[module_id]);
+ }
}
static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
@@ -212,43 +246,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
struct hisi_event *event)
{
HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version);
- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID))
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_VERSION,
+ err->version, NULL);
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) {
HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id));
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_SOC_ID,
+ err->soc_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) {
HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_SOCKET_ID,
+ err->socket_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) {
HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_TOTEM_ID,
+ err->totem_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) {
HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_NIMBUS_ID,
+ err->nimbus_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) {
HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_SUB_SYSTEM_ID,
+ err->subsystem_id, NULL);
+ }
if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID))
- decode_module(event, err->module_id);
+ decode_module(ev_decoder, event, err->module_id);
- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) {
HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_SUB_MODULE_ID,
+ err->submodule_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) {
HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_CORE_ID,
+ err->core_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) {
HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_PORT_ID,
+ err->port_id, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) {
HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
+ HISI_COMMON_FIELD_ERR_TYPE,
+ err->err_type, NULL);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) {
HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x",
err->pcie_info.segment, err->pcie_info.bus,
err->pcie_info.device, err->pcie_info.function);
+ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x",
+ err->pcie_info.segment, err->pcie_info.bus,
+ err->pcie_info.device, err->pcie_info.function);
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_PCIE_INFO,
+ 0, event->pcie_info);
+ }
- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY))
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) {
HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity));
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_ERR_SEVERITY,
+ 0, err_severity(err->err_severity));
+ }
HISI_SNPRINTF(event->error_msg, "]");
}
@@ -293,8 +377,6 @@ static int decode_hisi_common_section(struct ras_events *ras,
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HISI_COMMON_FIELD_TIMESTAMP,
0, event->timestamp);
- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
--
2.25.1

View File

@ -0,0 +1,138 @@
From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001
From: Shengwei Luo <luoshengwei@huawei.com>
Date: Wed, 23 Feb 2022 17:23:27 +0800
Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors
When the recoverable errors in cpu core occurred, try to offline
the related cpu core.
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
---
ras-arm-handler.c | 21 ++++++++++++++++++---
ras-cpu-isolation.c | 17 +++++++++++++++++
ras-cpu-isolation.h | 4 +++-
3 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index c9ef2fd..dae5ad6 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s,
}
#ifdef HAVE_CPU_FAULT_ISOLATION
-static int count_errors(struct ras_arm_event *ev)
+static int is_core_failure(struct ras_arm_err_info *err_info)
+{
+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
+ /*
+ * core failure:
+ * Bit 0\1\3: (at least 1)
+ * Bit 2: 0
+ */
+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2));
+ }
+ return 0;
+}
+
+static int count_errors(struct ras_arm_event *ev, int sev)
{
struct ras_arm_err_info *err_info;
int num_pei;
@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev)
*/
error_count = err_info->multiple_error + 1;
}
+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
+ error_count = 0;
num += error_count;
err_info += 1;
@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s,
}
trace_seq_printf(s, "\n severity: %s", severity);
- if (val == GHES_SEV_CORRECTED) {
- nums = count_errors(&ev);
+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
+ nums = count_errors(&ev, val);
if (nums > 0) {
err_info.nums = nums;
err_info.time = now;
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
index 8c0cdf9..e650022 100644
--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus)
for (unsigned int i = 0; i < cpus; ++i) {
cpu_infos[i].ce_nums = 0;
+ cpu_infos[i].uce_nums = 0;
cpu_infos[i].state = get_cpu_status(i);
cpu_infos[i].ce_queue = init_queue();
@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu)
return HANDLE_NOTHING;
}
+static int do_uce_handler(unsigned int cpu)
+{
+ if (cpu_infos[cpu].uce_nums > 0) {
+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu);
+ return do_cpu_offline(cpu);
+ }
+ return HANDLE_NOTHING;
+}
+
static int error_handler(unsigned int cpu, struct error_info *err_info)
{
int ret = HANDLE_NOTHING;
@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
case CE:
ret = do_ce_handler(cpu);
break;
+ case UCE:
+ ret = do_uce_handler(cpu);
+ break;
default:
break;
}
@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
cpu_infos[cpu].ce_nums += err_info->nums;
break;
}
+ case UCE:
+ cpu_infos[cpu].uce_nums++;
+ break;
default:
break;
}
@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
cpu, cpu_state[cpu_infos[cpu].state]);
clear_queue(cpu_infos[cpu].ce_queue);
cpu_infos[cpu].ce_nums = 0;
+ cpu_infos[cpu].uce_nums = 0;
} else
log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
cpu, cpu_state[cpu_infos[cpu].state]);
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
index 1159853..024a68b 100644
--- a/ras-cpu-isolation.h
+++ b/ras-cpu-isolation.h
@@ -46,10 +46,12 @@ enum error_handle_result {
};
enum error_type {
- CE = 1
+ CE = 1,
+ UCE
};
struct cpu_info {
+ unsigned long uce_nums;
unsigned long ce_nums;
struct link_queue *ce_queue;
enum cpu_state state;
--
2.27.0

View File

@ -0,0 +1,97 @@
From 4d9f297028ce3116eaf574b2570d71a4ed666b7d Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Thu, 24 Feb 2022 18:02:14 +0000
Subject: [PATCH 2/6] rasdaemon: ras-mc-ctl: Modify error statistics for
HiSilicon Kunpeng9xx common errors
Modify the error statistics for the HiSilicon Kunpeng9xx platforms common errors
to display the statistics and error info based on the module and the error severity.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++-----------
1 file changed, 29 insertions(+), 11 deletions(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1e3aeb7..22ba1fd 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1535,7 +1535,7 @@ sub vendor_errors_summary
require DBI;
my ($num_args, $platform_id);
my ($query, $query_handle, $count, $out);
- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info);
+ my ($module_id, $sub_module_id, $err_severity, $err_sev);
$num_args = $#ARGV + 1;
$platform_id = 0;
@@ -1612,13 +1612,18 @@ sub vendor_errors_summary
# HiSilicon Kunpeng9xx common errors
if ($platform_id eq HISILICON_KUNPENG_9XX) {
- $query = "select err_info, count(*) from hisi_common_section";
+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($err_info, $count));
+ $query_handle->bind_columns(\($err_severity, $module_id, $count));
$out = "";
+ $err_sev = "";
while($query_handle->fetch()) {
- $out .= "\terrors: $count\n";
+ if ($err_severity ne $err_sev) {
+ $out .= "$err_severity errors:\n";
+ $err_sev = $err_severity;
+ }
+ $out .= "\t$module_id: $count\n";
}
if ($out ne "") {
print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
@@ -1636,8 +1641,8 @@ sub vendor_errors
require DBI;
my ($num_args, $platform_id);
my ($query, $query_handle, $id, $timestamp, $out);
- my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id);
- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs);
+ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id);
+ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs);
$num_args = $#ARGV + 1;
$platform_id = 0;
@@ -1725,15 +1730,28 @@ sub vendor_errors
# HiSilicon Kunpeng9xx common errors
if ($platform_id eq HISILICON_KUNPENG_9XX) {
- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id";
+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs));
+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));
$out = "";
while($query_handle->fetch()) {
- $out .= "$id. $timestamp ";
- $out .= "Error Info:$err_info \n" if ($err_info);
- $out .= "Error Registers: $regs\n\n" if ($regs);
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "totem_id=$totem_id, " if ($totem_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
+ $out .= "module_id=$module_id, " if ($module_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "core_id=$core_id, " if ($core_id);
+ $out .= "port_id=$port_id, " if ($port_id);
+ $out .= "err_type=$err_type, " if ($err_type);
+ $out .= "pcie_info=$pcie_info, " if ($pcie_info);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "Error Registers: $regs" if ($regs);
+ $out .= "\n\n";
}
if ($out ne "") {
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
--
2.25.1

View File

@ -0,0 +1,56 @@
From eb93d77b417b58cba27799ae85747b8a193cf063 Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Sat, 5 Mar 2022 16:18:55 +0000
Subject: [PATCH 3/6] rasdaemon: ras-mc-ctl: Reformat error info of the
HiSilicon Kunpeng920
Reformat the code to display the error info of HiSilicon Kunpeng920.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 22ba1fd..eeaf885 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1669,8 +1669,9 @@ sub vendor_errors
$out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
$out .= "module_id=$module_id, " if ($module_id);
$out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
- $out .= "err_severity=$err_severity, \n" if ($err_severity);
- $out .= "Error Registers: $regs\n\n" if ($regs);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "Error Registers: $regs " if ($regs);
+ $out .= "\n\n";
}
if ($out ne "") {
print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
@@ -1692,8 +1693,9 @@ sub vendor_errors
$out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
$out .= "module_id=$module_id, " if ($module_id);
$out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
- $out .= "err_severity=$err_severity, \n" if ($err_severity);
- $out .= "Error Registers: $regs\n\n" if ($regs);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "Error Registers: $regs " if ($regs);
+ $out .= "\n\n";
}
if ($out ne "") {
print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
@@ -1717,8 +1719,9 @@ sub vendor_errors
$out .= "core_id=$core_id, " if ($core_id);
$out .= "port_id=$port_id, " if ($port_id);
$out .= "err_severity=$err_severity, " if ($err_severity);
- $out .= "err_type=$err_type, \n" if ($err_type);
- $out .= "Error Registers: $regs\n\n" if ($regs);
+ $out .= "err_type=$err_type, " if ($err_type);
+ $out .= "Error Registers: $regs " if ($regs);
+ $out .= "\n\n";
}
if ($out ne "") {
print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
--
2.25.1

View File

@ -0,0 +1,36 @@
From 623e85c07ab21ccc89ffe2bb444eb000a2664a9d Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Sat, 5 Mar 2022 17:01:35 +0000
Subject: [PATCH 4/6] rasdaemon: ras-mc-ctl: Add printing usage if necessary
parameters are not passed for the HiSilicon vendor-errors options
Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options of the ras-mc-ctl.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 2 ++
1 file changed, 2 insertions(+)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index eeaf885..0e32cb1 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1542,6 +1542,7 @@ sub vendor_errors_summary
if ($num_args ne 0) {
$platform_id = $ARGV[0];
} else {
+ usage(1);
return;
}
@@ -1649,6 +1650,7 @@ sub vendor_errors
if ($num_args ne 0) {
$platform_id = $ARGV[0];
} else {
+ usage(1);
return;
}
--
2.25.1

View File

@ -0,0 +1,198 @@
From 4007c95f8a8d570542ffc11676b619ea5649d0e7 Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Sat, 5 Mar 2022 18:19:38 +0000
Subject: [PATCH 5/6] rasdaemon: ras-mc-ctl: Add support to display the
HiSilicon vendor errors for a specified module
Add support to display the HiSilicon vendor errors for a specified module.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 119 ++++++++++++++++++++++++---------------------
1 file changed, 63 insertions(+), 56 deletions(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 0e32cb1..d728300 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -96,7 +96,8 @@ Usage: $prog [OPTIONS...]
--errors Shows the errors stored at the error database.
--error-count Shows the corrected and uncorrected error counts using sysfs.
--vendor-errors-summary <platform-id> Presents a summary of the vendor-specific logged errors.
- --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
+ --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
+ --vendor-errors <platform-id> <module-name> Shows the vendor-specific errors for a specific module stored in the error database.
--vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors.
--help This help message.
EOF
@@ -1640,15 +1641,19 @@ sub vendor_errors_summary
sub vendor_errors
{
require DBI;
- my ($num_args, $platform_id);
+ my ($num_args, $platform_id, $module);
my ($query, $query_handle, $id, $timestamp, $out);
my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id);
my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs);
$num_args = $#ARGV + 1;
$platform_id = 0;
+ $module = 0;
if ($num_args ne 0) {
$platform_id = $ARGV[0];
+ if ($num_args gt 1) {
+ $module = $ARGV[1];
+ }
} else {
usage(1);
return;
@@ -1664,21 +1669,21 @@ sub vendor_errors
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
$out = "";
while($query_handle->fetch()) {
- $out .= "$id. $timestamp Error Info: ";
- $out .= "version=$version, ";
- $out .= "soc_id=$soc_id, " if ($soc_id);
- $out .= "socket_id=$socket_id, " if ($socket_id);
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
- $out .= "module_id=$module_id, " if ($module_id);
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
- $out .= "err_severity=$err_severity, " if ($err_severity);
- $out .= "Error Registers: $regs " if ($regs);
- $out .= "\n\n";
+ if ($module eq 0 || ($module_id && ($module eq $module_id))) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "module_id=$module_id, " if ($module_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "Error Registers: $regs " if ($regs);
+ $out .= "\n\n";
+ }
}
if ($out ne "") {
print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng920 OEM type1 errors.\n";
}
$query_handle->finish;
@@ -1688,21 +1693,21 @@ sub vendor_errors
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
$out = "";
while($query_handle->fetch()) {
- $out .= "$id. $timestamp Error Info: ";
- $out .= "version=$version, ";
- $out .= "soc_id=$soc_id, " if ($soc_id);
- $out .= "socket_id=$socket_id, " if ($socket_id);
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
- $out .= "module_id=$module_id, " if ($module_id);
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
- $out .= "err_severity=$err_severity, " if ($err_severity);
- $out .= "Error Registers: $regs " if ($regs);
- $out .= "\n\n";
+ if ($module eq 0 || ($module_id && ($module eq $module_id))) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "module_id=$module_id, " if ($module_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "Error Registers: $regs " if ($regs);
+ $out .= "\n\n";
+ }
}
if ($out ne "") {
print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng920 OEM type2 errors.\n";
}
$query_handle->finish;
@@ -1712,23 +1717,23 @@ sub vendor_errors
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs));
$out = "";
while($query_handle->fetch()) {
- $out .= "$id. $timestamp Error Info: ";
- $out .= "version=$version, ";
- $out .= "soc_id=$soc_id, " if ($soc_id);
- $out .= "socket_id=$socket_id, " if ($socket_id);
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
- $out .= "core_id=$core_id, " if ($core_id);
- $out .= "port_id=$port_id, " if ($port_id);
- $out .= "err_severity=$err_severity, " if ($err_severity);
- $out .= "err_type=$err_type, " if ($err_type);
- $out .= "Error Registers: $regs " if ($regs);
- $out .= "\n\n";
+ if ($module eq 0 || ($sub_module_id && ($module eq $sub_module_id))) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "core_id=$core_id, " if ($core_id);
+ $out .= "port_id=$port_id, " if ($port_id);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "err_type=$err_type, " if ($err_type);
+ $out .= "Error Registers: $regs " if ($regs);
+ $out .= "\n\n";
+ }
}
if ($out ne "") {
print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng920 PCIe controller errors.\n";
}
$query_handle->finish;
}
@@ -1741,22 +1746,24 @@ sub vendor_errors
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));
$out = "";
while($query_handle->fetch()) {
- $out .= "$id. $timestamp Error Info: ";
- $out .= "version=$version, ";
- $out .= "soc_id=$soc_id, " if ($soc_id);
- $out .= "socket_id=$socket_id, " if ($socket_id);
- $out .= "totem_id=$totem_id, " if ($totem_id);
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
- $out .= "module_id=$module_id, " if ($module_id);
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
- $out .= "core_id=$core_id, " if ($core_id);
- $out .= "port_id=$port_id, " if ($port_id);
- $out .= "err_type=$err_type, " if ($err_type);
- $out .= "pcie_info=$pcie_info, " if ($pcie_info);
- $out .= "err_severity=$err_severity, " if ($err_severity);
- $out .= "Error Registers: $regs" if ($regs);
- $out .= "\n\n";
+ if ($module eq 0 || ($module_id && ($module eq $module_id))) {
+ $out .= "$id. $timestamp Error Info: ";
+ $out .= "version=$version, ";
+ $out .= "soc_id=$soc_id, " if ($soc_id);
+ $out .= "socket_id=$socket_id, " if ($socket_id);
+ $out .= "totem_id=$totem_id, " if ($totem_id);
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
+ $out .= "module_id=$module_id, " if ($module_id);
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
+ $out .= "core_id=$core_id, " if ($core_id);
+ $out .= "port_id=$port_id, " if ($port_id);
+ $out .= "err_type=$err_type, " if ($err_type);
+ $out .= "pcie_info=$pcie_info, " if ($pcie_info);
+ $out .= "err_severity=$err_severity, " if ($err_severity);
+ $out .= "Error Registers: $regs" if ($regs);
+ $out .= "\n\n";
+ }
}
if ($out ne "") {
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
--
2.25.1

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,148 @@
From 88bf3126312645843152c6c3215b54b120bcc1ec Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Mon, 7 Mar 2022 12:38:45 +0000
Subject: [PATCH 6/6] rasdaemon: ras-mc-ctl: Relocate reading and display
Kunpeng920 errors to under Kunpeng9xx
Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
util/ras-mc-ctl.in | 38 ++++++++++----------------------------
1 file changed, 10 insertions(+), 28 deletions(-)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index d728300..2ab9602 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1527,7 +1527,6 @@ sub errors
# Definitions of the vendor platform IDs.
use constant {
- HISILICON_KUNPENG_920 => "Kunpeng920",
HISILICON_KUNPENG_9XX => "Kunpeng9xx",
};
@@ -1549,8 +1548,8 @@ sub vendor_errors_summary
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
- # HiSilicon Kunpeng920 errors
- if ($platform_id eq HISILICON_KUNPENG_920) {
+ # HiSilicon Kunpeng9xx common errors
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
$query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
@@ -1565,9 +1564,7 @@ sub vendor_errors_summary
$out .= "\t$module_id: $count\n";
}
if ($out ne "") {
- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n";
+ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n";
}
$query_handle->finish;
@@ -1585,9 +1582,7 @@ sub vendor_errors_summary
$out .= "\t$module_id: $count\n";
}
if ($out ne "") {
- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n";
+ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n";
}
$query_handle->finish;
@@ -1605,15 +1600,10 @@ sub vendor_errors_summary
$out .= "\t$sub_module_id: $count\n";
}
if ($out ne "") {
- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n";
+ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n";
}
$query_handle->finish;
- }
- # HiSilicon Kunpeng9xx common errors
- if ($platform_id eq HISILICON_KUNPENG_9XX) {
$query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
@@ -1629,8 +1619,6 @@ sub vendor_errors_summary
}
if ($out ne "") {
print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng9xx common errors.\n\n";
}
$query_handle->finish;
}
@@ -1661,8 +1649,8 @@ sub vendor_errors
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
- # HiSilicon Kunpeng920 errors
- if ($platform_id eq HISILICON_KUNPENG_920) {
+ # HiSilicon Kunpeng9xx common errors
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
$query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
@@ -1683,7 +1671,7 @@ sub vendor_errors
}
}
if ($out ne "") {
- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
+ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n";
}
$query_handle->finish;
@@ -1707,7 +1695,7 @@ sub vendor_errors
}
}
if ($out ne "") {
- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
+ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n";
}
$query_handle->finish;
@@ -1733,13 +1721,10 @@ sub vendor_errors
}
}
if ($out ne "") {
- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
+ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n";
}
$query_handle->finish;
- }
- # HiSilicon Kunpeng9xx common errors
- if ($platform_id eq HISILICON_KUNPENG_9XX) {
$query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
@@ -1767,8 +1752,6 @@ sub vendor_errors
}
if ($out ne "") {
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
- } else {
- print "No HiSilicon Kunpeng9xx common errors.\n";
}
$query_handle->finish;
}
@@ -1779,7 +1762,6 @@ sub vendor_errors
sub vendor_platforms
{
print "\nSupported platforms for the vendor-specific errors:\n";
- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n";
print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n";
print "\n";
}
--
2.25.1

View File

@ -1,78 +0,0 @@
From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001
From: Lostwayzxc <luoshengwei@huawei.com>
Date: Tue, 25 May 2021 20:07:26 +0800
Subject: [PATCH 2/2] add trace print of new information and add it to sqilte
Since we add new information of the event, we add trace print and store it to
Sqlite.
Signed-off-by: Luo Shengwei <luoshengwei@huawei.com>
---
ras-arm-handler.c | 10 ++++++++++
ras-record.c | 8 ++++++++
2 files changed, 18 insertions(+)
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 10d0099..23ad470 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -23,6 +23,13 @@
#include "ras-cpu-isolation.h"
#ifdef HAVE_CPU_FAULT_ISOLATION
+static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len)
+{
+ for (int i = 0; i < buf_len; ++i) {
+ trace_seq_printf(s, "%2.2x", buf[i]);
+ }
+}
+
static int is_core_failure(unsigned long value)
{
/*
@@ -135,6 +142,7 @@ int ras_arm_event_handler(struct trace_seq *s,
case GHES_SEV_PANIC:
ev.severity = "Fatal";
}
+ trace_seq_printf(s, "\n severity: %s", ev.severity);
if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
int len, nums;
@@ -142,6 +150,8 @@ int ras_arm_event_handler(struct trace_seq *s,
if (!ev.error_info)
return -1;
ev.length = len;
+ trace_seq_printf(s, "\n processor_err_info: ");
+ trace_print_hex(s, ev.error_info, len);
/* relate to enum error_type */
nums = count_errors(event, ev.error_info, len);
if (nums > 0) {
diff --git a/ras-record.c b/ras-record.c
index 549c494..33d4741 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = {
{ .name="err_info", .type="BLOB" },
{ .name="context_info", .type="BLOB" },
{ .name="vendor_info", .type="BLOB" },
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ { .name="severity", .type="TEXT" },
+ { .name="error_info", .type="BLOB" },
+#endif
};
static const struct db_table_descriptor arm_event_tab = {
@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
ev->ctx_error, ev->ctx_len, NULL);
sqlite3_bind_blob (priv->stmt_arm_record, 9,
ev->vsei_error, ev->oem_len, NULL);
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL);
+ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL);
+#endif
rc = sqlite3_step(priv->stmt_arm_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
--
2.27.0

View File

@ -1,60 +0,0 @@
From 6b767a2fce615384f062ecb392cd332452bf4482 Mon Sep 17 00:00:00 2001
From: Lostwayzxc <luoshengwei@huawei.com>
Date: Wed, 1 Sep 2021 21:00:16 +0800
Subject: [PATCH] modify cpu parse for adapting to new bios version
---
ras-cpu-isolation.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
index 6dcff70..b1643c4 100644
--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
@@ -25,6 +25,7 @@
static struct cpu_info *cpu_infos = NULL;
static unsigned int ncores, cores_per_socket, cores_per_die;
+static unsigned int cores_per_cluster = 4;
static unsigned int sockets, dies = 1;
static unsigned int enabled = 1;
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
@@ -432,18 +433,33 @@ static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size
static unsigned get_cpu_index(int64_t mpidr)
{
- unsigned core_id, socket_id, die_id, cpu;
+ unsigned core_id, cluster_id, socket_id, die_id, cpu;
/*
* Adapt to certain BIOS
* In the MPIDR:
* bit 8:15: core id
+ * bit 16:18: cluster id
* bit 19:20: die_id
* bit 21:22: socket_id
*/
core_id = get_bit_value(mpidr, 8, 8);
+ cluster_id = get_bit_value(mpidr, 16, 3);
socket_id = get_bit_value(mpidr, 21, 2);
die_id = get_bit_value(mpidr, 19, 2);
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die;
+
+ /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
+ * it means TotemB. When cores per die equal to cores per socket, it means
+ * that there is only one die in the socket, in case that the only die is
+ * TotemB in CPU 1620s, we set die id to 0 directly.
+ */
+ if (cores_per_die == cores_per_socket) {
+ die_id = 0;
+ }
+ else {
+ die_id = (die_id == 1 ? 0:1);
+ }
+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
+ cluster_id * cores_per_cluster;
return cpu;
}
--
2.27.0

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.7
Release: 1
Release: 4
License: GPLv2
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
URL: https://github.com/mchehab/rasdaemon.git
@ -23,13 +23,18 @@ Patch1: bugfix-rasdaemon-wait-for-file-access.patch
Patch2: bugfix-fix-fd-check.patch
Patch3: bugfix-fix-disk-error-log-storm.patch
Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch
Patch5: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
Patch6: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
Patch7: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
Patch8: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
Patch9: 0006-add-cpu-online-fault-isolation.patch
Patch10: 0007-add-trace-print-and-add-sqlite-store.patch
Patch11: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
Patch5: 0001-Support-cpu-fault-isolation-for-corrected-errors.patch
Patch6: 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch
Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
Patch11: 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
Patch12: 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch
Patch13: 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch
Patch14: 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch
Patch15: 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
Patch16: 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
%description
The rasdaemon program is a daemon which monitors the platform
@ -75,41 +80,43 @@ rm INSTALL %{buildroot}/usr/include/*.h
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
%changelog
* Mon Jan 17 2022 xujing<xujing99@huawei.com> - 0.6.7-1
- DESC: Update software to v0.6.7
* Thu Dec 9 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-10
* Mon Mar 07 2022 Shiju Jose<shiju.jose@huawei.com> - 0.6.7-4
- Type:feature
- ID:NA
- SUG:NA
- DESC: Enable compilation of the feature memory fault prediction based on
- DESC:
1. Modify the recording of HiSilicon common error data in the rasdaemon.
2. In the ras-mc-ctl:
2.1. Improve Hisilicon common error statistics.
2.2. Add support to display the HiSilicon vendor-errors for a specified module.
2.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options.
2.4. Reformat error info of the HiSilicon Kunpeng920.
2.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
* Wed Mar 2 2022 tanxiaofei<tanxiaofei@huawei.com> - 0.6.7-3
- Type:bugfix
- ID:NA
- SUG:NA
- DESC:
1. Backport 4 patches from openEuler master branch.
1) Fix the issue of sprintf data type mismatch in uuid_le()
2) Fix the issue of command option -r for hip08
3) Fix some print format issues for hisi common error section
4) Add some modules supported by hisi common error section
2. Enable compilation of the feature memory fault prediction based on
corrected error.
3. Fix changelog date error of this spec file.
* Thu Dec 2 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-9
* Wed Feb 23 2022 luoshengwei<luoshengwei@huawei.com> - 0.6.7-2
- Type:feature
- ID:NA
- SUG:NA
- DESC: Backport memory failure feature, one patch.
- DESC: Add cpu online fault isolation for arm event.
* Wed Oct 27 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-8
- Type:feature
- ID:NA
- SUG:NA
- DESC: Sync three patches, add cpu online fault isolation.
* Wed Dec 8 2021 xujing <xujing99@huawei.com> - 0.6.7-1
- Update software to v0.6.7
* Wed Oct 20 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-7
- Type:Bugfix
- ID:NA
- SUG:NA
- DESC: Backport one patch, apply some minor fixes, and add support for some
modules of the Kunpeng series:
1. Modify non-standard error decoding interface using linked list
2. Fix the issue of sprintf data type mismatch in uuid_le()
3. Fix the issue of command option -r for hip08
4. Fix some print format issues for hisi common error section
5. Add some modules supported by hisi common error section
* Sat July 29 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-6
* Thu Jul 29 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-6
- Type:feature
- ID:NA
- SUG:NA