!52 22.03为最新代码,master上补丁回合有问题,因此进行同步
From: @Lostwayzxc Reviewed-by: @overweight Signed-off-by: @overweight
This commit is contained in:
commit
04297083dc
906
0001-Support-cpu-fault-isolation-for-corrected-errors.patch
Normal file
906
0001-Support-cpu-fault-isolation-for-corrected-errors.patch
Normal file
@ -0,0 +1,906 @@
|
||||
From a8e02e7d3d910eb7d049fd4126d53b8d3121d798 Mon Sep 17 00:00:00 2001
|
||||
From: Shengwei Luo <luoshengwei@huawei.com>
|
||||
Date: Wed, 23 Feb 2022 17:21:58 +0800
|
||||
Subject: [PATCH 1/2] Support cpu fault isolation for corrected errors
|
||||
|
||||
When the number of corrected errors exceeds the set limit within a cycle, try to
|
||||
offline the related cpu core.
|
||||
|
||||
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
|
||||
---
|
||||
Makefile.am | 6 +-
|
||||
configure.ac | 11 ++
|
||||
misc/rasdaemon.env | 17 ++
|
||||
queue.c | 121 ++++++++++++++
|
||||
queue.h | 39 +++++
|
||||
ras-arm-handler.c | 84 ++++++++++
|
||||
ras-arm-handler.h | 18 +++
|
||||
ras-cpu-isolation.c | 378 ++++++++++++++++++++++++++++++++++++++++++++
|
||||
ras-cpu-isolation.h | 68 ++++++++
|
||||
ras-events.c | 9 +-
|
||||
10 files changed, 749 insertions(+), 2 deletions(-)
|
||||
create mode 100644 queue.c
|
||||
create mode 100644 queue.h
|
||||
create mode 100644 ras-cpu-isolation.c
|
||||
create mode 100644 ras-cpu-isolation.h
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index fabca78..242ceb7 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -63,13 +63,17 @@ endif
|
||||
if WITH_AMP_NS_DECODE
|
||||
rasdaemon_SOURCES += non-standard-ampere.c
|
||||
endif
|
||||
+if WITH_CPU_FAULT_ISOLATION
|
||||
+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
|
||||
+endif
|
||||
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
|
||||
|
||||
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
|
||||
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
|
||||
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
|
||||
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
|
||||
- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h
|
||||
+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
|
||||
+ ras-cpu-isolation.h queue.h
|
||||
|
||||
# This rule can't be called with more than one Makefile job (like make -j8)
|
||||
# I can't figure out a way to fix that
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 33b81fe..d098fcf 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"],
|
||||
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
|
||||
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
|
||||
|
||||
+AC_ARG_ENABLE([cpu_fault_isolation],
|
||||
+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation]))
|
||||
+
|
||||
+AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [
|
||||
+ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation")
|
||||
+ AC_SUBST([WITH_CPU_FAULT_ISOLATION])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
|
||||
+AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
|
||||
+
|
||||
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
|
||||
|
||||
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
|
||||
@@ -201,4 +211,5 @@ compile time options summary
|
||||
Memory Failure : $USE_MEMORY_FAILURE
|
||||
Memory CE PFA : $USE_MEMORY_CE_PFA
|
||||
AMP RAS errors : $USE_AMP_NS_DECODE
|
||||
+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION
|
||||
EOF
|
||||
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
|
||||
index 12fd766..7cb18e8 100644
|
||||
--- a/misc/rasdaemon.env
|
||||
+++ b/misc/rasdaemon.env
|
||||
@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50"
|
||||
# soft-then-hard First try to soft offline, then try hard offlining.
|
||||
# Note: default offline choice is "soft".
|
||||
PAGE_CE_ACTION="soft"
|
||||
+
|
||||
+# CPU Online Fault Isolation
|
||||
+# Whether to enable cpu online fault isolation (yes|no).
|
||||
+CPU_ISOLATION_ENABLE="no"
|
||||
+# Specify the threshold of CE numbers.
|
||||
+#
|
||||
+# Format:
|
||||
+# [0-9]+[unit]
|
||||
+#
|
||||
+# Supported units:
|
||||
+# CPU_CE_THRESHOLD: no unit
|
||||
+# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second
|
||||
+CPU_CE_THRESHOLD="18"
|
||||
+CPU_ISOLATION_CYCLE="24h"
|
||||
+
|
||||
+# Prevent excessive isolation from causing an avalanche effect
|
||||
+CPU_ISOLATION_LIMIT="10"
|
||||
\ No newline at end of file
|
||||
diff --git a/queue.c b/queue.c
|
||||
new file mode 100644
|
||||
index 0000000..ed66798
|
||||
--- /dev/null
|
||||
+++ b/queue.c
|
||||
@@ -0,0 +1,121 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include "queue.h"
|
||||
+#include "ras-logger.h"
|
||||
+
|
||||
+int is_empty(struct link_queue *queue)
|
||||
+{
|
||||
+ if (queue)
|
||||
+ return queue->size == 0;
|
||||
+
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+struct link_queue *init_queue(void)
|
||||
+{
|
||||
+ struct link_queue *queue = NULL;
|
||||
+
|
||||
+ queue = (struct link_queue *)malloc(sizeof(struct link_queue));
|
||||
+
|
||||
+ if (queue == NULL) {
|
||||
+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n");
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ queue->size = 0;
|
||||
+ queue->head = NULL;
|
||||
+ queue->tail = NULL;
|
||||
+
|
||||
+ return queue;
|
||||
+}
|
||||
+
|
||||
+void clear_queue(struct link_queue *queue)
|
||||
+{
|
||||
+ if (queue == NULL)
|
||||
+ return;
|
||||
+
|
||||
+ struct queue_node *node = queue->head;
|
||||
+ struct queue_node *tmp = NULL;
|
||||
+
|
||||
+ while (node != NULL) {
|
||||
+ tmp = node;
|
||||
+ node = node->next;
|
||||
+ free(tmp);
|
||||
+ }
|
||||
+
|
||||
+ queue->head = NULL;
|
||||
+ queue->tail = NULL;
|
||||
+ queue->size = 0;
|
||||
+}
|
||||
+
|
||||
+void free_queue(struct link_queue *queue)
|
||||
+{
|
||||
+ clear_queue(queue);
|
||||
+
|
||||
+ if (queue)
|
||||
+ free(queue);
|
||||
+}
|
||||
+
|
||||
+/* It should be guaranteed that the param is not NULL */
|
||||
+void push(struct link_queue *queue, struct queue_node *node)
|
||||
+{
|
||||
+ /* there is no element in the queue */
|
||||
+ if (queue->head == NULL)
|
||||
+ queue->head = node;
|
||||
+ else
|
||||
+ queue->tail->next = node;
|
||||
+
|
||||
+ queue->tail = node;
|
||||
+ (queue->size)++;
|
||||
+}
|
||||
+
|
||||
+int pop(struct link_queue *queue)
|
||||
+{
|
||||
+ struct queue_node *tmp = NULL;
|
||||
+
|
||||
+ if (queue == NULL || is_empty(queue))
|
||||
+ return -1;
|
||||
+
|
||||
+ tmp = queue->head;
|
||||
+ queue->head = queue->head->next;
|
||||
+ free(tmp);
|
||||
+ (queue->size)--;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+struct queue_node *front(struct link_queue *queue)
|
||||
+{
|
||||
+ if (queue == NULL)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return queue->head;
|
||||
+}
|
||||
+
|
||||
+struct queue_node *node_create(time_t time, unsigned int value)
|
||||
+{
|
||||
+ struct queue_node *node = NULL;
|
||||
+
|
||||
+ node = (struct queue_node *)malloc(sizeof(struct queue_node));
|
||||
+
|
||||
+ if (node != NULL) {
|
||||
+ node->time = time;
|
||||
+ node->value = value;
|
||||
+ node->next = NULL;
|
||||
+ }
|
||||
+
|
||||
+ return node;
|
||||
+}
|
||||
diff --git a/queue.h b/queue.h
|
||||
new file mode 100644
|
||||
index 0000000..5459f40
|
||||
--- /dev/null
|
||||
+++ b/queue.h
|
||||
@@ -0,0 +1,39 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#ifndef __RAS_QUEUE_H
|
||||
+#define __RAS_QUEUE_H
|
||||
+
|
||||
+struct queue_node {
|
||||
+ time_t time;
|
||||
+ unsigned int value;
|
||||
+ struct queue_node *next;
|
||||
+};
|
||||
+
|
||||
+struct link_queue {
|
||||
+ struct queue_node *head;
|
||||
+ struct queue_node *tail;
|
||||
+ int size;
|
||||
+};
|
||||
+
|
||||
+int is_empty(struct link_queue *queue);
|
||||
+struct link_queue *init_queue(void);
|
||||
+void clear_queue(struct link_queue *queue);
|
||||
+void free_queue(struct link_queue *queue);
|
||||
+void push(struct link_queue *queue, struct queue_node *node);
|
||||
+int pop(struct link_queue *queue);
|
||||
+struct queue_node *front(struct link_queue *queue);
|
||||
+struct queue_node *node_create(time_t time, unsigned int value);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 1149dc6..c9ef2fd 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -22,6 +22,10 @@
|
||||
#include "ras-report.h"
|
||||
#include "ras-non-standard-handler.h"
|
||||
#include "non-standard-ampere.h"
|
||||
+#include "ras-cpu-isolation.h"
|
||||
+
|
||||
+#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
|
||||
+#define ARM_ERR_VALID_FLAGS BIT(1)
|
||||
|
||||
void display_raw_data(struct trace_seq *s,
|
||||
const uint8_t *buf,
|
||||
@@ -42,6 +46,44 @@ void display_raw_data(struct trace_seq *s,
|
||||
}
|
||||
}
|
||||
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+static int count_errors(struct ras_arm_event *ev)
|
||||
+{
|
||||
+ struct ras_arm_err_info *err_info;
|
||||
+ int num_pei;
|
||||
+ int err_info_size = sizeof(struct ras_arm_err_info);
|
||||
+ int num = 0;
|
||||
+ int i;
|
||||
+ int error_count;
|
||||
+
|
||||
+ if (ev->pei_len % err_info_size != 0) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "The event data does not match to the ARM Processor Error Information Structure\n");
|
||||
+ return num;
|
||||
+ }
|
||||
+ num_pei = ev->pei_len / err_info_size;
|
||||
+ err_info = (struct ras_arm_err_info *)(ev->pei_error);
|
||||
+
|
||||
+ for (i = 0; i < num_pei; ++i) {
|
||||
+ error_count = 1;
|
||||
+ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) {
|
||||
+ /*
|
||||
+ * The value of this field is defined as follows:
|
||||
+ * 0: Single Error
|
||||
+ * 1: Multiple Errors
|
||||
+ * 2-65535: Error Count
|
||||
+ */
|
||||
+ error_count = err_info->multiple_error + 1;
|
||||
+ }
|
||||
+
|
||||
+ num += error_count;
|
||||
+ err_info += 1;
|
||||
+ }
|
||||
+ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num);
|
||||
+ return num;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
int ras_arm_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context)
|
||||
@@ -139,6 +181,48 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
display_raw_data(s, ev.vsei_error, ev.oem_len);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ int cpu;
|
||||
+ int nums;
|
||||
+ char *severity;
|
||||
+ struct error_info err_info;
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ cpu = val;
|
||||
+ trace_seq_printf(s, "\n cpu: %d", cpu);
|
||||
+
|
||||
+ /* record cpu error */
|
||||
+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */
|
||||
+ switch (val) {
|
||||
+ case GHES_SEV_NO:
|
||||
+ severity = "Informational";
|
||||
+ break;
|
||||
+ case GHES_SEV_CORRECTED:
|
||||
+ severity = "Corrected";
|
||||
+ break;
|
||||
+ case GHES_SEV_RECOVERABLE:
|
||||
+ severity = "Recoverable";
|
||||
+ break;
|
||||
+ default:
|
||||
+ case GHES_SEV_PANIC:
|
||||
+ severity = "Fatal";
|
||||
+ }
|
||||
+ trace_seq_printf(s, "\n severity: %s", severity);
|
||||
+
|
||||
+ if (val == GHES_SEV_CORRECTED) {
|
||||
+ nums = count_errors(&ev);
|
||||
+ if (nums > 0) {
|
||||
+ err_info.nums = nums;
|
||||
+ err_info.time = now;
|
||||
+ err_info.err_type = val;
|
||||
+ ras_record_cpu_error(&err_info, cpu);
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
/* Insert data into the SGBD */
|
||||
#ifdef HAVE_SQLITE3
|
||||
ras_store_arm_record(ras, &ev);
|
||||
diff --git a/ras-arm-handler.h b/ras-arm-handler.h
|
||||
index 563a2d3..52813e7 100644
|
||||
--- a/ras-arm-handler.h
|
||||
+++ b/ras-arm-handler.h
|
||||
@@ -17,6 +17,24 @@
|
||||
#include "ras-events.h"
|
||||
#include "libtrace/event-parse.h"
|
||||
|
||||
+/*
|
||||
+ * ARM Processor Error Information Structure, According to
|
||||
+ * UEFI_2_9 specification chapter N2.4.4.
|
||||
+ */
|
||||
+#pragma pack(1)
|
||||
+struct ras_arm_err_info {
|
||||
+ uint8_t version;
|
||||
+ uint8_t length;
|
||||
+ uint16_t validation_bits;
|
||||
+ uint8_t type;
|
||||
+ uint16_t multiple_error;
|
||||
+ uint8_t flags;
|
||||
+ uint64_t error_info;
|
||||
+ uint64_t virt_fault_addr;
|
||||
+ uint64_t physical_fault_addr;
|
||||
+};
|
||||
+#pragma pack()
|
||||
+
|
||||
int ras_arm_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context);
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
new file mode 100644
|
||||
index 0000000..8c0cdf9
|
||||
--- /dev/null
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -0,0 +1,378 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <fcntl.h>
|
||||
+#include <errno.h>
|
||||
+#include <unistd.h>
|
||||
+#include <limits.h>
|
||||
+#include <ctype.h>
|
||||
+#include "ras-logger.h"
|
||||
+#include "ras-cpu-isolation.h"
|
||||
+
|
||||
+static struct cpu_info *cpu_infos;
|
||||
+static unsigned int ncores;
|
||||
+static unsigned int enabled = 1;
|
||||
+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
||||
+
|
||||
+static const struct param normal_units[] = {
|
||||
+ {"", 1},
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+static const struct param cycle_units[] = {
|
||||
+ {"d", 24 * 60 * 60},
|
||||
+ {"h", 60 * 60},
|
||||
+ {"m", 60},
|
||||
+ {"s", 1},
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+static struct isolation_param threshold = {
|
||||
+ .name = "CPU_CE_THRESHOLD",
|
||||
+ .units = normal_units,
|
||||
+ .value = 18,
|
||||
+ .limit = 10000
|
||||
+};
|
||||
+
|
||||
+static struct isolation_param cpu_limit = {
|
||||
+ .name = "CPU_ISOLATION_LIMIT",
|
||||
+ .units = normal_units
|
||||
+};
|
||||
+
|
||||
+static struct isolation_param cycle = {
|
||||
+ .name = "CPU_ISOLATION_CYCLE",
|
||||
+ .units = cycle_units,
|
||||
+ .value = 24 * 60 * 60,
|
||||
+ .limit = 30 * 24 * 60 * 60
|
||||
+};
|
||||
+
|
||||
+static const char * const cpu_state[] = {
|
||||
+ [CPU_OFFLINE] = "offline",
|
||||
+ [CPU_ONLINE] = "online",
|
||||
+ [CPU_OFFLINE_FAILED] = "offline-failed",
|
||||
+ [CPU_UNKNOWN] = "unknown"
|
||||
+};
|
||||
+
|
||||
+static int open_sys_file(unsigned int cpu, int __oflag, const char *format)
|
||||
+{
|
||||
+ int fd;
|
||||
+ char buf[MAX_PATH_LEN] = "";
|
||||
+
|
||||
+ snprintf(buf, sizeof(buf), format, cpu);
|
||||
+ fd = open(buf, __oflag);
|
||||
+
|
||||
+ if (fd == -1) {
|
||||
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return fd;
|
||||
+}
|
||||
+
|
||||
+static int get_cpu_status(unsigned int cpu)
|
||||
+{
|
||||
+ int fd, num;
|
||||
+ char buf[2] = "";
|
||||
+
|
||||
+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format);
|
||||
+ if (fd == -1)
|
||||
+ return CPU_UNKNOWN;
|
||||
+
|
||||
+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1)
|
||||
+ num = CPU_UNKNOWN;
|
||||
+
|
||||
+ close(fd);
|
||||
+
|
||||
+ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num;
|
||||
+}
|
||||
+
|
||||
+static int init_cpu_info(unsigned int cpus)
|
||||
+{
|
||||
+ ncores = cpus;
|
||||
+ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus);
|
||||
+
|
||||
+ if (!cpu_infos) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to allocate memory for cpu infos in %s.\n", __func__);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ for (unsigned int i = 0; i < cpus; ++i) {
|
||||
+ cpu_infos[i].ce_nums = 0;
|
||||
+ cpu_infos[i].state = get_cpu_status(i);
|
||||
+ cpu_infos[i].ce_queue = init_queue();
|
||||
+
|
||||
+ if (cpu_infos[i].ce_queue == NULL) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to allocate memory for cpu ce queue in %s.\n", __func__);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+ /* set the upper bound of the offlined-cpu limit according to the number of cpus */
|
||||
+ cpu_limit.limit = cpus - 1;
|
||||
+ cpu_limit.value = 0;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void check_config(struct isolation_param *config)
|
||||
+{
|
||||
+ if (config->value > config->limit) {
|
||||
+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n",
|
||||
+ config->value, config->limit);
|
||||
+ config->value = config->limit;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value)
|
||||
+{
|
||||
+ char *unit = NULL;
|
||||
+ int env_size, has_unit = 0;
|
||||
+
|
||||
+ if (!env || strlen(env) == 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ env_size = strlen(env);
|
||||
+ unit = env + env_size - 1;
|
||||
+
|
||||
+ if (isalpha(*unit)) {
|
||||
+ has_unit = 1;
|
||||
+ env_size--;
|
||||
+ if (env_size <= 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < env_size; ++i) {
|
||||
+ if (isdigit(env[i])) {
|
||||
+ if (*value > ULONG_MAX / 10 ||
|
||||
+ (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) {
|
||||
+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ *value = 10 * (*value) + (env[i] - '0');
|
||||
+ } else
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ if (has_unit) {
|
||||
+ for (const struct param *units = config->units; units->name; units++) {
|
||||
+ /* value character and unit character are both valid */
|
||||
+ if (!strcasecmp(unit, units->name)) {
|
||||
+ if (*value > (ULONG_MAX / units->value)) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "%s is out of range: %lu\n", env, ULONG_MAX);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ *value = (*value) * units->value;
|
||||
+ return 0;
|
||||
+ }
|
||||
+ }
|
||||
+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void init_config(struct isolation_param *config)
|
||||
+{
|
||||
+ char *env = getenv(config->name);
|
||||
+ unsigned long value = 0;
|
||||
+
|
||||
+ if (parse_ul_config(config, env, &value) < 0) {
|
||||
+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n",
|
||||
+ config->name, env, config->value);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ config->value = value;
|
||||
+ check_config(config);
|
||||
+}
|
||||
+
|
||||
+static int check_config_status(void)
|
||||
+{
|
||||
+ char *env = getenv("CPU_ISOLATION_ENABLE");
|
||||
+
|
||||
+ if (env == NULL || strcasecmp(env, "yes"))
|
||||
+ return -1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+void ras_cpu_isolation_init(unsigned int cpus)
|
||||
+{
|
||||
+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) {
|
||||
+ enabled = 0;
|
||||
+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n");
|
||||
+ init_config(&threshold);
|
||||
+ init_config(&cpu_limit);
|
||||
+ init_config(&cycle);
|
||||
+}
|
||||
+
|
||||
+void cpu_infos_free(void)
|
||||
+{
|
||||
+ if (cpu_infos) {
|
||||
+ for (int i = 0; i < ncores; ++i)
|
||||
+ free_queue(cpu_infos[i].ce_queue);
|
||||
+
|
||||
+ free(cpu_infos);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int do_cpu_offline(unsigned int cpu)
|
||||
+{
|
||||
+ int fd, rc;
|
||||
+ char buf[2] = "";
|
||||
+
|
||||
+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED;
|
||||
+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format);
|
||||
+ if (fd == -1)
|
||||
+ return HANDLE_FAILED;
|
||||
+
|
||||
+ strcpy(buf, "0");
|
||||
+ rc = write(fd, buf, strlen(buf));
|
||||
+
|
||||
+ if (rc < 0) {
|
||||
+ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno);
|
||||
+ close(fd);
|
||||
+ return HANDLE_FAILED;
|
||||
+ }
|
||||
+
|
||||
+ close(fd);
|
||||
+ /* check whether the cpu is isolated successfully */
|
||||
+ cpu_infos[cpu].state = get_cpu_status(cpu);
|
||||
+
|
||||
+ if (cpu_infos[cpu].state == CPU_OFFLINE)
|
||||
+ return HANDLE_SUCCEED;
|
||||
+
|
||||
+ return HANDLE_FAILED;
|
||||
+}
|
||||
+
|
||||
+static int do_ce_handler(unsigned int cpu)
|
||||
+{
|
||||
+ struct link_queue *queue = cpu_infos[cpu].ce_queue;
|
||||
+ unsigned int tmp;
|
||||
+ /*
|
||||
+ * Since we just count all error numbers in the configured cycle, we store the time
|
||||
+ * and error numbers from current event to the queue, then everytime we
|
||||
+ * calculate the period from beginning time to ending time, if the period
|
||||
+ * exceeds the configured cycle, we pop the beginning time and error until the period
|
||||
+ * from new beginning time to ending time is less than cycle.
|
||||
+ */
|
||||
+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) {
|
||||
+ tmp = queue->head->value;
|
||||
+ if (pop(queue) == 0)
|
||||
+ cpu_infos[cpu].ce_nums -= tmp;
|
||||
+ }
|
||||
+ log(TERM, LOG_INFO,
|
||||
+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n",
|
||||
+ cpu, cpu_infos[cpu].ce_nums);
|
||||
+
|
||||
+ if (cpu_infos[cpu].ce_nums >= threshold.value) {
|
||||
+ log(TERM, LOG_INFO,
|
||||
+ "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n",
|
||||
+ threshold.value, cpu);
|
||||
+ return do_cpu_offline(cpu);
|
||||
+ }
|
||||
+ return HANDLE_NOTHING;
|
||||
+}
|
||||
+
|
||||
+static int error_handler(unsigned int cpu, struct error_info *err_info)
|
||||
+{
|
||||
+ int ret = HANDLE_NOTHING;
|
||||
+
|
||||
+ switch (err_info->err_type) {
|
||||
+ case CE:
|
||||
+ ret = do_ce_handler(cpu);
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static void record_error_info(unsigned int cpu, struct error_info *err_info)
|
||||
+{
|
||||
+ switch (err_info->err_type) {
|
||||
+ case CE:
|
||||
+ {
|
||||
+ struct queue_node *node = node_create(err_info->time, err_info->nums);
|
||||
+
|
||||
+ if (node == NULL) {
|
||||
+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n");
|
||||
+ return;
|
||||
+ }
|
||||
+ push(cpu_infos[cpu].ce_queue, node);
|
||||
+ cpu_infos[cpu].ce_nums += err_info->nums;
|
||||
+ break;
|
||||
+ }
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
+{
|
||||
+ int ret;
|
||||
+
|
||||
+ if (enabled == 0)
|
||||
+ return;
|
||||
+
|
||||
+ if (cpu >= ncores || cpu < 0) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu);
|
||||
+ cpu_infos[cpu].state = get_cpu_status(cpu);
|
||||
+
|
||||
+ if (cpu_infos[cpu].state != CPU_ONLINE) {
|
||||
+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ record_error_info(cpu, err_info);
|
||||
+ /*
|
||||
+ * Since user may change cpu state, we get current offlined
|
||||
+ * cpu numbers every recording time.
|
||||
+ */
|
||||
+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) {
|
||||
+ log(TERM, LOG_WARNING,
|
||||
+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n",
|
||||
+ cpu_limit.value);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ ret = error_handler(cpu, err_info);
|
||||
+
|
||||
+ if (ret == HANDLE_NOTHING)
|
||||
+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu);
|
||||
+ else if (ret == HANDLE_SUCCEED) {
|
||||
+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n",
|
||||
+ cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
+ clear_queue(cpu_infos[cpu].ce_queue);
|
||||
+ cpu_infos[cpu].ce_nums = 0;
|
||||
+ } else
|
||||
+ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
|
||||
+ cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
+}
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
new file mode 100644
|
||||
index 0000000..1159853
|
||||
--- /dev/null
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -0,0 +1,68 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#ifndef __RAS_CPU_ISOLATION_H
|
||||
+#define __RAS_CPU_ISOLATION_H
|
||||
+
|
||||
+#include "queue.h"
|
||||
+
|
||||
+#define MAX_PATH_LEN 100
|
||||
+#define MAX_BUF_LEN 1024
|
||||
+
|
||||
+struct param {
|
||||
+ char *name;
|
||||
+ unsigned long value;
|
||||
+};
|
||||
+
|
||||
+struct isolation_param {
|
||||
+ char *name;
|
||||
+ const struct param *units;
|
||||
+ unsigned long value;
|
||||
+ unsigned long limit;
|
||||
+};
|
||||
+
|
||||
+enum cpu_state {
|
||||
+ CPU_OFFLINE,
|
||||
+ CPU_ONLINE,
|
||||
+ CPU_OFFLINE_FAILED,
|
||||
+ CPU_UNKNOWN,
|
||||
+};
|
||||
+
|
||||
+enum error_handle_result {
|
||||
+ HANDLE_FAILED = -1,
|
||||
+ HANDLE_SUCCEED,
|
||||
+ HANDLE_NOTHING,
|
||||
+};
|
||||
+
|
||||
+enum error_type {
|
||||
+ CE = 1
|
||||
+};
|
||||
+
|
||||
+struct cpu_info {
|
||||
+ unsigned long ce_nums;
|
||||
+ struct link_queue *ce_queue;
|
||||
+ enum cpu_state state;
|
||||
+};
|
||||
+
|
||||
+struct error_info {
|
||||
+ unsigned long nums;
|
||||
+ time_t time;
|
||||
+ enum error_type err_type;
|
||||
+};
|
||||
+
|
||||
+void ras_cpu_isolation_init(unsigned int cpus);
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
|
||||
+void cpu_infos_free(void);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index ba769d1..491c17a 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
#include "ras-page-isolation.h"
|
||||
+#include "ras-cpu-isolation.h"
|
||||
|
||||
/*
|
||||
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
|
||||
@@ -879,6 +880,10 @@ int handle_ras_events(int record_events)
|
||||
|
||||
cpus = get_num_cpus(ras);
|
||||
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ ras_cpu_isolation_init(cpus);
|
||||
+#endif
|
||||
+
|
||||
#ifdef HAVE_MCE
|
||||
rc = register_mce_handler(ras, cpus);
|
||||
if (rc)
|
||||
@@ -1005,6 +1010,8 @@ err:
|
||||
}
|
||||
free(ras);
|
||||
}
|
||||
-
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ cpu_infos_free();
|
||||
+#endif
|
||||
return rc;
|
||||
}
|
||||
--
|
||||
2.27.0
|
||||
|
||||
224
0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
Normal file
224
0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
Normal file
@ -0,0 +1,224 @@
|
||||
From 62218a9c3aec44330ce3b77f3634c788b6e6f60c Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Wed, 2 Mar 2022 12:20:40 +0000
|
||||
Subject: [PATCH 1/6] rasdaemon: Modify recording Hisilicon common error data
|
||||
|
||||
The error statistics for the Hisilicon common
|
||||
error need to be computed based on module, error severity, etc.
|
||||
|
||||
Modify recording Hisilicon common error data as separate fields
|
||||
in the sql db table instead of the combined single field.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
non-standard-hisilicon.c | 122 ++++++++++++++++++++++++++++++++-------
|
||||
1 file changed, 102 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 1432163..dc69d46 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "non-standard-hisilicon.h"
|
||||
|
||||
#define HISI_BUF_LEN 2048
|
||||
+#define HISI_PCIE_INFO_BUF_LEN 256
|
||||
|
||||
struct hisi_common_error_section {
|
||||
uint32_t val_bits;
|
||||
@@ -63,12 +64,25 @@ enum {
|
||||
enum {
|
||||
HISI_COMMON_FIELD_ID,
|
||||
HISI_COMMON_FIELD_TIMESTAMP,
|
||||
- HISI_COMMON_FIELD_ERR_INFO,
|
||||
+ HISI_COMMON_FIELD_VERSION,
|
||||
+ HISI_COMMON_FIELD_SOC_ID,
|
||||
+ HISI_COMMON_FIELD_SOCKET_ID,
|
||||
+ HISI_COMMON_FIELD_TOTEM_ID,
|
||||
+ HISI_COMMON_FIELD_NIMBUS_ID,
|
||||
+ HISI_COMMON_FIELD_SUB_SYSTEM_ID,
|
||||
+ HISI_COMMON_FIELD_MODULE_ID,
|
||||
+ HISI_COMMON_FIELD_SUB_MODULE_ID,
|
||||
+ HISI_COMMON_FIELD_CORE_ID,
|
||||
+ HISI_COMMON_FIELD_PORT_ID,
|
||||
+ HISI_COMMON_FIELD_ERR_TYPE,
|
||||
+ HISI_COMMON_FIELD_PCIE_INFO,
|
||||
+ HISI_COMMON_FIELD_ERR_SEVERITY,
|
||||
HISI_COMMON_FIELD_REGS_DUMP,
|
||||
};
|
||||
|
||||
struct hisi_event {
|
||||
char error_msg[HISI_BUF_LEN];
|
||||
+ char pcie_info[HISI_PCIE_INFO_BUF_LEN];
|
||||
char reg_msg[HISI_BUF_LEN];
|
||||
};
|
||||
|
||||
@@ -134,12 +148,24 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
|
||||
static const struct db_fields hisi_common_section_fields[] = {
|
||||
{ .name = "id", .type = "INTEGER PRIMARY KEY" },
|
||||
{ .name = "timestamp", .type = "TEXT" },
|
||||
- { .name = "err_info", .type = "TEXT" },
|
||||
+ { .name = "version", .type = "INTEGER" },
|
||||
+ { .name = "soc_id", .type = "INTEGER" },
|
||||
+ { .name = "socket_id", .type = "INTEGER" },
|
||||
+ { .name = "totem_id", .type = "INTEGER" },
|
||||
+ { .name = "nimbus_id", .type = "INTEGER" },
|
||||
+ { .name = "sub_system_id", .type = "INTEGER" },
|
||||
+ { .name = "module_id", .type = "TEXT" },
|
||||
+ { .name = "sub_module_id", .type = "INTEGER" },
|
||||
+ { .name = "core_id", .type = "INTEGER" },
|
||||
+ { .name = "port_id", .type = "INTEGER" },
|
||||
+ { .name = "err_type", .type = "INTEGER" },
|
||||
+ { .name = "pcie_info", .type = "TEXT" },
|
||||
+ { .name = "err_severity", .type = "TEXT" },
|
||||
{ .name = "regs_dump", .type = "TEXT" },
|
||||
};
|
||||
|
||||
static const struct db_table_descriptor hisi_common_section_tab = {
|
||||
- .name = "hisi_common_section",
|
||||
+ .name = "hisi_common_section_v2",
|
||||
.fields = hisi_common_section_fields,
|
||||
.num_fields = ARRAY_SIZE(hisi_common_section_fields),
|
||||
};
|
||||
@@ -199,12 +225,20 @@ static const char* get_soc_desc(uint8_t soc_id)
|
||||
return soc_desc[soc_id];
|
||||
}
|
||||
|
||||
-static void decode_module(struct hisi_event *event, uint8_t module_id)
|
||||
+static void decode_module(struct ras_ns_ev_decoder *ev_decoder,
|
||||
+ struct hisi_event *event, uint8_t module_id)
|
||||
{
|
||||
- if (module_id >= sizeof(module_name)/sizeof(char *))
|
||||
+ if (module_id >= sizeof(module_name)/sizeof(char *)) {
|
||||
HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id);
|
||||
- else
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_MODULE_ID,
|
||||
+ 0, "unknown");
|
||||
+ } else {
|
||||
HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_MODULE_ID,
|
||||
+ 0, module_name[module_id]);
|
||||
+ }
|
||||
}
|
||||
|
||||
static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
@@ -212,43 +246,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct hisi_event *event)
|
||||
{
|
||||
HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version);
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID))
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_VERSION,
|
||||
+ err->version, NULL);
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id));
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SOC_ID,
|
||||
+ err->soc_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SOCKET_ID,
|
||||
+ err->socket_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_TOTEM_ID,
|
||||
+ err->totem_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_NIMBUS_ID,
|
||||
+ err->nimbus_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SUB_SYSTEM_ID,
|
||||
+ err->subsystem_id, NULL);
|
||||
+ }
|
||||
|
||||
if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID))
|
||||
- decode_module(event, err->module_id);
|
||||
+ decode_module(ev_decoder, event, err->module_id);
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SUB_MODULE_ID,
|
||||
+ err->submodule_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_CORE_ID,
|
||||
+ err->core_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_PORT_ID,
|
||||
+ err->port_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) {
|
||||
HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_ERR_TYPE,
|
||||
+ err->err_type, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) {
|
||||
HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x",
|
||||
err->pcie_info.segment, err->pcie_info.bus,
|
||||
err->pcie_info.device, err->pcie_info.function);
|
||||
+ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x",
|
||||
+ err->pcie_info.segment, err->pcie_info.bus,
|
||||
+ err->pcie_info.device, err->pcie_info.function);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_PCIE_INFO,
|
||||
+ 0, event->pcie_info);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) {
|
||||
HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity));
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_ERR_SEVERITY,
|
||||
+ 0, err_severity(err->err_severity));
|
||||
+ }
|
||||
|
||||
HISI_SNPRINTF(event->error_msg, "]");
|
||||
}
|
||||
@@ -293,8 +377,6 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
|
||||
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
|
||||
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
|
||||
--
|
||||
2.25.1
|
||||
|
||||
138
0002-Support-cpu-fault-isolation-for-recoverable-errors.patch
Normal file
138
0002-Support-cpu-fault-isolation-for-recoverable-errors.patch
Normal file
@ -0,0 +1,138 @@
|
||||
From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001
|
||||
From: Shengwei Luo <luoshengwei@huawei.com>
|
||||
Date: Wed, 23 Feb 2022 17:23:27 +0800
|
||||
Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors
|
||||
|
||||
When recoverable errors occur in a cpu core, try to offline
|
||||
the related cpu core.
|
||||
|
||||
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
|
||||
---
|
||||
ras-arm-handler.c | 21 ++++++++++++++++++---
|
||||
ras-cpu-isolation.c | 17 +++++++++++++++++
|
||||
ras-cpu-isolation.h | 4 +++-
|
||||
3 files changed, 38 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index c9ef2fd..dae5ad6 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s,
|
||||
}
|
||||
|
||||
#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
-static int count_errors(struct ras_arm_event *ev)
|
||||
+static int is_core_failure(struct ras_arm_err_info *err_info)
|
||||
+{
|
||||
+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
|
||||
+ /*
|
||||
+ * core failure:
|
||||
+		 * Bit 0\1\3: (at least 1)
|
||||
+ * Bit 2: 0
|
||||
+ */
|
||||
+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2));
|
||||
+ }
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int count_errors(struct ras_arm_event *ev, int sev)
|
||||
{
|
||||
struct ras_arm_err_info *err_info;
|
||||
int num_pei;
|
||||
@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev)
|
||||
*/
|
||||
error_count = err_info->multiple_error + 1;
|
||||
}
|
||||
+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
|
||||
+ error_count = 0;
|
||||
|
||||
num += error_count;
|
||||
err_info += 1;
|
||||
@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
}
|
||||
trace_seq_printf(s, "\n severity: %s", severity);
|
||||
|
||||
- if (val == GHES_SEV_CORRECTED) {
|
||||
- nums = count_errors(&ev);
|
||||
+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
|
||||
+ nums = count_errors(&ev, val);
|
||||
if (nums > 0) {
|
||||
err_info.nums = nums;
|
||||
err_info.time = now;
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index 8c0cdf9..e650022 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus)
|
||||
|
||||
for (unsigned int i = 0; i < cpus; ++i) {
|
||||
cpu_infos[i].ce_nums = 0;
|
||||
+ cpu_infos[i].uce_nums = 0;
|
||||
cpu_infos[i].state = get_cpu_status(i);
|
||||
cpu_infos[i].ce_queue = init_queue();
|
||||
|
||||
@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu)
|
||||
return HANDLE_NOTHING;
|
||||
}
|
||||
|
||||
+static int do_uce_handler(unsigned int cpu)
|
||||
+{
|
||||
+ if (cpu_infos[cpu].uce_nums > 0) {
|
||||
+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu);
|
||||
+ return do_cpu_offline(cpu);
|
||||
+ }
|
||||
+ return HANDLE_NOTHING;
|
||||
+}
|
||||
+
|
||||
static int error_handler(unsigned int cpu, struct error_info *err_info)
|
||||
{
|
||||
int ret = HANDLE_NOTHING;
|
||||
@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
|
||||
case CE:
|
||||
ret = do_ce_handler(cpu);
|
||||
break;
|
||||
+ case UCE:
|
||||
+ ret = do_uce_handler(cpu);
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
|
||||
cpu_infos[cpu].ce_nums += err_info->nums;
|
||||
break;
|
||||
}
|
||||
+ case UCE:
|
||||
+ cpu_infos[cpu].uce_nums++;
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
clear_queue(cpu_infos[cpu].ce_queue);
|
||||
cpu_infos[cpu].ce_nums = 0;
|
||||
+ cpu_infos[cpu].uce_nums = 0;
|
||||
} else
|
||||
log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
|
||||
cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
index 1159853..024a68b 100644
|
||||
--- a/ras-cpu-isolation.h
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -46,10 +46,12 @@ enum error_handle_result {
|
||||
};
|
||||
|
||||
enum error_type {
|
||||
- CE = 1
|
||||
+ CE = 1,
|
||||
+ UCE
|
||||
};
|
||||
|
||||
struct cpu_info {
|
||||
+ unsigned long uce_nums;
|
||||
unsigned long ce_nums;
|
||||
struct link_queue *ce_queue;
|
||||
enum cpu_state state;
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -0,0 +1,97 @@
|
||||
From 4d9f297028ce3116eaf574b2570d71a4ed666b7d Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Thu, 24 Feb 2022 18:02:14 +0000
|
||||
Subject: [PATCH 2/6] rasdaemon: ras-mc-ctl: Modify error statistics for
|
||||
HiSilicon Kunpeng9xx common errors
|
||||
|
||||
Modify the error statistics for the HiSilicon Kunpeng9xx platforms common errors
|
||||
to display the statistics and error info based on the module and the error severity.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++-----------
|
||||
1 file changed, 29 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 1e3aeb7..22ba1fd 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1535,7 +1535,7 @@ sub vendor_errors_summary
|
||||
require DBI;
|
||||
my ($num_args, $platform_id);
|
||||
my ($query, $query_handle, $count, $out);
|
||||
- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info);
|
||||
+ my ($module_id, $sub_module_id, $err_severity, $err_sev);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
@@ -1612,13 +1612,18 @@ sub vendor_errors_summary
|
||||
|
||||
# HiSilicon Kunpeng9xx common errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
- $query = "select err_info, count(*) from hisi_common_section";
|
||||
+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($err_info, $count));
|
||||
+ $query_handle->bind_columns(\($err_severity, $module_id, $count));
|
||||
$out = "";
|
||||
+ $err_sev = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "\terrors: $count\n";
|
||||
+ if ($err_severity ne $err_sev) {
|
||||
+ $out .= "$err_severity errors:\n";
|
||||
+ $err_sev = $err_severity;
|
||||
+ }
|
||||
+ $out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
|
||||
@@ -1636,8 +1641,8 @@ sub vendor_errors
|
||||
require DBI;
|
||||
my ($num_args, $platform_id);
|
||||
my ($query, $query_handle, $id, $timestamp, $out);
|
||||
- my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id);
|
||||
- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs);
|
||||
+ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id);
|
||||
+ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
@@ -1725,15 +1730,28 @@ sub vendor_errors
|
||||
|
||||
# HiSilicon Kunpeng9xx common errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id";
|
||||
+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs));
|
||||
+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp ";
|
||||
- $out .= "Error Info:$err_info \n" if ($err_info);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "totem_id=$totem_id, " if ($totem_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "core_id=$core_id, " if ($core_id);
|
||||
+ $out .= "port_id=$port_id, " if ($port_id);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "pcie_info=$pcie_info, " if ($pcie_info);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs" if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
From eb93d77b417b58cba27799ae85747b8a193cf063 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 5 Mar 2022 16:18:55 +0000
|
||||
Subject: [PATCH 3/6] rasdaemon: ras-mc-ctl: Reformat error info of the
|
||||
HiSilicon Kunpeng920
|
||||
|
||||
Reformat the code to display the error info of HiSilicon Kunpeng920.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 15 +++++++++------
|
||||
1 file changed, 9 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 22ba1fd..eeaf885 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1669,8 +1669,9 @@ sub vendor_errors
|
||||
$out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
$out .= "module_id=$module_id, " if ($module_id);
|
||||
$out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, \n" if ($err_severity);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
|
||||
@@ -1692,8 +1693,9 @@ sub vendor_errors
|
||||
$out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
$out .= "module_id=$module_id, " if ($module_id);
|
||||
$out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, \n" if ($err_severity);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
|
||||
@@ -1717,8 +1719,9 @@ sub vendor_errors
|
||||
$out .= "core_id=$core_id, " if ($core_id);
|
||||
$out .= "port_id=$port_id, " if ($port_id);
|
||||
$out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "err_type=$err_type, \n" if ($err_type);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
From 623e85c07ab21ccc89ffe2bb444eb000a2664a9d Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 5 Mar 2022 17:01:35 +0000
|
||||
Subject: [PATCH 4/6] rasdaemon: ras-mc-ctl: Add printing usage if necessary
|
||||
parameters are not passed for the HiSilicon vendor-errors options
|
||||
|
||||
Print the usage message if the required parameters are not passed to the HiSilicon vendor-errors options of ras-mc-ctl.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index eeaf885..0e32cb1 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1542,6 +1542,7 @@ sub vendor_errors_summary
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
} else {
|
||||
+ usage(1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1649,6 +1650,7 @@ sub vendor_errors
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
} else {
|
||||
+ usage(1);
|
||||
return;
|
||||
}
|
||||
|
||||
--
|
||||
2.25.1
|
||||
|
||||
198
0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
Normal file
198
0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
Normal file
@ -0,0 +1,198 @@
|
||||
From 4007c95f8a8d570542ffc11676b619ea5649d0e7 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 5 Mar 2022 18:19:38 +0000
|
||||
Subject: [PATCH 5/6] rasdaemon: ras-mc-ctl: Add support to display the
|
||||
HiSilicon vendor errors for a specified module
|
||||
|
||||
Add support to display the HiSilicon vendor errors for a specified module.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 119 ++++++++++++++++++++++++---------------------
|
||||
1 file changed, 63 insertions(+), 56 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 0e32cb1..d728300 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -96,7 +96,8 @@ Usage: $prog [OPTIONS...]
|
||||
--errors Shows the errors stored at the error database.
|
||||
--error-count Shows the corrected and uncorrected error counts using sysfs.
|
||||
--vendor-errors-summary <platform-id> Presents a summary of the vendor-specific logged errors.
|
||||
- --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
|
||||
+ --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
|
||||
+ --vendor-errors <platform-id> <module-name> Shows the vendor-specific errors for a specific module stored in the error database.
|
||||
--vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors.
|
||||
--help This help message.
|
||||
EOF
|
||||
@@ -1640,15 +1641,19 @@ sub vendor_errors_summary
|
||||
sub vendor_errors
|
||||
{
|
||||
require DBI;
|
||||
- my ($num_args, $platform_id);
|
||||
+ my ($num_args, $platform_id, $module);
|
||||
my ($query, $query_handle, $id, $timestamp, $out);
|
||||
my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id);
|
||||
my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
+ $module = 0;
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
+ if ($num_args gt 1) {
|
||||
+ $module = $ARGV[1];
|
||||
+ }
|
||||
} else {
|
||||
usage(1);
|
||||
return;
|
||||
@@ -1664,21 +1669,21 @@ sub vendor_errors
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "module_id=$module_id, " if ($module_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "Error Registers: $regs " if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($module_id && ($module eq $module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type1 errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1688,21 +1693,21 @@ sub vendor_errors
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "module_id=$module_id, " if ($module_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "Error Registers: $regs " if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($module_id && ($module eq $module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type2 errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1712,23 +1717,23 @@ sub vendor_errors
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "core_id=$core_id, " if ($core_id);
|
||||
- $out .= "port_id=$port_id, " if ($port_id);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "err_type=$err_type, " if ($err_type);
|
||||
- $out .= "Error Registers: $regs " if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($sub_module_id && ($module eq $sub_module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "core_id=$core_id, " if ($core_id);
|
||||
+ $out .= "port_id=$port_id, " if ($port_id);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 PCIe controller errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1741,22 +1746,24 @@ sub vendor_errors
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "totem_id=$totem_id, " if ($totem_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
|
||||
- $out .= "module_id=$module_id, " if ($module_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "core_id=$core_id, " if ($core_id);
|
||||
- $out .= "port_id=$port_id, " if ($port_id);
|
||||
- $out .= "err_type=$err_type, " if ($err_type);
|
||||
- $out .= "pcie_info=$pcie_info, " if ($pcie_info);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "Error Registers: $regs" if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($module_id && ($module eq $module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "totem_id=$totem_id, " if ($totem_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "core_id=$core_id, " if ($core_id);
|
||||
+ $out .= "port_id=$port_id, " if ($port_id);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "pcie_info=$pcie_info, " if ($pcie_info);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs" if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
--
|
||||
2.25.1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
148
0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
Normal file
148
0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
Normal file
@ -0,0 +1,148 @@
|
||||
From 88bf3126312645843152c6c3215b54b120bcc1ec Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Mon, 7 Mar 2022 12:38:45 +0000
|
||||
Subject: [PATCH 6/6] rasdaemon: ras-mc-ctl: Relocate reading and display
|
||||
Kunpeng920 errors to under Kunpeng9xx
|
||||
|
||||
Relocate the reading and display of Kunpeng920 errors under Kunpeng9xx.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 38 ++++++++++----------------------------
|
||||
1 file changed, 10 insertions(+), 28 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index d728300..2ab9602 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1527,7 +1527,6 @@ sub errors
|
||||
|
||||
# Definitions of the vendor platform IDs.
|
||||
use constant {
|
||||
- HISILICON_KUNPENG_920 => "Kunpeng920",
|
||||
HISILICON_KUNPENG_9XX => "Kunpeng9xx",
|
||||
};
|
||||
|
||||
@@ -1549,8 +1548,8 @@ sub vendor_errors_summary
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- # HiSilicon Kunpeng920 errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_920) {
|
||||
+ # HiSilicon Kunpeng9xx common errors
|
||||
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1565,9 +1564,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1585,9 +1582,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1605,15 +1600,10 @@ sub vendor_errors_summary
|
||||
$out .= "\t$sub_module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n";
|
||||
+ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
- }
|
||||
|
||||
- # HiSilicon Kunpeng9xx common errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1629,8 +1619,6 @@ sub vendor_errors_summary
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng9xx common errors.\n\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1661,8 +1649,8 @@ sub vendor_errors
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- # HiSilicon Kunpeng920 errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_920) {
|
||||
+ # HiSilicon Kunpeng9xx common errors
|
||||
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1683,7 +1671,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1707,7 +1695,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1733,13 +1721,10 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
|
||||
+ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
- }
|
||||
|
||||
- # HiSilicon Kunpeng9xx common errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1767,8 +1752,6 @@ sub vendor_errors
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng9xx common errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1779,7 +1762,6 @@ sub vendor_errors
|
||||
sub vendor_platforms
|
||||
{
|
||||
print "\nSupported platforms for the vendor-specific errors:\n";
|
||||
- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n";
|
||||
print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n";
|
||||
print "\n";
|
||||
}
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -1,78 +0,0 @@
|
||||
From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Tue, 25 May 2021 20:07:26 +0800
|
||||
Subject: [PATCH 2/2] add trace print of new information and add it to sqlite
|
||||
|
||||
Since we add new information of the event, we add trace print and store it to
|
||||
Sqlite.
|
||||
|
||||
Signed-off-by: Luo Shengwei <luoshengwei@huawei.com>
|
||||
---
|
||||
ras-arm-handler.c | 10 ++++++++++
|
||||
ras-record.c | 8 ++++++++
|
||||
2 files changed, 18 insertions(+)
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 10d0099..23ad470 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -23,6 +23,13 @@
|
||||
#include "ras-cpu-isolation.h"
|
||||
|
||||
#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len)
|
||||
+{
|
||||
+ for (int i = 0; i < buf_len; ++i) {
|
||||
+ trace_seq_printf(s, "%2.2x", buf[i]);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static int is_core_failure(unsigned long value)
|
||||
{
|
||||
/*
|
||||
@@ -135,6 +142,7 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
case GHES_SEV_PANIC:
|
||||
ev.severity = "Fatal";
|
||||
}
|
||||
+ trace_seq_printf(s, "\n severity: %s", ev.severity);
|
||||
|
||||
if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
|
||||
int len, nums;
|
||||
@@ -142,6 +150,8 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
if (!ev.error_info)
|
||||
return -1;
|
||||
ev.length = len;
|
||||
+ trace_seq_printf(s, "\n processor_err_info: ");
|
||||
+ trace_print_hex(s, ev.error_info, len);
|
||||
/* relate to enum error_type */
|
||||
nums = count_errors(event, ev.error_info, len);
|
||||
if (nums > 0) {
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 549c494..33d4741 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = {
|
||||
{ .name="err_info", .type="BLOB" },
|
||||
{ .name="context_info", .type="BLOB" },
|
||||
{ .name="vendor_info", .type="BLOB" },
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ { .name="severity", .type="TEXT" },
|
||||
+ { .name="error_info", .type="BLOB" },
|
||||
+#endif
|
||||
};
|
||||
|
||||
static const struct db_table_descriptor arm_event_tab = {
|
||||
@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
|
||||
ev->ctx_error, ev->ctx_len, NULL);
|
||||
sqlite3_bind_blob (priv->stmt_arm_record, 9,
|
||||
ev->vsei_error, ev->oem_len, NULL);
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL);
|
||||
+ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL);
|
||||
+#endif
|
||||
|
||||
rc = sqlite3_step(priv->stmt_arm_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,60 +0,0 @@
|
||||
From 6b767a2fce615384f062ecb392cd332452bf4482 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Wed, 1 Sep 2021 21:00:16 +0800
|
||||
Subject: [PATCH] modify cpu parse for adapting to new bios version
|
||||
|
||||
---
|
||||
ras-cpu-isolation.c | 20 ++++++++++++++++++--
|
||||
1 file changed, 18 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index 6dcff70..b1643c4 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -25,6 +25,7 @@
|
||||
|
||||
static struct cpu_info *cpu_infos = NULL;
|
||||
static unsigned int ncores, cores_per_socket, cores_per_die;
|
||||
+static unsigned int cores_per_cluster = 4;
|
||||
static unsigned int sockets, dies = 1;
|
||||
static unsigned int enabled = 1;
|
||||
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
||||
@@ -432,18 +433,33 @@ static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size
|
||||
|
||||
static unsigned get_cpu_index(int64_t mpidr)
|
||||
{
|
||||
- unsigned core_id, socket_id, die_id, cpu;
|
||||
+ unsigned core_id, cluster_id, socket_id, die_id, cpu;
|
||||
/*
|
||||
* Adapt to certain BIOS
|
||||
* In the MPIDR:
|
||||
* bit 8:15: core id
|
||||
+ * bit 16:18: cluster id
|
||||
* bit 19:20: die_id
|
||||
* bit 21:22: socket_id
|
||||
*/
|
||||
core_id = get_bit_value(mpidr, 8, 8);
|
||||
+ cluster_id = get_bit_value(mpidr, 16, 3);
|
||||
socket_id = get_bit_value(mpidr, 21, 2);
|
||||
die_id = get_bit_value(mpidr, 19, 2);
|
||||
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die;
|
||||
+
|
||||
+ /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
|
||||
+ * it means TotemB. When cores per die equal to cores per socket, it means
|
||||
+ * that there is only one die in the socket, in case that the only die is
|
||||
+ * TotemB in CPU 1620s, we set die id to 0 directly.
|
||||
+ */
|
||||
+ if (cores_per_die == cores_per_socket) {
|
||||
+ die_id = 0;
|
||||
+ }
|
||||
+ else {
|
||||
+ die_id = (die_id == 1 ? 0:1);
|
||||
+ }
|
||||
+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
|
||||
+ cluster_id * cores_per_cluster;
|
||||
|
||||
return cpu;
|
||||
}
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.7
|
||||
Release: 1
|
||||
Release: 4
|
||||
License: GPLv2
|
||||
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
||||
URL: https://github.com/mchehab/rasdaemon.git
|
||||
@ -23,13 +23,18 @@ Patch1: bugfix-rasdaemon-wait-for-file-access.patch
|
||||
Patch2: bugfix-fix-fd-check.patch
|
||||
Patch3: bugfix-fix-disk-error-log-storm.patch
|
||||
Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch
|
||||
Patch5: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
|
||||
Patch6: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
|
||||
Patch7: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
|
||||
Patch8: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
|
||||
Patch9: 0006-add-cpu-online-fault-isolation.patch
|
||||
Patch10: 0007-add-trace-print-and-add-sqlite-store.patch
|
||||
Patch11: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
|
||||
Patch5: 0001-Support-cpu-fault-isolation-for-corrected-errors.patch
|
||||
Patch6: 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch
|
||||
Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
|
||||
Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
|
||||
Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
|
||||
Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
|
||||
Patch11: 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
|
||||
Patch12: 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch
|
||||
Patch13: 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch
|
||||
Patch14: 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch
|
||||
Patch15: 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
|
||||
Patch16: 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
|
||||
|
||||
%description
|
||||
The rasdaemon program is a daemon which monitors the platform
|
||||
@ -75,41 +80,43 @@ rm INSTALL %{buildroot}/usr/include/*.h
|
||||
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
|
||||
|
||||
%changelog
|
||||
* Mon Jan 17 2022 xujing<xujing99@huawei.com> - 0.6.7-1
|
||||
- DESC: Update software to v0.6.7
|
||||
|
||||
* Thu Dec 9 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-10
|
||||
* Mon Mar 07 2022 Shiju Jose<shiju.jose@huawei.com> - 0.6.7-4
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Enable compilation of the feature memory fault prediction based on
|
||||
- DESC:
|
||||
1. Modify recording of HiSilicon common error data in the rasdaemon.
|
||||
2. In the ras-mc-ctl,
|
||||
2.1. Improve Hisilicon common error statistics.
|
||||
2.2. Add support to display the HiSilicon vendor-errors for a specified module.
|
||||
2.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options.
|
||||
2.4. Reformat error info of the HiSilicon Kunpeng920.
|
||||
2.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
|
||||
|
||||
* Wed Mar 2 2022 tanxiaofei<tanxiaofei@huawei.com> - 0.6.7-3
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:
|
||||
1. Backport 4 patches from openEuler master branch.
|
||||
1) Fix the issue of sprintf data type mismatch in uuid_le()
|
||||
2) Fix the issue of command option -r for hip08
|
||||
3) Fix some print format issues for hisi common error section
|
||||
4) Add some modules supported by hisi common error section
|
||||
2.Enable compilation of the feature memory fault prediction based on
|
||||
corrected error.
|
||||
3.Fix changelog date error of this spec file.
|
||||
|
||||
* Thu Dec 2 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-9
|
||||
* Wed Feb 23 2022 luoshengwei<luoshengwei@huawei.com> - 0.6.7-2
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Backport memory failure feature, one patch.
|
||||
- DESC: Add cpu online fault isolation for arm event.
|
||||
|
||||
* Wed Oct 27 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-8
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Sync three patches, add cpu online fault isolation.
|
||||
* Wed Dec 8 2021 xujing <xujing99@huawei.com> - 0.6.7-1
|
||||
- Update software to v0.6.7
|
||||
|
||||
* Wed Oct 20 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-7
|
||||
- Type:Bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Backport one patch, and some little fixes and add some modules
|
||||
support for kunpeng series:
|
||||
1. Modify non-standard error decoding interface using linked list
|
||||
2. Fix the issue of sprintf data type mismatch in uuid_le()
|
||||
3. Fix the issue of command option -r for hip08
|
||||
4. Fix some print format issues for hisi common error section
|
||||
5. Add some modules supported by hisi common error section
|
||||
|
||||
* Sat July 29 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-6
|
||||
* Thu Jul 29 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-6
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user