diff --git a/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch deleted file mode 100644 index d17fb21..0000000 --- a/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch +++ /dev/null @@ -1,938 +0,0 @@ -From b9999d40d73dfff8b1cfb515f3b81b2c2891f6a7 Mon Sep 17 00:00:00 2001 -From: Shengwei Luo -Date: Wed, 23 Feb 2022 17:21:58 +0800 -Subject: [PATCH 01/10] rasdaemon: Support cpu fault isolation for corrected - errors - -When the corrected errors exceed the set limit in cycle, try to -offline the related cpu core. - -Signed-off-by: Shengwei Luo -Signed-off-by: Junchong Pan -Signed-off-by: Lei Feng -Signed-off-by: Shiju Jose ---- - Makefile.am | 6 +- - configure.ac | 11 ++ - misc/rasdaemon.env | 17 ++ - queue.c | 119 ++++++++++++++ - queue.h | 39 +++++ - ras-arm-handler.c | 97 +++++++++++ - ras-arm-handler.h | 18 ++ - ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 68 ++++++++ - ras-events.c | 9 +- - 10 files changed, 770 insertions(+), 2 deletions(-) - create mode 100644 queue.c - create mode 100644 queue.h - create mode 100644 ras-cpu-isolation.c - create mode 100644 ras-cpu-isolation.h - -diff --git a/Makefile.am b/Makefile.am -index a322b9a..36e7d4e 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -69,13 +69,17 @@ endif - if WITH_AMP_NS_DECODE - rasdaemon_SOURCES += non-standard-ampere.c - endif -+if WITH_CPU_FAULT_ISOLATION -+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c -+endif - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ -- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h -+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -+ ras-cpu-isolation.h queue.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -diff --git a/configure.ac b/configure.ac -index a77991f..e0ed751 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], - AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes]) - AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) - -+AC_ARG_ENABLE([cpu_fault_isolation], -+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) -+ -+AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [ -+ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation") -+ AC_SUBST([WITH_CPU_FAULT_ISOLATION]) -+]) -+AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) -+ - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc - - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -201,4 +211,5 @@ compile time options summary - Memory Failure : $USE_MEMORY_FAILURE - Memory CE PFA : $USE_MEMORY_CE_PFA - AMP RAS errors : $USE_AMP_NS_DECODE -+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION - EOF -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..7cb18e8 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" - # soft-then-hard First try to soft offline, then try hard offlining. - # Note: default offline choice is "soft". - PAGE_CE_ACTION="soft" -+ -+# CPU Online Fault Isolation -+# Whether to enable cpu online fault isolation (yes|no). -+CPU_ISOLATION_ENABLE="no" -+# Specify the threshold of CE numbers. -+# -+# Format: -+# [0-9]+[unit] -+# -+# Supported units: -+# CPU_CE_THRESHOLD: no unit -+# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second -+CPU_CE_THRESHOLD="18" -+CPU_ISOLATION_CYCLE="24h" -+ -+# Prevent excessive isolation from causing an avalanche effect -+CPU_ISOLATION_LIMIT="10" -\ No newline at end of file -diff --git a/queue.c b/queue.c -new file mode 100644 -index 0000000..65b6fb8 ---- /dev/null -+++ b/queue.c -@@ -0,0 +1,119 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+#include -+#include -+#include "queue.h" -+#include "ras-logger.h" -+ -+int is_empty(struct link_queue *queue) -+{ -+ if (queue) -+ return queue->size == 0; -+ -+ return 1; -+} -+ -+struct link_queue *init_queue(void) -+{ -+ struct link_queue *queue = NULL; -+ -+ queue = (struct link_queue *)malloc(sizeof(struct link_queue)); -+ if (queue == NULL) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); -+ return NULL; -+ } -+ -+ queue->size = 0; -+ queue->head = NULL; -+ queue->tail = NULL; -+ -+ return queue; -+} -+ -+void clear_queue(struct link_queue *queue) -+{ -+ if (queue == NULL) -+ return; -+ -+ struct queue_node *node = queue->head; -+ struct queue_node *tmp = NULL; -+ -+ while (node != NULL) { -+ tmp = node; -+ node = node->next; -+ free(tmp); -+ } -+ -+ queue->head = NULL; -+ queue->tail = NULL; -+ queue->size = 0; -+} -+ -+void free_queue(struct link_queue *queue) -+{ -+ clear_queue(queue); -+ -+ if (queue) -+ free(queue); -+} -+ -+/* It should be guranteed that the param is not NULL */ -+void push(struct link_queue *queue, struct queue_node *node) -+{ -+ /* there is no element in the queue */ -+ if (queue->head == NULL) -+ queue->head = node; -+ else -+ queue->tail->next = node; -+ -+ queue->tail = node; -+ (queue->size)++; -+} -+ -+int pop(struct link_queue *queue) -+{ -+ struct queue_node *tmp = NULL; -+ -+ if (queue == NULL || is_empty(queue)) -+ return -1; -+ -+ tmp = queue->head; -+ queue->head = queue->head->next; -+ free(tmp); -+ (queue->size)--; -+ -+ return 0; -+} -+ -+struct queue_node *front(struct link_queue *queue) -+{ -+ if (queue == NULL) -+ return NULL; -+ -+ return queue->head; -+} -+ -+struct queue_node *node_create(time_t time, unsigned int value) -+{ -+ struct queue_node *node = NULL; -+ -+ node = (struct queue_node *)malloc(sizeof(struct queue_node)); -+ if (node != NULL) { -+ node->time = time; -+ node->value = value; -+ node->next = NULL; -+ } -+ -+ return node; -+} -diff --git a/queue.h b/queue.h -new file mode 100644 -index 0000000..5459f40 ---- /dev/null -+++ b/queue.h -@@ -0,0 +1,39 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#ifndef __RAS_QUEUE_H -+#define __RAS_QUEUE_H -+ -+struct queue_node { -+ time_t time; -+ unsigned int value; -+ struct queue_node *next; -+}; -+ -+struct link_queue { -+ struct queue_node *head; -+ struct queue_node *tail; -+ int size; -+}; -+ -+int is_empty(struct link_queue *queue); -+struct link_queue *init_queue(void); -+void clear_queue(struct link_queue *queue); -+void free_queue(struct link_queue *queue); -+void push(struct link_queue *queue, struct queue_node *node); -+int pop(struct link_queue *queue); -+struct queue_node *front(struct link_queue *queue); -+struct queue_node *node_create(time_t time, unsigned int value); -+ -+#endif -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 1149dc6..9c7a3c3 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -22,6 +22,10 @@ - #include "ras-report.h" - #include "ras-non-standard-handler.h" - #include "non-standard-ampere.h" -+#include "ras-cpu-isolation.h" -+ -+#define ARM_ERR_VALID_ERROR_COUNT BIT(0) -+#define ARM_ERR_VALID_FLAGS BIT(1) - - void display_raw_data(struct trace_seq *s, - const uint8_t *buf, -@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s, - } - } - -+#ifdef HAVE_CPU_FAULT_ISOLATION -+static int count_errors(struct ras_arm_event *ev) -+{ -+ struct ras_arm_err_info *err_info; -+ int num_pei; -+ int err_info_size = sizeof(struct ras_arm_err_info); -+ int num = 0; -+ int i; -+ int error_count; -+ -+ if (ev->pei_len % err_info_size != 0) { -+ log(TERM, LOG_ERR, -+ "The event data does not match to the ARM Processor Error Information Structure\n"); -+ return num; -+ } -+ num_pei = ev->pei_len / err_info_size; -+ err_info = (struct ras_arm_err_info *)(ev->pei_error); -+ -+ for (i = 0; i < num_pei; ++i) { -+ error_count = 1; -+ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) { -+ /* -+ * The value of this field is defined as follows: -+ * 0: Single Error -+ * 1: Multiple Errors -+ * 2-65535: Error Count -+ */ -+ error_count = err_info->multiple_error + 1; -+ } -+ -+ num += error_count; -+ err_info += 1; -+ } -+ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); -+ return num; -+} -+ -+static int ras_handle_cpu_error(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, -+ struct ras_arm_event *ev, time_t now) -+{ -+ unsigned long long val; -+ int cpu; -+ char *severity; -+ struct error_info err_info; -+ -+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) -+ return -1; -+ cpu = val; -+ trace_seq_printf(s, "\n cpu: %d", cpu); -+ -+ /* record cpu error */ -+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) -+ return -1; -+ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */ -+ switch (val) { -+ case GHES_SEV_NO: -+ severity = "Informational"; -+ break; -+ case GHES_SEV_CORRECTED: -+ severity = "Corrected"; -+ break; -+ case GHES_SEV_RECOVERABLE: -+ severity = "Recoverable"; -+ break; -+ default: -+ case GHES_SEV_PANIC: -+ severity = "Fatal"; -+ } -+ trace_seq_printf(s, "\n severity: %s", severity); -+ -+ if (val == GHES_SEV_CORRECTED) { -+ int nums = count_errors(ev); -+ -+ if (nums > 0) { -+ err_info.nums = nums; -+ err_info.time = now; -+ err_info.err_type = val; -+ ras_record_cpu_error(&err_info, cpu); -+ } -+ } -+ -+ return 0; -+} -+#endif -+ - int ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) -@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, - struct tm *tm; - struct ras_arm_event ev; - int len = 0; -+ - memset(&ev, 0, sizeof(ev)); - - /* -@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s, - display_raw_data(s, ev.vsei_error, ev.oem_len); - #endif - -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0) -+ return -1; -+#endif -+ - /* Insert data into the SGBD */ - #ifdef HAVE_SQLITE3 - ras_store_arm_record(ras, &ev); -diff --git a/ras-arm-handler.h b/ras-arm-handler.h -index 563a2d3..52813e7 100644 ---- a/ras-arm-handler.h -+++ b/ras-arm-handler.h -@@ -17,6 +17,24 @@ - #include "ras-events.h" - #include "libtrace/event-parse.h" - -+/* -+ * ARM Processor Error Information Structure, According to -+ * UEFI_2_9 specification chapter N2.4.4. -+ */ -+#pragma pack(1) -+struct ras_arm_err_info { -+ uint8_t version; -+ uint8_t length; -+ uint16_t validation_bits; -+ uint8_t type; -+ uint16_t multiple_error; -+ uint8_t flags; -+ uint64_t error_info; -+ uint64_t virt_fault_addr; -+ uint64_t physical_fault_addr; -+}; -+#pragma pack() -+ - int ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context); -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -new file mode 100644 -index 0000000..abcf451 ---- /dev/null -+++ b/ras-cpu-isolation.c -@@ -0,0 +1,388 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "ras-logger.h" -+#include "ras-cpu-isolation.h" -+ -+#define SECOND_OF_MON (30 * 24 * 60 * 60) -+#define SECOND_OF_DAY (24 * 60 * 60) -+#define SECOND_OF_HOU (60 * 60) -+#define SECOND_OF_MIN (60) -+ -+#define LIMIT_OF_CPU_THRESHOLD 10000 -+#define INIT_OF_CPU_THRESHOLD 18 -+#define DEC_CHECK 10 -+#define LAST_BIT_OF_UL 5 -+ -+static struct cpu_info *cpu_infos; -+static unsigned int ncores; -+static unsigned int enabled = 1; -+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -+ -+static const struct param normal_units[] = { -+ {"", 1}, -+ {} -+}; -+ -+static const struct param cycle_units[] = { -+ {"d", SECOND_OF_DAY}, -+ {"h", SECOND_OF_HOU}, -+ {"m", SECOND_OF_MIN}, -+ {"s", 1}, -+ {} -+}; -+ -+static struct isolation_param threshold = { -+ .name = "CPU_CE_THRESHOLD", -+ .units = normal_units, -+ .value = INIT_OF_CPU_THRESHOLD, -+ .limit = LIMIT_OF_CPU_THRESHOLD -+}; -+ -+static struct isolation_param cpu_limit = { -+ .name = "CPU_ISOLATION_LIMIT", -+ .units = normal_units -+}; -+ -+static struct isolation_param cycle = { -+ .name = "CPU_ISOLATION_CYCLE", -+ .units = cycle_units, -+ .value = SECOND_OF_DAY, -+ .limit = SECOND_OF_MON -+}; -+ -+static const char * const cpu_state[] = { -+ [CPU_OFFLINE] = "offline", -+ [CPU_ONLINE] = "online", -+ [CPU_OFFLINE_FAILED] = "offline-failed", -+ [CPU_UNKNOWN] = "unknown" -+}; -+ -+static int open_sys_file(unsigned int cpu, int __oflag, const char *format) -+{ -+ int fd; -+ char path[MAX_PATH_LEN + 1] = ""; -+ char real_path[MAX_PATH_LEN + 1] = ""; -+ -+ snprintf(path, sizeof(path), format, cpu); -+ if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); -+ return -1; -+ } -+ fd = open(real_path, __oflag); -+ if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path); -+ return -1; -+ } -+ -+ return fd; -+} -+ -+static int get_cpu_status(unsigned int cpu) -+{ -+ int fd, num; -+ char buf[2] = ""; -+ -+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); -+ if (fd == -1) -+ return CPU_UNKNOWN; -+ -+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) -+ num = CPU_UNKNOWN; -+ -+ close(fd); -+ -+ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num; -+} -+ -+static int init_cpu_info(unsigned int cpus) -+{ -+ ncores = cpus; -+ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); -+ if (!cpu_infos) { -+ log(TERM, LOG_ERR, -+ "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ return -1; -+ } -+ -+ for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].ce_nums = 0; -+ cpu_infos[i].state = get_cpu_status(i); -+ cpu_infos[i].ce_queue = init_queue(); -+ -+ if (cpu_infos[i].ce_queue == NULL) { -+ log(TERM, LOG_ERR, -+ "Failed to allocate memory for cpu ce queue in %s.\n", __func__); -+ return -1; -+ } -+ } -+ /* set limit of offlined cpu limit according to number of cpu */ -+ cpu_limit.limit = cpus - 1; -+ cpu_limit.value = 0; -+ -+ return 0; -+} -+ -+static void check_config(struct isolation_param *config) -+{ -+ if (config->value > config->limit) { -+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", -+ config->value, config->limit); -+ config->value = config->limit; -+ } -+} -+ -+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) -+{ -+ char *unit = NULL; -+ int env_size, has_unit = 0; -+ -+ if (!env || strlen(env) == 0) -+ return -1; -+ -+ env_size = strlen(env); -+ unit = env + env_size - 1; -+ -+ if (isalpha(*unit)) { -+ has_unit = 1; -+ env_size--; -+ if (env_size <= 0) -+ return -1; -+ } -+ -+ for (int i = 0; i < env_size; ++i) { -+ if (isdigit(env[i])) { -+ if (*value > ULONG_MAX / DEC_CHECK || -+ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { -+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = DEC_CHECK * (*value) + (env[i] - '0'); -+ } else -+ return -1; -+ } -+ -+ if (!has_unit) -+ return 0; -+ -+ for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ if (*value > (ULONG_MAX / units->value)) { -+ log(TERM, LOG_ERR, -+ "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = (*value) * units->value; -+ return 0; -+ } -+ } -+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); -+ return -1; -+} -+ -+static void init_config(struct isolation_param *config) -+{ -+ char *env = getenv(config->name); -+ unsigned long value = 0; -+ -+ if (parse_ul_config(config, env, &value) < 0) { -+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n", -+ config->name, env, config->value); -+ return; -+ } -+ -+ config->value = value; -+ check_config(config); -+} -+ -+static int check_config_status(void) -+{ -+ char *env = getenv("CPU_ISOLATION_ENABLE"); -+ -+ if (env == NULL || strcasecmp(env, "yes")) -+ return -1; -+ -+ return 0; -+} -+ -+void ras_cpu_isolation_init(unsigned int cpus) -+{ -+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { -+ enabled = 0; -+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); -+ init_config(&threshold); -+ init_config(&cpu_limit); -+ init_config(&cycle); -+} -+ -+void cpu_infos_free(void) -+{ -+ if (cpu_infos) { -+ for (int i = 0; i < ncores; ++i) -+ free_queue(cpu_infos[i].ce_queue); -+ -+ free(cpu_infos); -+ } -+} -+ -+static int do_cpu_offline(unsigned int cpu) -+{ -+ int fd, rc; -+ char buf[2] = ""; -+ -+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; -+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); -+ if (fd == -1) -+ return HANDLE_FAILED; -+ -+ strcpy(buf, "0"); -+ rc = write(fd, buf, strlen(buf)); -+ if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno); -+ close(fd); -+ return HANDLE_FAILED; -+ } -+ -+ close(fd); -+ /* check wthether the cpu is isolated successfully */ -+ cpu_infos[cpu].state = get_cpu_status(cpu); -+ -+ if (cpu_infos[cpu].state == CPU_OFFLINE) -+ return HANDLE_SUCCEED; -+ -+ return HANDLE_FAILED; -+} -+ -+static int do_ce_handler(unsigned int cpu) -+{ -+ struct link_queue *queue = cpu_infos[cpu].ce_queue; -+ unsigned int tmp; -+ /* -+ * Since we just count all error numbers in setted cycle, we store the time -+ * and error numbers from current event to the queue, then everytime we -+ * calculate the period from beginning time to ending time, if the period -+ * exceeds setted cycle, we pop the beginning time and error until the period -+ * from new beginning time to ending time is less than cycle. -+ */ -+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { -+ tmp = queue->head->value; -+ if (pop(queue) == 0) -+ cpu_infos[cpu].ce_nums -= tmp; -+ } -+ log(TERM, LOG_INFO, -+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", -+ cpu, cpu_infos[cpu].ce_nums); -+ -+ if (cpu_infos[cpu].ce_nums >= threshold.value) { -+ log(TERM, LOG_INFO, -+ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", -+ threshold.value, cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; -+} -+ -+static int error_handler(unsigned int cpu, struct error_info *err_info) -+{ -+ int ret = HANDLE_NOTHING; -+ -+ switch (err_info->err_type) { -+ case CE: -+ ret = do_ce_handler(cpu); -+ break; -+ default: -+ break; -+ } -+ -+ return ret; -+} -+ -+static void record_error_info(unsigned int cpu, struct error_info *err_info) -+{ -+ switch (err_info->err_type) { -+ case CE: -+ { -+ struct queue_node *node = node_create(err_info->time, err_info->nums); -+ -+ if (node == NULL) { -+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); -+ return; -+ } -+ push(cpu_infos[cpu].ce_queue, node); -+ cpu_infos[cpu].ce_nums += err_info->nums; -+ break; -+ } -+ default: -+ break; -+ } -+} -+ -+void ras_record_cpu_error(struct error_info *err_info, int cpu) -+{ -+ int ret; -+ -+ if (enabled == 0) -+ return; -+ -+ if (cpu >= ncores || cpu < 0) { -+ log(TERM, LOG_ERR, -+ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); -+ cpu_infos[cpu].state = get_cpu_status(cpu); -+ -+ if (cpu_infos[cpu].state != CPU_ONLINE) { -+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); -+ return; -+ } -+ -+ record_error_info(cpu, err_info); -+ /* -+ * Since user may change cpu state, we get current offlined -+ * cpu numbers every recording time. -+ */ -+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { -+ log(TERM, LOG_WARNING, -+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", -+ cpu_limit.value); -+ return; -+ } -+ -+ ret = error_handler(cpu, err_info); -+ if (ret == HANDLE_NOTHING) -+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); -+ else if (ret == HANDLE_SUCCEED) { -+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+ clear_queue(cpu_infos[cpu].ce_queue); -+ cpu_infos[cpu].ce_nums = 0; -+ } else -+ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+} -diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h -new file mode 100644 -index 0000000..1159853 ---- /dev/null -+++ b/ras-cpu-isolation.h -@@ -0,0 +1,68 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#ifndef __RAS_CPU_ISOLATION_H -+#define __RAS_CPU_ISOLATION_H -+ -+#include "queue.h" -+ -+#define MAX_PATH_LEN 100 -+#define MAX_BUF_LEN 1024 -+ -+struct param { -+ char *name; -+ unsigned long value; -+}; -+ -+struct isolation_param { -+ char *name; -+ const struct param *units; -+ unsigned long value; -+ unsigned long limit; -+}; -+ -+enum cpu_state { -+ CPU_OFFLINE, -+ CPU_ONLINE, -+ CPU_OFFLINE_FAILED, -+ CPU_UNKNOWN, -+}; -+ -+enum error_handle_result { -+ HANDLE_FAILED = -1, -+ HANDLE_SUCCEED, -+ HANDLE_NOTHING, -+}; -+ -+enum error_type { -+ CE = 1 -+}; -+ -+struct cpu_info { -+ unsigned long ce_nums; -+ struct link_queue *ce_queue; -+ enum cpu_state state; -+}; -+ -+struct error_info { -+ unsigned long nums; -+ time_t time; -+ enum error_type err_type; -+}; -+ -+void ras_cpu_isolation_init(unsigned int cpus); -+void ras_record_cpu_error(struct error_info *err_info, int cpu); -+void cpu_infos_free(void); -+ -+#endif -diff --git a/ras-events.c b/ras-events.c -index 39cab20..beda655 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -42,6 +42,7 @@ - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" -+#include "ras-cpu-isolation.h" - - /* - * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -856,6 +857,10 @@ int handle_ras_events(int record_events) - - cpus = get_num_cpus(ras); - -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ ras_cpu_isolation_init(cpus); -+#endif -+ - #ifdef HAVE_MCE - rc = register_mce_handler(ras, cpus); - if (rc) -@@ -982,6 +987,8 @@ err: - } - free(ras); - } -- -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ cpu_infos_free(); -+#endif - return rc; - } --- -2.25.1 - diff --git a/0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch b/0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch deleted file mode 100644 index 2409a51..0000000 --- a/0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch +++ /dev/null @@ -1,46 +0,0 @@ -From: Xiaofei Tan -Date: Sat, 20 Aug 2022 09:49:25 +0000 -Subject: [PATCH] rasdaemon: use standard length PATH_MAX for path name - -Use standard length PATH_MAX for path name space allocation -to replace the macro MAX_PATH_LEN. - -Signed-off-by: Xiaofei Tan ---- - ras-cpu-isolation.c | 6 +++--- - ras-cpu-isolation.h | 1 - - 2 files changed, 3 insertions(+), 4 deletions(-) - -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index ba5ccd1..24c07e9 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -80,11 +80,11 @@ static const char * const cpu_state[] = { - static int open_sys_file(unsigned int cpu, int __oflag, const char *format) - { - int fd; -- char path[MAX_PATH_LEN + 1] = ""; -- char real_path[MAX_PATH_LEN + 1] = ""; -+ char path[PATH_MAX] = ""; -+ char real_path[PATH_MAX] = ""; - - snprintf(path, sizeof(path), format, cpu); -- if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) { -+ if (strlen(path) > PATH_MAX || realpath(path, real_path) == NULL) { - log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); - return -1; - } -diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h -index 024a68b..5682106 100644 ---- a/ras-cpu-isolation.h -+++ b/ras-cpu-isolation.h -@@ -17,7 +17,6 @@ - - #include "queue.h" - --#define MAX_PATH_LEN 100 - #define MAX_BUF_LEN 1024 - - struct param { --- -2.17.1 diff --git a/0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch b/0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch deleted file mode 100644 index 85ae07d..0000000 --- a/0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 6986d818e6d2c846c001fc7211b5a4153e5ecd11 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 4 Feb 2023 19:15:55 +0000 -Subject: [PATCH] rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks - indefinitely - -The error events are not received in the rasdaemon since kernel 6.1-rc6. -This issue is firstly detected and reported, when testing the CXL error -events in the rasdaemon. - -Debugging showed, poll() on trace_pipe_raw in the ras-events.c do not -return and this issue is seen after the commit -42fb0a1e84ff525ebe560e2baf9451ab69127e2b ("tracing/ring-buffer: Have -polling block on watermark"). - -This issue is also verified using a test application for poll() -and select() on per_cpu trace_pipe_raw. - -There is also a bug reported on this issue, -https://lore.kernel.org/all/31eb3b12-3350-90a4-a0d9-d1494db7cf74@oracle.com/ - -This issue occurs for the per_cpu case, which calls the ring_buffer_poll_wait(), -in kernel/trace/ring_buffer.c, with the buffer_percent > 0 and then wait until -the percentage of pages are available. The default value set for the -buffer_percent is 50 in the kernel/trace/trace.c. However poll() does not return -even met the percentage of pages condition. - -As a fix, rasdaemon set buffer_percent as 0 through the -/sys/kernel/debug/tracing/instances/rasdaemon/buffer_percent, then the -task will wake up as soon as data is added to any of the specific cpu -buffer and poll() on per_cpu/cpuX/trace_pipe_raw does not block -indefinitely. - -Dependency on the kernel fix commit -3e46d910d8acf94e5360126593b68bf4fee4c4a1("tracing: Fix poll() and select() -do not work on per_cpu trace_pipe and trace_pipe_raw") - -Signed-off-by: Shiju Jose ---- - ras-events.c | 22 ++++++++++++++++++++++ - 1 file changed, 22 insertions(+) - -diff --git a/ras-events.c b/ras-events.c -index 39f9ce2..49e4f9a 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -376,6 +376,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - int warnonce[n_cpus]; - char pipe_raw[PATH_MAX]; - int legacy_kernel = 0; -+ int fd; -+ char buf[16]; - #if 0 - int need_sleep = 0; - #endif -@@ -395,6 +397,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - return -ENOMEM; - } - -+ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks -+ * indefinitely with the default buffer_percent in the kernel trace system, -+ * which is introduced by the following change in the kernel. -+ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. -+ * Set buffer_percent to 0 so that poll() will return immediately -+ * when the trace data is available in the ras per_cpu trace pipe_raw -+ */ -+ fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY); -+ if (fd >= 0) { -+ /* For the backward compatibility to the old kernels, do not return -+ * if fail to set the buffer_percent. -+ */ -+ snprintf(buf, sizeof(buf), "0"); -+ size = write(fd, buf, strlen(buf)); -+ if (size <= 0) -+ log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); -+ close(fd); -+ } else -+ log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); -+ - for (i = 0; i < (n_cpus + 1); i++) - fds[i].fd = -1; - --- -2.25.1 - diff --git a/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch deleted file mode 100644 index e401fa9..0000000 --- a/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch +++ /dev/null @@ -1,150 +0,0 @@ -From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001 -From: Shengwei Luo -Date: Wed, 23 Feb 2022 17:23:27 +0800 -Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable - errors - -When the recoverable errors in cpu core occurred, try to offline -the related cpu core. - -Signed-off-by: Shengwei Luo -Signed-off-by: Junchong Pan -Signed-off-by: Lei Feng -Signed-off-by: Shiju Jose ---- - ras-arm-handler.c | 22 +++++++++++++++++++--- - ras-cpu-isolation.c | 17 +++++++++++++++++ - ras-cpu-isolation.h | 4 +++- - 3 files changed, 39 insertions(+), 4 deletions(-) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 9c7a3c3..a0dfc51 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -26,6 +26,7 @@ - - #define ARM_ERR_VALID_ERROR_COUNT BIT(0) - #define ARM_ERR_VALID_FLAGS BIT(1) -+#define BIT2 2 - - void display_raw_data(struct trace_seq *s, - const uint8_t *buf, -@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s, - } - - #ifdef HAVE_CPU_FAULT_ISOLATION --static int count_errors(struct ras_arm_event *ev) -+static int is_core_failure(struct ras_arm_err_info *err_info) -+{ -+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { -+ /* -+ * core failure: -+ * Bit 0\1\3: (at lease 1) -+ * Bit 2: 0 -+ */ -+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2)); -+ } -+ return 0; -+} -+ -+static int count_errors(struct ras_arm_event *ev, int sev) - { - struct ras_arm_err_info *err_info; - int num_pei; -@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev) - */ - error_count = err_info->multiple_error + 1; - } -+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) -+ error_count = 0; - - num += error_count; - err_info += 1; -@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s, - } - trace_seq_printf(s, "\n severity: %s", severity); - -- if (val == GHES_SEV_CORRECTED) { -- int nums = count_errors(ev); -+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { -+ int nums = count_errors(ev, val); - - if (nums > 0) { - err_info.nums = nums; -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index abcf451..fd23e4e 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus) - - for (unsigned int i = 0; i < cpus; ++i) { - cpu_infos[i].ce_nums = 0; -+ cpu_infos[i].uce_nums = 0; - cpu_infos[i].state = get_cpu_status(i); - cpu_infos[i].ce_queue = init_queue(); - -@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu) - return HANDLE_NOTHING; - } - -+static int do_uce_handler(unsigned int cpu) -+{ -+ if (cpu_infos[cpu].uce_nums > 0) { -+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; -+} -+ - static int error_handler(unsigned int cpu, struct error_info *err_info) - { - int ret = HANDLE_NOTHING; -@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) - case CE: - ret = do_ce_handler(cpu); - break; -+ case UCE: -+ ret = do_uce_handler(cpu); -+ break; - default: - break; - } -@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) - cpu_infos[cpu].ce_nums += err_info->nums; - break; - } -+ case UCE: -+ cpu_infos[cpu].uce_nums++; -+ break; - default: - break; - } -@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) - cpu, cpu_state[cpu_infos[cpu].state]); - clear_queue(cpu_infos[cpu].ce_queue); - cpu_infos[cpu].ce_nums = 0; -+ cpu_infos[cpu].uce_nums = 0; - } else - log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", - cpu, cpu_state[cpu_infos[cpu].state]); -diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h -index 1159853..024a68b 100644 ---- a/ras-cpu-isolation.h -+++ b/ras-cpu-isolation.h -@@ -46,10 +46,12 @@ enum error_handle_result { - }; - - enum error_type { -- CE = 1 -+ CE = 1, -+ UCE - }; - - struct cpu_info { -+ unsigned long uce_nums; - unsigned long ce_nums; - struct link_queue *ce_queue; - enum cpu_state state; --- -2.25.1 - diff --git a/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch b/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch index 89ecf80..bd229cf 100644 --- a/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch +++ b/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch @@ -82,7 +82,7 @@ index 9941e68..8fd7117 100644 #include #include #include --#include "libtrace/kbuffer.h" +-#include -#include "ras-memory-failure-handler.h" #include "ras-record.h" #include "ras-logger.h" diff --git a/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch deleted file mode 100644 index c51e35a..0000000 --- a/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +++ /dev/null @@ -1,228 +0,0 @@ -From 9c4665f33c39ea84db7d69079ab27205d2fbd07e Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Wed, 2 Mar 2022 12:20:40 +0000 -Subject: [PATCH 03/10] rasdaemon: Modify recording Hisilicon common error data - -The error statistics for the Hisilicon common -error need to do based on module, error severity etc. - -Modify recording Hisilicon common error data as separate fields -in the sql db table instead of the combined single field. - -Signed-off-by: Shiju Jose ---- - non-standard-hisilicon.c | 126 ++++++++++++++++++++++++++++++++------- - 1 file changed, 104 insertions(+), 22 deletions(-) - -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 1432163..d1e1774 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -17,6 +17,7 @@ - #include "non-standard-hisilicon.h" - - #define HISI_BUF_LEN 2048 -+#define HISI_PCIE_INFO_BUF_LEN 256 - - struct hisi_common_error_section { - uint32_t val_bits; -@@ -63,12 +64,25 @@ enum { - enum { - HISI_COMMON_FIELD_ID, - HISI_COMMON_FIELD_TIMESTAMP, -- HISI_COMMON_FIELD_ERR_INFO, -+ HISI_COMMON_FIELD_VERSION, -+ HISI_COMMON_FIELD_SOC_ID, -+ HISI_COMMON_FIELD_SOCKET_ID, -+ HISI_COMMON_FIELD_TOTEM_ID, -+ HISI_COMMON_FIELD_NIMBUS_ID, -+ HISI_COMMON_FIELD_SUB_SYSTEM_ID, -+ HISI_COMMON_FIELD_MODULE_ID, -+ HISI_COMMON_FIELD_SUB_MODULE_ID, -+ HISI_COMMON_FIELD_CORE_ID, -+ HISI_COMMON_FIELD_PORT_ID, -+ HISI_COMMON_FIELD_ERR_TYPE, -+ HISI_COMMON_FIELD_PCIE_INFO, -+ HISI_COMMON_FIELD_ERR_SEVERITY, - HISI_COMMON_FIELD_REGS_DUMP, - }; - - struct hisi_event { - char error_msg[HISI_BUF_LEN]; -+ char pcie_info[HISI_PCIE_INFO_BUF_LEN]; - char reg_msg[HISI_BUF_LEN]; - }; - -@@ -132,14 +146,26 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) - - #ifdef HAVE_SQLITE3 - static const struct db_fields hisi_common_section_fields[] = { -- { .name = "id", .type = "INTEGER PRIMARY KEY" }, -- { .name = "timestamp", .type = "TEXT" }, -- { .name = "err_info", .type = "TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "version", .type = "INTEGER" }, -+ { .name = "soc_id", .type = "INTEGER" }, -+ { .name = "socket_id", .type = "INTEGER" }, -+ { .name = "totem_id", .type = "INTEGER" }, -+ { .name = "nimbus_id", .type = "INTEGER" }, -+ { .name = "sub_system_id", .type = "INTEGER" }, -+ { .name = "module_id", .type = "TEXT" }, -+ { .name = "sub_module_id", .type = "INTEGER" }, -+ { .name = "core_id", .type = "INTEGER" }, -+ { .name = "port_id", .type = "INTEGER" }, -+ { .name = "err_type", .type = "INTEGER" }, -+ { .name = "pcie_info", .type = "TEXT" }, -+ { .name = "err_severity", .type = "TEXT" }, - { .name = "regs_dump", .type = "TEXT" }, - }; - - static const struct db_table_descriptor hisi_common_section_tab = { -- .name = "hisi_common_section", -+ .name = "hisi_common_section_v2", - .fields = hisi_common_section_fields, - .num_fields = ARRAY_SIZE(hisi_common_section_fields), - }; -@@ -199,12 +225,20 @@ static const char* get_soc_desc(uint8_t soc_id) - return soc_desc[soc_id]; - } - --static void decode_module(struct hisi_event *event, uint8_t module_id) -+static void decode_module(struct ras_ns_ev_decoder *ev_decoder, -+ struct hisi_event *event, uint8_t module_id) - { -- if (module_id >= sizeof(module_name)/sizeof(char *)) -+ if (module_id >= sizeof(module_name)/sizeof(char *)) { - HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id); -- else -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_MODULE_ID, -+ 0, "unknown"); -+ } else { - HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_MODULE_ID, -+ 0, module_name[module_id]); -+ } - } - - static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, -@@ -212,43 +246,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, - struct hisi_event *event) - { - HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version); -- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_VERSION, -+ err->version, NULL); -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) { - HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SOC_ID, -+ err->soc_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) { - HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SOCKET_ID, -+ err->socket_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) { - HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_TOTEM_ID, -+ err->totem_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) { - HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_NIMBUS_ID, -+ err->nimbus_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) { - HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SUB_SYSTEM_ID, -+ err->subsystem_id, NULL); -+ } - - if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) -- decode_module(event, err->module_id); -+ decode_module(ev_decoder, event, err->module_id); - -- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) { - HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SUB_MODULE_ID, -+ err->submodule_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) { - HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_CORE_ID, -+ err->core_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) { - HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_PORT_ID, -+ err->port_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) { - HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_ERR_TYPE, -+ err->err_type, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) { - HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", - err->pcie_info.segment, err->pcie_info.bus, - err->pcie_info.device, err->pcie_info.function); -+ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x", -+ err->pcie_info.segment, err->pcie_info.bus, -+ err->pcie_info.device, err->pcie_info.function); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_PCIE_INFO, -+ 0, event->pcie_info); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) { - HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity)); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_ERR_SEVERITY, -+ 0, err_severity(err->err_severity)); -+ } - - HISI_SNPRINTF(event->error_msg, "]"); - } -@@ -293,8 +377,6 @@ static int decode_hisi_common_section(struct ras_events *ras, - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_TIMESTAMP, - 0, event->timestamp); -- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); - step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); --- -2.25.1 - diff --git a/0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch b/0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch deleted file mode 100644 index e1c86b4..0000000 --- a/0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch +++ /dev/null @@ -1,35 +0,0 @@ -From c46f65e1315aab8585e24d24223bd56c8931202a Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 31 Oct 2022 18:36:26 +0800 -Subject: [PATCH 4/4] rasdaemon: Add four modules supported by HiSilicon common - section - -Add four modules supported by HiSilicon common error section. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisilicon.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 0ddb5ec..7296d28 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -214,7 +214,11 @@ static const char* module_name[] = { - "Tsensor", - "ROH", - "BTC", -- "HILINK" -+ "HILINK", -+ "STARS", -+ "SDMA", -+ "UC", -+ "HBMC", - }; - - static const char* get_soc_desc(uint8_t soc_id) --- -2.25.1 - diff --git a/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch deleted file mode 100644 index 8963d91..0000000 --- a/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +++ /dev/null @@ -1,97 +0,0 @@ -From 4f706ff3b1a04de3be506a309e153b99e04b3445 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Thu, 24 Feb 2022 18:02:14 +0000 -Subject: [PATCH 04/10] rasdaemon: ras-mc-ctl: Modify error statistics for - HiSilicon KunPeng9xx common errors - -Modify the error statistics for the HiSilicon KunPeng9xx platforms common errors -to display the statistics and error info based on the module and the error severity. - -Signed-off-by: Shiju Jose ---- - util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++----------- - 1 file changed, 29 insertions(+), 11 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index b22dd60..08eb287 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1537,7 +1537,7 @@ sub vendor_errors_summary - require DBI; - my ($num_args, $platform_id); - my ($query, $query_handle, $count, $out); -- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info); -+ my ($module_id, $sub_module_id, $err_severity, $err_sev); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1614,13 +1614,18 @@ sub vendor_errors_summary - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $query = "select err_info, count(*) from hisi_common_section"; -+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -- $query_handle->bind_columns(\($err_info, $count)); -+ $query_handle->bind_columns(\($err_severity, $module_id, $count)); - $out = ""; -+ $err_sev = ""; - while($query_handle->fetch()) { -- $out .= "\terrors: $count\n"; -+ if ($err_severity ne $err_sev) { -+ $out .= "$err_severity errors:\n"; -+ $err_sev = $err_severity; -+ } -+ $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -@@ -1638,8 +1643,8 @@ sub vendor_errors - require DBI; - my ($num_args, $platform_id); - my ($query, $query_handle, $id, $timestamp, $out); -- my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id); -- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs); -+ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); -+ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1727,15 +1732,28 @@ sub vendor_errors - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id"; -+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs)); -+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp "; -- $out .= "Error Info:$err_info \n" if ($err_info); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "$id. $timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "totem_id=$totem_id, " if ($totem_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "core_id=$core_id, " if ($core_id); -+ $out .= "port_id=$port_id, " if ($port_id); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "pcie_info=$pcie_info, " if ($pcie_info); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs" if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events:\n$out\n"; --- -2.25.1 - diff --git a/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch deleted file mode 100644 index 2ff9537..0000000 --- a/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +++ /dev/null @@ -1,56 +0,0 @@ -From f5c3c03039be28bb6b5bbe00e12e9586b19a1060 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 5 Mar 2022 16:18:55 +0000 -Subject: [PATCH 05/10] rasdaemon: ras-mc-ctl: Reformat error info of the - HiSilicon Kunpeng920 - -Reformat the code to display the error info of HiSilicon Kunpeng920. - -Signed-off-by: Shiju Jose ---- - util/ras-mc-ctl.in | 15 +++++++++------ - 1 file changed, 9 insertions(+), 6 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 08eb287..8755b6f 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1671,8 +1671,9 @@ sub vendor_errors - $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); - $out .= "module_id=$module_id, " if ($module_id); - $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, \n" if ($err_severity); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -@@ -1694,8 +1695,9 @@ sub vendor_errors - $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); - $out .= "module_id=$module_id, " if ($module_id); - $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, \n" if ($err_severity); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -@@ -1719,8 +1721,9 @@ sub vendor_errors - $out .= "core_id=$core_id, " if ($core_id); - $out .= "port_id=$port_id, " if ($port_id); - $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "err_type=$err_type, \n" if ($err_type); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; --- -2.25.1 - diff --git a/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch deleted file mode 100644 index 1ff38e3..0000000 --- a/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +++ /dev/null @@ -1,37 +0,0 @@ -From d595a9d61f9d8341a5e30d4d800e3237d6e0f390 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 5 Mar 2022 17:01:35 +0000 -Subject: [PATCH 06/10] rasdaemon: ras-mc-ctl: Add printing usage if necessary - parameters are not passed for the vendor-error options - -Add printing usage if necessary parameters are not passed -for the vendor-errors options. - -Signed-off-by: Shiju Jose ---- - util/ras-mc-ctl.in | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 8755b6f..959ea6b 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1544,6 +1544,7 @@ sub vendor_errors_summary - if ($num_args ne 0) { - $platform_id = $ARGV[0]; - } else { -+ usage(1); - return; - } - -@@ -1651,6 +1652,7 @@ sub vendor_errors - if ($num_args ne 0) { - $platform_id = $ARGV[0]; - } else { -+ usage(1); - return; - } - --- -2.25.1 - diff --git a/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch deleted file mode 100644 index 6af2ad0..0000000 --- a/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +++ /dev/null @@ -1,274 +0,0 @@ -From 0643011831e5fb4e81edff16ad55f9a5196ec7a9 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 5 Mar 2022 18:19:38 +0000 -Subject: [PATCH 07/10] rasdaemon: ras-mc-ctl: Add support to display the - HiSilicon vendor errors for a specified module - -Add support to display the HiSilicon vendor errors for a specified module. - -Signed-off-by: Shiju Jose ---- - util/ras-mc-ctl.in | 145 +++++++++++++++++++++++++++------------------ - 1 file changed, 87 insertions(+), 58 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 959ea6b..296eb87 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -96,8 +96,9 @@ Usage: $prog [OPTIONS...] - --errors Shows the errors stored at the error database. - --error-count Shows the corrected and uncorrected error counts using sysfs. - --vendor-errors-summary Presents a summary of the vendor-specific logged errors. -- --vendor-errors Shows the vendor-specific errors stored in the error database. -- --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. -+ --vendor-errors Shows the vendor-specific errors stored in the error database. -+ --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. -+ --vendor-platforms List the supported platforms with platform-ids for the vendor-specific errors. - --help This help message. - EOF - -@@ -1535,12 +1536,14 @@ use constant { - sub vendor_errors_summary - { - require DBI; -- my ($num_args, $platform_id); -+ my ($num_args, $platform_id, $found_platform); - my ($query, $query_handle, $count, $out); - my ($module_id, $sub_module_id, $err_severity, $err_sev); - - $num_args = $#ARGV + 1; - $platform_id = 0; -+ $found_platform = 0; -+ - if ($num_args ne 0) { - $platform_id = $ARGV[0]; - } else { -@@ -1552,6 +1555,7 @@ sub vendor_errors_summary - - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1615,6 +1619,7 @@ sub vendor_errors_summary - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -+ $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1636,21 +1641,31 @@ sub vendor_errors_summary - $query_handle->finish; - } - -+ if ($platform_id && !($found_platform)) { -+ print "Platform ID $platform_id is not valid\n"; -+ } -+ - undef($dbh); - } - - sub vendor_errors - { - require DBI; -- my ($num_args, $platform_id); -+ my ($num_args, $platform_id, $found_platform, $module, $found_module); - my ($query, $query_handle, $id, $timestamp, $out); - my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); - my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); - - $num_args = $#ARGV + 1; - $platform_id = 0; -+ $found_platform = 0; -+ $module = 0; -+ $found_module = 0; - if ($num_args ne 0) { - $platform_id = $ARGV[0]; -+ if ($num_args gt 1) { -+ $module = $ARGV[1]; -+ } - } else { - usage(1); - return; -@@ -1660,27 +1675,29 @@ sub vendor_errors - - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs " if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { -+ $out .= "$id. $timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type1 errors.\n"; - } - $query_handle->finish; - -@@ -1690,21 +1707,22 @@ sub vendor_errors - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs " if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { -+ $out .= "$id. $timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type2 errors.\n"; - } - $query_handle->finish; - -@@ -1714,51 +1732,56 @@ sub vendor_errors - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "core_id=$core_id, " if ($core_id); -- $out .= "port_id=$port_id, " if ($port_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "err_type=$err_type, " if ($err_type); -- $out .= "Error Registers: $regs " if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { -+ $out .= "$id. $timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "core_id=$core_id, " if ($core_id); -+ $out .= "port_id=$port_id, " if ($port_id); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 PCIe controller errors.\n"; - } - $query_handle->finish; - } - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -+ $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "totem_id=$totem_id, " if ($totem_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "core_id=$core_id, " if ($core_id); -- $out .= "port_id=$port_id, " if ($port_id); -- $out .= "err_type=$err_type, " if ($err_type); -- $out .= "pcie_info=$pcie_info, " if ($pcie_info); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs" if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { -+ $out .= "$id. $timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "totem_id=$totem_id, " if ($totem_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "core_id=$core_id, " if ($core_id); -+ $out .= "port_id=$port_id, " if ($port_id); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "pcie_info=$pcie_info, " if ($pcie_info); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs" if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events:\n$out\n"; -@@ -1768,6 +1791,12 @@ sub vendor_errors - $query_handle->finish; - } - -+ if ($platform_id && !($found_platform)) { -+ print "Platform ID $platform_id is not valid\n"; -+ } elsif ($module && !($found_module)) { -+ print "No error record for the module $module\n"; -+ } -+ - undef($dbh); - } - --- -2.25.1 - diff --git a/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch deleted file mode 100644 index 0453e04..0000000 --- a/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +++ /dev/null @@ -1,150 +0,0 @@ -From 2f23b5dc6e5831c8ef2e179bb936e13502f75041 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Mon, 7 Mar 2022 12:38:45 +0000 -Subject: [PATCH 08/10] rasdaemon: ras-mc-ctl: Relocate reading and display - Kunpeng920 errors to under Kunpeng9xx - -Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. - -Signed-off-by: Shiju Jose ---- - util/ras-mc-ctl.in | 40 ++++++++++------------------------------ - 1 file changed, 10 insertions(+), 30 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 296eb87..75981a0 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1529,7 +1529,6 @@ sub errors - - # Definitions of the vendor platform IDs. - use constant { -- HISILICON_KUNPENG_920 => "Kunpeng920", - HISILICON_KUNPENG_9XX => "Kunpeng9xx", - }; - -@@ -1553,8 +1552,8 @@ sub vendor_errors_summary - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng920 errors -- if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx errors -+ if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); -@@ -1570,9 +1569,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n"; -+ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1590,9 +1587,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n"; -+ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1610,16 +1605,10 @@ sub vendor_errors_summary - $out .= "\t$sub_module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n"; -+ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; - } - $query_handle->finish; -- } - -- # HiSilicon Kunpeng9xx common errors -- if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1635,8 +1624,6 @@ sub vendor_errors_summary - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng9xx common errors.\n\n"; - } - $query_handle->finish; - } -@@ -1673,8 +1660,8 @@ sub vendor_errors - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng920 errors -- if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx errors -+ if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); -@@ -1697,7 +1684,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -+ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1722,7 +1709,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -+ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1749,14 +1736,10 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; -+ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; - } - $query_handle->finish; -- } - -- # HiSilicon Kunpeng9xx common errors -- if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1785,8 +1768,6 @@ sub vendor_errors - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng9xx common errors.\n"; - } - $query_handle->finish; - } -@@ -1803,7 +1784,6 @@ sub vendor_errors - sub vendor_platforms - { - print "\nSupported platforms for the vendor-specific errors:\n"; -- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n"; - print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; - print "\n"; - } --- -2.25.1 - diff --git a/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch deleted file mode 100644 index e34f89f..0000000 --- a/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch +++ /dev/null @@ -1,127 +0,0 @@ -From df6011fed2bb45989f9e5c2ea30b33937b08d06c Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Thu, 28 Apr 2022 18:58:43 +0100 -Subject: [PATCH 09/10] rasdaemon: ras-mc-ctl: Updated HiSilicon platform name - -Updated the HiSilicon platform name as KunPeng9xx. - -Signed-off-by: Shiju Jose ---- - util/ras-mc-ctl.in | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 75981a0..1cc19b3 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1529,7 +1529,7 @@ sub errors - - # Definitions of the vendor platform IDs. - use constant { -- HISILICON_KUNPENG_9XX => "Kunpeng9xx", -+ HISILICON_KUNPENG_9XX => "KunPeng9xx", - }; - - sub vendor_errors_summary -@@ -1552,7 +1552,7 @@ sub vendor_errors_summary - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng9xx errors -+ # HiSilicon KunPeng9xx errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; -@@ -1569,7 +1569,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type1 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1587,7 +1587,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type2 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1605,7 +1605,7 @@ sub vendor_errors_summary - $out .= "\t$sub_module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx PCIe controller error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1623,7 +1623,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx common error events summary:\n$out\n"; - } - $query_handle->finish; - } -@@ -1660,7 +1660,7 @@ sub vendor_errors - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng9xx errors -+ # HiSilicon KunPeng9xx errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; -@@ -1684,7 +1684,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type1 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1709,7 +1709,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type2 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1736,7 +1736,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx PCIe controller error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1767,7 +1767,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx common error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx common error events:\n$out\n"; - } - $query_handle->finish; - } -@@ -1784,7 +1784,7 @@ sub vendor_errors - sub vendor_platforms - { - print "\nSupported platforms for the vendor-specific errors:\n"; -- print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; -+ print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; - print "\n"; - } - --- -2.25.1 - diff --git a/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch deleted file mode 100644 index 48a62cc..0000000 --- a/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch +++ /dev/null @@ -1,90 +0,0 @@ -From c019f2f82b7f224e95968037f2afc16f63cc1d1d Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Thu, 28 Apr 2022 22:59:04 +0100 -Subject: [PATCH 10/10] rasdaemon: Fix for a memory out-of-bounds issue and - optimized code to remove duplicate function. - -Fixed a memory out-of-bounds issue with string pointers and -optimized code structure to remove duplicate function. - -Signed-off-by: Lei Feng -Signed-off-by: Shiju Jose ---- - non-standard-hisi_hip08.c | 6 +++--- - non-standard-hisilicon.c | 2 +- - ras-non-standard-handler.c | 16 +--------------- - 3 files changed, 5 insertions(+), 19 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 9092183..4ef47ea 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -1014,15 +1014,15 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - - static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { - { -- .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5", -+ .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", - .decode = decode_hip08_oem_type1_error, - }, - { -- .sec_type = "45534ea6ce2341158535e07ab3aef91d", -+ .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", - .decode = decode_hip08_oem_type2_error, - }, - { -- .sec_type = "b2889fc9e7d74f9da867af42e98be772", -+ .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", - .decode = decode_hip08_pcie_local_error, - }, - }; -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index d1e1774..6ee9271 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -387,7 +387,7 @@ static int decode_hisi_common_section(struct ras_events *ras, - - static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { - { -- .sec_type = "c8b328a899174af69a132e08ab2e7586", -+ .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", - .decode = decode_hisi_common_section, - }, - }; -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 6d5a6f8..6932e58 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -52,20 +52,6 @@ static char *uuid_le(const char *uu) - return uuid; - } - --static int uuid_le_cmp(const char *sec_type, const char *uuid2) --{ -- static char uuid1[32]; -- char *p = uuid1; -- int i; -- static const unsigned char le[16] = { -- 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; -- -- for (i = 0; i < 16; i++) -- p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); -- *p = 0; -- return strncmp(uuid1, uuid2, 32); --} -- - int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) - { - struct ras_ns_ev_decoder *list; -@@ -96,7 +82,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p - - ns_ev_decoder = ras_ns_ev_dec_list; - while (ns_ev_decoder) { -- if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) { -+ if (strcmp(uuid_le(sec_type), ns_ev_decoder->sec_type) == 0) { - *p_ns_ev_dec = ns_ev_decoder; - match = 1; - break; --- -2.25.1 - diff --git a/backport-rasdaemon-diskerror-fix-incomplete-diskerror-log.patch b/backport-rasdaemon-diskerror-fix-incomplete-diskerror-log.patch index da344f3..14d9405 100644 --- a/backport-rasdaemon-diskerror-fix-incomplete-diskerror-log.patch +++ b/backport-rasdaemon-diskerror-fix-incomplete-diskerror-log.patch @@ -13,7 +13,7 @@ Fix incomplete diskerror log just like block_rq_complete tracepoint output forma 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c -index b46f859..07805f7 100644 +index 638cb4d..82e8ec0 100644 --- a/ras-diskerror-handler.c +++ b/ras-diskerror-handler.c @@ -97,26 +97,32 @@ int ras_diskerror_event_handler(struct trace_seq *s, @@ -22,35 +22,35 @@ index b46f859..07805f7 100644 return -1; + trace_seq_printf(s, "%s ", ev.dev); + -+ ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1); ++ ev.rwbs = tep_get_field_raw(s, event, "rwbs", record, &len, 1); + if (!ev.rwbs) + return -1; + trace_seq_printf(s, "%s ", ev.rwbs); + -+ ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1); ++ ev.cmd = tep_get_field_raw(s, event, "cmd", record, &len, 1); + if (!ev.cmd) + return -1; + trace_seq_printf(s, "(%s) ", ev.cmd); - if (pevent_get_field_val(s, event, "sector", record, &val, 1) < 0) + if (tep_get_field_val(s, event, "sector", record, &val, 1) < 0) return -1; ev.sector = val; + trace_seq_printf(s, "%llu ", ev.sector); - if (pevent_get_field_val(s, event, "nr_sector", record, &val, 1) < 0) + if (tep_get_field_val(s, event, "nr_sector", record, &val, 1) < 0) return -1; ev.nr_sector = (unsigned int)val; + trace_seq_printf(s, "+ %u ", ev.nr_sector); - if (pevent_get_field_val(s, event, "error", record, &val, 1) < 0) + if (tep_get_field_val(s, event, "error", record, &val, 1) < 0) return -1; ev.error = get_blk_error((int)val); - -- ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1); +- ev.rwbs = tep_get_field_raw(s, event, "rwbs", record, &len, 1); - if (!ev.rwbs) - return -1; - -- ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1); +- ev.cmd = tep_get_field_raw(s, event, "cmd", record, &len, 1); - if (!ev.cmd) - return -1; + trace_seq_printf(s, "[%s]", ev.error); @@ -58,5 +58,5 @@ index b46f859..07805f7 100644 /* Insert data into the SGBD */ #ifdef HAVE_SQLITE3 -- -2.35.3 +2.30.3 diff --git a/backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch b/backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch deleted file mode 100644 index ed749fe..0000000 --- a/backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch +++ /dev/null @@ -1,34 +0,0 @@ -From ce33041e0abfa20054ff5d6874ffbd1ab592558d Mon Sep 17 00:00:00 2001 -From: Aristeu Rozanski -Date: Thu, 19 Jan 2023 08:45:57 -0500 -Subject: [PATCH] rasdaemon: ras-memory-failure-handler: handle localtime() - failure correctly - -We could just have an empty string but keeping the format could prevent -issues if someone is actually parsing this. -Found with covscan. - -v2: fixed the timestamp as pointed by Robert Elliott - -Signed-off-by: Aristeu Rozanski -Signed-off-by: Mauro Carvalho Chehab ---- - ras-memory-failure-handler.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index 9941e68..1951456 100644 ---- a/ras-memory-failure-handler.c -+++ b/ras-memory-failure-handler.c -@@ -148,6 +148,8 @@ int ras_memory_failure_event_handler(struct trace_seq *s, - if (tm) - strftime(ev.timestamp, sizeof(ev.timestamp), - "%Y-%m-%d %H:%M:%S %z", tm); -+ else -+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); - trace_seq_printf(s, "%s ", ev.timestamp); - - if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0) --- -2.27.0 - diff --git a/backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch b/backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch deleted file mode 100644 index 4c7953c..0000000 --- a/backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d Mon Sep 17 00:00:00 2001 -From: Aristeu Rozanski -Date: Thu, 19 Jan 2023 08:45:57 -0500 -Subject: [PATCH] rasdaemon: ras-report: fix possible but unlikely file - descriptor leak - -Found with covscan. - -Signed-off-by: Aristeu Rozanski -Signed-off-by: Mauro Carvalho Chehab ---- - ras-report.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/ras-report.c b/ras-report.c -index ea3a9b6..62d5eb7 100644 ---- a/ras-report.c -+++ b/ras-report.c -@@ -434,7 +434,7 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ - - mc_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -484,7 +484,7 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ - - aer_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -533,7 +533,7 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar - - non_standard_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -578,7 +578,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ - - arm_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -624,7 +624,7 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ - - mce_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -674,7 +674,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){ - - devlink_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -723,7 +723,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e - done = 1; - - diskerror_fail: -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -768,7 +768,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) - done = 1; - - mf_fail: -- if (sockfd > 0) -+ if (sockfd >= 0) - close(sockfd); - - if (done) --- -2.27.0 - diff --git a/backport-traceevent-Add-proper-KBUFFER_TYPE_TIME_STAMP-handling.patch b/backport-traceevent-Add-proper-KBUFFER_TYPE_TIME_STAMP-handling.patch deleted file mode 100644 index bbb5e00..0000000 --- a/backport-traceevent-Add-proper-KBUFFER_TYPE_TIME_STAMP-handling.patch +++ /dev/null @@ -1,128 +0,0 @@ -From 2cf6aa4bfa3b2e7efbc15741d2c0327651082223 Mon Sep 17 00:00:00 2001 -From: Tom Zanussi -Date: Thu, 2 Jul 2020 14:53:46 -0400 -Subject: tools lib traceevent: Add proper KBUFFER_TYPE_TIME_STAMP handling - -Kernel commit dc4e2801d400 (ring-buffer: Redefine the unimplemented -RINGBUF_TYPE_TIME_STAMP) changed the way the ring buffer timestamps work -- after that commit the previously unimplemented RINGBUF_TYPE_TIME_STAMP -type causes the time delta to be used as a timestamp rather than a delta -to be added to the timestamp. - -The trace-cmd code didn't get updated to handle this, so misinterprets -the event data for this case, which causes a cascade of errors, -including trace-report not being able to identify synthetic (or any -other) events generated by the histogram code (which uses TIME_STAMP -mode). For example, the following triggers along with the trace-cmd -shown cause an UNKNOWN_EVENT error and trace-cmd report crash: - - # echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > /sys/kernel/debug/tracing/synthetic_events - - # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).trace(wakeup_latency,$wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > /sys/kernel/debug/tracing/events/sched/sched_switch/trigger - # echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger - - # trace-cmd record -e wakeup_latency -e sched_wakeup -f comm==\"ping\" ping localhost -c 5 - - # trace-cmd report - CPU 0 is empty - CPU 1 is empty - CPU 2 is empty - CPU 3 is empty - CPU 5 is empty - CPU 6 is empty - CPU 7 is empty - cpus=8 - ug! no event found for type 0 - [UNKNOWN TYPE 0] - ug! no event found for type 11520 - Segmentation fault (core dumped) - -After this patch we get the correct interpretation and the events are -shown properly: - - # trace-cmd report - CPU 0 is empty - CPU 1 is empty - CPU 2 is empty - CPU 3 is empty - CPU 5 is empty - CPU 6 is empty - CPU 7 is empty - cpus=8 - -0 [004] 23284.341392: sched_wakeup: ping:12031 [120] success=1 CPU:004 - -0 [004] 23284.341464: wakeup_latency: lat=58, pid=12031, comm=ping - -0 [004] 23285.365303: sched_wakeup: ping:12031 [120] success=1 CPU:004 - -0 [004] 23285.365382: wakeup_latency: lat=64, pid=12031, comm=ping - -0 [004] 23286.389290: sched_wakeup: ping:12031 [120] success=1 CPU:004 - -0 [004] 23286.389378: wakeup_latency: lat=72, pid=12031, comm=ping - -0 [004] 23287.413213: sched_wakeup: ping:12031 [120] success=1 CPU:004 - -0 [004] 23287.413291: wakeup_latency: lat=64, pid=12031, comm=ping - -Link: http://lkml.kernel.org/r/1567628224.13841.4.camel@kernel.org -Link: http://lore.kernel.org/linux-trace-devel/20200625100516.365338-3-tz.stoyanov@gmail.com - -Signed-off-by: Tom Zanussi -[ Ported from trace-cmd.git ] -Cc: Andrew Morton -Cc: Jiri Olsa -Cc: Namhyung Kim -Cc: linux-trace-devel@vger.kernel.org -Link: http://lore.kernel.org/lkml/20200702185703.785094515@goodmis.org -Signed-off-by: Tzvetomir Stoyanov (VMware) -Signed-off-by: Steven Rostedt (VMware) -Signed-off-by: Arnaldo Carvalho de Melo ---- - libtrace/kbuffer-parse.c | 15 +++++++++------ - 1 file changed, 9 insertions(+), 6 deletions(-) - -diff --git a/libtrace/kbuffer-parse.c b/libtrace/kbuffer-parse.c -index 583db99..f1640d6 100644 ---- a/libtrace/kbuffer-parse.c -+++ b/libtrace/kbuffer-parse.c -@@ -361,6 +361,7 @@ translate_data(struct kbuffer *kbuf, void *data, void **rptr, - break; - - case KBUFFER_TYPE_TIME_EXTEND: -+ case KBUFFER_TYPE_TIME_STAMP: - extend = read_4(kbuf, data); - data += 4; - extend <<= TS_SHIFT; -@@ -369,10 +370,6 @@ translate_data(struct kbuffer *kbuf, void *data, void **rptr, - *length = 0; - break; - -- case KBUFFER_TYPE_TIME_STAMP: -- data += 12; -- *length = 0; -- break; - case 0: - *length = read_4(kbuf, data) - 4; - *length = (*length + 3) & ~3; -@@ -397,7 +394,11 @@ static unsigned int update_pointers(struct kbuffer *kbuf) - - type_len = translate_data(kbuf, ptr, &ptr, &delta, &length); - -- kbuf->timestamp += delta; -+ if (type_len == KBUFFER_TYPE_TIME_STAMP) -+ kbuf->timestamp = delta; -+ else -+ kbuf->timestamp += delta; -+ - kbuf->index = calc_index(kbuf, ptr); - kbuf->next = kbuf->index + length; - -@@ -454,7 +455,9 @@ static int __next_event(struct kbuffer *kbuf) - if (kbuf->next >= kbuf->size) - return -1; - type = update_pointers(kbuf); -- } while (type == KBUFFER_TYPE_TIME_EXTEND || type == KBUFFER_TYPE_PADDING); -+ } while (type == KBUFFER_TYPE_TIME_EXTEND || -+ type == KBUFFER_TYPE_TIME_STAMP || -+ type == KBUFFER_TYPE_PADDING); - - return 0; - } --- -cgit - diff --git a/rasdaemon-0.6.8.tar.gz b/rasdaemon-0.6.8.tar.gz deleted file mode 100644 index 54e1d4f..0000000 Binary files a/rasdaemon-0.6.8.tar.gz and /dev/null differ diff --git a/rasdaemon-0.8.0.tar.gz b/rasdaemon-0.8.0.tar.gz new file mode 100644 index 0000000..131fb17 Binary files /dev/null and b/rasdaemon-0.8.0.tar.gz differ diff --git a/rasdaemon.spec b/rasdaemon.spec index 34bee66..2934e8e 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,16 +1,17 @@ Name: rasdaemon -Version: 0.6.8 -Release: 6 +Version: 0.8.0 +Release: 1 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git Source0: https://github.com/mchehab/rasdaemon/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz ExcludeArch: s390 s390x -BuildRequires: gcc, gettext-devel, perl-generators, sqlite-devel, systemd, git, libtool +BuildRequires: gcc, gettext-devel, perl-generators, sqlite-devel, systemd, git, libtool, libtraceevent-devel Provides: bundled(kernel-event-lib) Requires: hwdata Requires: perl-DBD-SQLite +Requires: libtraceevent %ifarch %{ix86} x86_64 Requires: dmidecode %endif @@ -19,32 +20,15 @@ Requires(post): systemd Requires(preun): systemd Requires(postun): systemd -Patch6000: backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch -Patch6001: backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch -Patch6002: backport-traceevent-Add-proper-KBUFFER_TYPE_TIME_STAMP-handling.patch - Patch9000: bugfix-rasdaemon-wait-for-file-access.patch Patch9001: bugfix-fix-fd-check.patch Patch9002: bugfix-fix-disk-error-log-storm.patch -Patch9003: 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch -Patch9004: 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch -Patch9005: 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch -Patch9006: 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch -Patch9007: 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch -Patch9008: 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch -Patch9009: 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch -Patch9010: 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch -Patch9011: 0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch -Patch9012: 0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch -Patch9013: 0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch -Patch9014: 0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch -Patch9015: 0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch -Patch9016: 0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch -Patch9017: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch -Patch9018: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch -Patch9019: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch -Patch9020: backport-Check-CPUs-online-not-configured.patch -Patch9021: backport-rasdaemon-diskerror-fix-incomplete-diskerror-log.patch +Patch9003: 0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch +Patch9004: 0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch +Patch9005: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch +Patch9006: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch +Patch9007: backport-Check-CPUs-online-not-configured.patch +Patch9008: backport-rasdaemon-diskerror-fix-incomplete-diskerror-log.patch %description The rasdaemon program is a daemon which monitors the platform @@ -77,7 +61,7 @@ install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl rm INSTALL %{buildroot}/usr/include/*.h %files -%doc ChangeLog README TODO +%doc ChangeLog README.md TODO %license AUTHORS COPYING %{_sbindir}/rasdaemon %{_sbindir}/ras-mc-ctl @@ -96,6 +80,12 @@ fi /usr/bin/systemctl disable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Mon Jan 29 2024 zhuofeng - 0.8.0-1 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC:update version to 0.8.0 + * Sun Dec 31 2023 Lv Ying - 0.6.8-7 - Type:bugfix - ID:NA