!42 rasdaemon: Update software to v0.6.7

Merge pull request !42 from xujing/master
This commit is contained in:
openeuler-ci-bot 2022-01-17 09:02:02 +00:00 committed by Gitee
commit 05c6bc35d5
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
16 changed files with 98 additions and 2474 deletions

View File

@ -1,14 +1,13 @@
From 94f9581a6b398f178fcabf0fde2cce7eebb15ea7 Mon Sep 17 00:00:00 2001
From 9e2d3f84c4f158dd58bce4a30eec568331749501 Mon Sep 17 00:00:00 2001
From: Lostwayzxc <luoshengwei@huawei.com>
Date: Tue, 25 May 2021 20:05:49 +0800
Subject: [PATCH 1/2] add cpu online fault isolation
Subject: [PATCH] add cpu online fault isolation
Add CPU online fault isolation: when a CE/UCE occurs, we offline the
faulty CPU according to a threshold algorithm.
Signed-off-by: Luo Shengwei <luoshengwei@huawei.com>
---
.travis.yml | 2 +-
Makefile.am | 6 +-
configure.ac | 11 +
misc/rasdaemon.env | 17 ++
@ -19,32 +18,19 @@ Signed-off-by: Luo Shengwei <luoshengwei@huawei.com>
ras-cpu-isolation.h | 76 +++++++
ras-events.c | 8 +
ras-record.h | 5 +
11 files changed, 864 insertions(+), 2 deletions(-)
10 files changed, 863 insertions(+), 1 deletion(-)
create mode 100644 queue.c
create mode 100644 queue.h
create mode 100644 ras-cpu-isolation.c
create mode 100644 ras-cpu-isolation.h
diff --git a/.travis.yml b/.travis.yml
index 79cf4ca..5ab3957 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,7 +20,7 @@ before_install:
- sudo apt-get install -y sqlite3
install:
- autoreconf -vfi
-- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa
+- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa --enable-cpu-fault-isolation
script:
- make && sudo make install
diff --git a/Makefile.am b/Makefile.am
index f4822b9..6431dd3 100644
index fabca78..242ceb7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -57,12 +57,16 @@ endif
if WITH_MEMORY_CE_PFA
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
@@ -63,13 +63,17 @@ endif
if WITH_AMP_NS_DECODE
rasdaemon_SOURCES += non-standard-ampere.c
endif
+if WITH_CPU_FAULT_ISOLATION
+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
@ -54,19 +40,20 @@ index f4822b9..6431dd3 100644
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
- ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h
+ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h
+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ ras-cpu-isolation.h queue.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index 2d6c59c..a682bb9 100644
index 33b81fe..d098fcf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -141,6 +141,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"],
AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"])
@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"],
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
+AC_ARG_ENABLE([cpu_fault_isolation],
+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation]))
@ -81,10 +68,10 @@ index 2d6c59c..a682bb9 100644
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
@@ -173,4 +183,5 @@ compile time options summary
DEVLINK : $USE_DEVLINK
Disk I/O errors : $USE_DISKERROR
@@ -201,4 +211,5 @@ compile time options summary
Memory Failure : $USE_MEMORY_FAILURE
Memory CE PFA : $USE_MEMORY_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION
EOF
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
@ -295,13 +282,13 @@ index 0000000..9684c58
+#endif
\ No newline at end of file
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 2f170e2..10d0099 100644
index 1149dc6..a64f20b 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -20,6 +20,44 @@
#include "ras-record.h"
#include "ras-logger.h"
@@ -22,6 +22,44 @@
#include "ras-report.h"
#include "ras-non-standard-handler.h"
#include "non-standard-ampere.h"
+#include "ras-cpu-isolation.h"
+
+#ifdef HAVE_CPU_FAULT_ISOLATION
@ -341,11 +328,11 @@ index 2f170e2..10d0099 100644
+}
+#endif
int ras_arm_event_handler(struct trace_seq *s,
struct pevent_record *record,
@@ -78,6 +116,41 @@ int ras_arm_event_handler(struct trace_seq *s,
ev.psci_state = val;
trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
void display_raw_data(struct trace_seq *s,
const uint8_t *buf,
@@ -139,6 +177,41 @@ int ras_arm_event_handler(struct trace_seq *s,
display_raw_data(s, ev.vsei_error, ev.oem_len);
#endif
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ /* record cpu error */
@ -387,7 +374,7 @@ index 2f170e2..10d0099 100644
ras_store_arm_record(ras, &ev);
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
new file mode 100644
index 0000000..a809f91
index 0000000..6dcff70
--- /dev/null
+++ b/ras-cpu-isolation.c
@@ -0,0 +1,499 @@
@ -974,10 +961,10 @@ index 0000000..a7d3fdb
+#endif
\ No newline at end of file
diff --git a/ras-events.c b/ras-events.c
index 471d25d..31c4170 100644
index ba769d1..00938e6 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -40,6 +40,7 @@
@@ -41,6 +41,7 @@
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
@ -985,7 +972,7 @@ index 471d25d..31c4170 100644
/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
@@ -874,6 +875,10 @@ int handle_ras_events(int record_events)
@@ -879,6 +880,10 @@ int handle_ras_events(int record_events)
cpus = get_num_cpus(ras);
@ -996,7 +983,7 @@ index 471d25d..31c4170 100644
#ifdef HAVE_MCE
rc = register_mce_handler(ras, cpus);
if (rc)
@@ -990,6 +995,9 @@ err:
@@ -1005,6 +1010,9 @@ err:
}
free(ras);
}
@ -1007,13 +994,13 @@ index 471d25d..31c4170 100644
return rc;
}
diff --git a/ras-record.h b/ras-record.h
index cc217a9..b453f83 100644
index d9f7733..efaffa5 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -77,6 +77,11 @@ struct ras_arm_event {
int64_t midr;
int32_t running_state;
int32_t psci_state;
@@ -83,6 +83,11 @@ struct ras_arm_event {
uint32_t ctx_len;
const uint8_t *vsei_error;
uint32_t oem_len;
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ const char *severity;
+ const uint8_t *error_info;

View File

@ -52,9 +52,9 @@ index 549c494..33d4741 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = {
{ .name="mpidr", .type="INTEGER" },
{ .name="running_state", .type="INTEGER" },
{ .name="psci_state", .type="INTEGER" },
{ .name="err_info", .type="BLOB" },
{ .name="context_info", .type="BLOB" },
{ .name="vendor_info", .type="BLOB" },
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ { .name="severity", .type="TEXT" },
+ { .name="error_info", .type="BLOB" },
@ -63,9 +63,9 @@ index 549c494..33d4741 100644
static const struct db_table_descriptor arm_event_tab = {
@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr);
sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state);
sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state);
ev->ctx_error, ev->ctx_len, NULL);
sqlite3_bind_blob (priv->stmt_arm_record, 9,
ev->vsei_error, ev->oem_len, NULL);
+#ifdef HAVE_CPU_FAULT_ISOLATION
+ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL);
+ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL);

View File

@ -1,104 +0,0 @@
From e4d27840e173491ab29c2d97017da9344e2c2526 Mon Sep 17 00:00:00 2001
From: lvying <lvying6@huawei.com>
Date: Sat, 31 Oct 2020 17:57:14 +0800
Subject: [PATCH 1/2] ras-page-isolation: do_page_offline always considers page
offline was successful
do_page_offline always considers the page offline to be successful, even if
the kernel's soft/hard page offline failed.
Calling rasdaemon with:
/etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1"
i.e. when a Corrected Error occurs at a page's address, rasdaemon should
trigger a soft offline of this page.
However, after adding a livepatch into kernel's
store_soft_offline_page to observe this function's return value,
when injecting a CE into address 0x3f7ec30000, the kernel
log reports:
soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 ()
[store_soft_offline_page]return from soft_offline_page: -5
While rasdaemon log reports:
rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold
rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined
Using strace to record rasdaemon's system calls, it reports:
strace -p 73711
openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page",
O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28
fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0
write(28, "0x3f7ec30000", 12) = -1 EIO (Input/output error)
close(28) = 0
So the kernel actually failed to soft-offline pfn 0x3f7ec30, and
store_soft_offline_page returned -EIO. However, rasdaemon always
considers the page offline to be successful.
According to the strace output, ferror was unable to detect the
failure of the write syscall.
This patch replaces the fopen-fprintf-ferror-fclose sequence with
lower-level I/O, using open-write-close instead, which
can detect such a syscall failure.
Signed-off-by: lvying <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-page-isolation.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
index 50e4406..dc07545 100644
--- a/ras-page-isolation.c
+++ b/ras-page-isolation.c
@@ -17,6 +17,9 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
#include "ras-logger.h"
#include "ras-page-isolation.h"
@@ -210,18 +213,22 @@ void ras_page_account_init(void)
static int do_page_offline(unsigned long long addr, enum otype type)
{
- FILE *offline_file;
- int err;
+ int fd, rc;
+ char buf[20];
- offline_file = fopen(kernel_offline[type], "w");
- if (!offline_file)
+ fd = open(kernel_offline[type], O_WRONLY);
+ if (fd == -1) {
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]);
return -1;
+ }
- fprintf(offline_file, "%#llx", addr);
- err = ferror(offline_file) ? -1 : 0;
- fclose(offline_file);
-
- return err;
+ sprintf(buf, "%#llx", addr);
+ rc = write(fd, buf, strlen(buf));
+ if (rc < 0) {
+ log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno);
+ }
+ close(fd);
+ return rc;
}
static void page_offline(struct page_record *pr)
--
2.18.4

View File

@ -1,785 +0,0 @@
From 1c085f983f01ec09e5b0dd67dbb8b4afa89e7300 Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Mon, 10 Aug 2020 15:42:56 +0100
Subject: [PATCH] rasdaemon: Modify non-standard error decoding interface using
linked list
Replace the current non-standard error decoding interface with the
interface based on the linked list to avoid using realloc and
to improve the interface.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
non-standard-hisi_hip08.c | 114 +++++++++++++++++-----------------
non-standard-hisilicon.c | 46 +++++++-------
non-standard-hisilicon.h | 4 +-
ras-non-standard-handler.c | 122 ++++++++++++++++++++-----------------
ras-non-standard-handler.h | 13 ++--
5 files changed, 155 insertions(+), 144 deletions(-)
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
index 2197f81..ebf03e1 100644
--- a/non-standard-hisi_hip08.c
+++ b/non-standard-hisi_hip08.c
@@ -528,7 +528,7 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = {
#endif
#define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end))
-static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
+static void decode_oem_type1_err_hdr(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct hisi_oem_type1_err_sec *err)
{
@@ -537,26 +537,26 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
char *end = buf + HISI_BUF_LEN;
p += snprintf(p, end - p, "[ table_version=%d ", err->version);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE1_FIELD_VERSION, err->version, NULL);
if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE1_FIELD_SOC_ID,
err->soc_id, NULL);
}
if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE1_FIELD_SOCKET_ID,
err->socket_id, NULL);
}
if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE1_FIELD_NIMBUS_ID,
err->nimbus_id, NULL);
}
@@ -566,7 +566,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
err->module_id);
p += snprintf(p, end - p, "module=%s ", str);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE1_FIELD_MODULE_ID,
0, str);
}
@@ -578,7 +578,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
err->sub_module_id);
p += snprintf(p, end - p, "submodule=%s ", str);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE1_FIELD_SUB_MODULE_ID,
0, str);
}
@@ -587,7 +587,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "error_severity=%s ",
err_severity(err->err_severity));
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE1_FIELD_ERR_SEV,
0, err_severity(err->err_severity));
}
@@ -598,7 +598,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
trace_seq_printf(s, "%s\n", buf);
}
-static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab,
+static void decode_oem_type1_err_regs(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct hisi_oem_type1_err_sec *err)
{
@@ -649,14 +649,14 @@ static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab,
*p = '\0';
}
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE1_FIELD_REGS_DUMP, 0, buf);
- step_vendor_data_tab(dec_tab, "hip08_oem_type1_event_tab");
+ step_vendor_data_tab(ev_decoder, "hip08_oem_type1_event_tab");
}
/* error data decoding functions */
static int decode_hip08_oem_type1_error(struct ras_events *ras,
- struct ras_ns_dec_tab *dec_tab,
+ struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
struct ras_non_standard_event *event)
{
@@ -670,8 +670,8 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras,
}
#ifdef HAVE_SQLITE3
- if (!dec_tab->stmt_dec_record) {
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
+ if (!ev_decoder->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
&hip08_oem_type1_event_tab)
!= SQLITE_OK) {
trace_seq_printf(s,
@@ -680,18 +680,18 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras,
}
}
#endif
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE1_FIELD_TIMESTAMP,
0, event->timestamp);
trace_seq_printf(s, "\nHISI HIP08: OEM Type-1 Error\n");
- decode_oem_type1_err_hdr(dec_tab, s, err);
- decode_oem_type1_err_regs(dec_tab, s, err);
+ decode_oem_type1_err_hdr(ev_decoder, s, err);
+ decode_oem_type1_err_regs(ev_decoder, s, err);
return 0;
}
-static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
+static void decode_oem_type2_err_hdr(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct hisi_oem_type2_err_sec *err)
{
@@ -700,26 +700,26 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
char *end = buf + HISI_BUF_LEN;
p += snprintf(p, end - p, "[ table_version=%d ", err->version);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE2_FIELD_VERSION, err->version, NULL);
if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE2_FIELD_SOC_ID,
err->soc_id, NULL);
}
if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE2_FIELD_SOCKET_ID,
err->socket_id, NULL);
}
if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_OEM_TYPE2_FIELD_NIMBUS_ID,
err->nimbus_id, NULL);
}
@@ -729,7 +729,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
err->module_id);
p += snprintf(p, end - p, "module=%s ", str);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE2_FIELD_MODULE_ID,
0, str);
}
@@ -741,7 +741,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
err->sub_module_id);
p += snprintf(p, end - p, "submodule=%s ", str);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE2_FIELD_SUB_MODULE_ID,
0, str);
}
@@ -750,7 +750,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "error_severity=%s ",
err_severity(err->err_severity));
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE2_FIELD_ERR_SEV,
0, err_severity(err->err_severity));
}
@@ -761,7 +761,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
trace_seq_printf(s, "%s\n", buf);
}
-static void decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab,
+static void decode_oem_type2_err_regs(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct hisi_oem_type2_err_sec *err)
{
@@ -822,13 +822,13 @@ static void decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab,
*p = '\0';
}
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE2_FIELD_REGS_DUMP, 0, buf);
- step_vendor_data_tab(dec_tab, "hip08_oem_type2_event_tab");
+ step_vendor_data_tab(ev_decoder, "hip08_oem_type2_event_tab");
}
static int decode_hip08_oem_type2_error(struct ras_events *ras,
- struct ras_ns_dec_tab *dec_tab,
+ struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
struct ras_non_standard_event *event)
{
@@ -842,8 +842,8 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras,
}
#ifdef HAVE_SQLITE3
- if (!dec_tab->stmt_dec_record) {
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
+ if (!ev_decoder->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
&hip08_oem_type2_event_tab) != SQLITE_OK) {
trace_seq_printf(s,
"create sql hip08_oem_type2_event_tab fail\n");
@@ -851,18 +851,18 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras,
}
}
#endif
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_OEM_TYPE2_FIELD_TIMESTAMP,
0, event->timestamp);
trace_seq_printf(s, "\nHISI HIP08: OEM Type-2 Error\n");
- decode_oem_type2_err_hdr(dec_tab, s, err);
- decode_oem_type2_err_regs(dec_tab, s, err);
+ decode_oem_type2_err_hdr(ev_decoder, s, err);
+ decode_oem_type2_err_regs(ev_decoder, s, err);
return 0;
}
-static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
+static void decode_pcie_local_err_hdr(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct hisi_pcie_local_err_sec *err)
{
@@ -871,14 +871,14 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
char *end = buf + HISI_BUF_LEN;
p += snprintf(p, end - p, "[ table_version=%d ", err->version);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_VERSION,
err->version, NULL);
if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOC_ID &&
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_SOC_ID,
err->soc_id, NULL);
}
@@ -886,7 +886,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOCKET_ID &&
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_SOCKET_ID,
err->socket_id, NULL);
}
@@ -894,7 +894,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
if (err->val_bits & HISI_PCIE_LOCAL_VALID_NIMBUS_ID &&
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_NIMBUS_ID,
err->nimbus_id, NULL);
}
@@ -903,7 +903,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "submodule=%s ",
pcie_local_sub_module_name(err->sub_module_id));
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_PCIE_LOCAL_FIELD_SUB_MODULE_ID,
0, pcie_local_sub_module_name(err->sub_module_id));
}
@@ -911,7 +911,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
if (err->val_bits & HISI_PCIE_LOCAL_VALID_CORE_ID &&
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "core_ID=core%d ", err->core_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_CORE_ID,
err->core_id, NULL);
}
@@ -919,7 +919,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
if (err->val_bits & HISI_PCIE_LOCAL_VALID_PORT_ID &&
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "port_ID=port%d ", err->port_id);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_PORT_ID,
err->port_id, NULL);
}
@@ -928,7 +928,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "error_severity=%s ",
err_severity(err->err_severity));
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_PCIE_LOCAL_FIELD_ERR_SEV,
0, err_severity(err->err_severity));
}
@@ -936,7 +936,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_TYPE &&
IN_RANGE(p, buf, end)) {
p += snprintf(p, end - p, "error_type=0x%x ", err->err_type);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
HIP08_PCIE_LOCAL_FIELD_ERR_TYPE,
err->err_type, NULL);
}
@@ -947,7 +947,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
trace_seq_printf(s, "%s\n", buf);
}
-static void decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab,
+static void decode_pcie_local_err_regs(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct hisi_pcie_local_err_sec *err)
{
@@ -972,13 +972,13 @@ static void decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab,
*p = '\0';
}
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_PCIE_LOCAL_FIELD_REGS_DUMP, 0, buf);
- step_vendor_data_tab(dec_tab, "hip08_pcie_local_event_tab");
+ step_vendor_data_tab(ev_decoder, "hip08_pcie_local_event_tab");
}
static int decode_hip08_pcie_local_error(struct ras_events *ras,
- struct ras_ns_dec_tab *dec_tab,
+ struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
struct ras_non_standard_event *event)
{
@@ -992,8 +992,8 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
}
#ifdef HAVE_SQLITE3
- if (!dec_tab->stmt_dec_record) {
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
+ if (!ev_decoder->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
&hip08_pcie_local_event_tab) != SQLITE_OK) {
trace_seq_printf(s,
"create sql hip08_pcie_local_event_tab fail\n");
@@ -1001,18 +1001,18 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
}
}
#endif
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HIP08_PCIE_LOCAL_FIELD_TIMESTAMP,
0, event->timestamp);
trace_seq_printf(s, "\nHISI HIP08: PCIe local error\n");
- decode_pcie_local_err_hdr(dec_tab, s, err);
- decode_pcie_local_err_regs(dec_tab, s, err);
+ decode_pcie_local_err_hdr(ev_decoder, s, err);
+ decode_pcie_local_err_regs(ev_decoder, s, err);
return 0;
}
-struct ras_ns_dec_tab hip08_ns_oem_tab[] = {
+static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = {
{
.sec_type = "1f8161e155d641e6bd107afd1dc5f7c5",
.decode = decode_hip08_oem_type1_error,
@@ -1025,10 +1025,12 @@ struct ras_ns_dec_tab hip08_ns_oem_tab[] = {
.sec_type = "b2889fc9e7d74f9da867af42e98be772",
.decode = decode_hip08_pcie_local_error,
},
- { /* sentinel */ }
};
static void __attribute__((constructor)) hip08_init(void)
{
- register_ns_dec_tab(hip08_ns_oem_tab);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hip08_ns_ev_decoder); i++)
+ register_ns_ev_decoder(&hip08_ns_ev_decoder[i]);
}
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
index c9e1fa9..a6f5e78 100644
--- a/non-standard-hisilicon.c
+++ b/non-standard-hisilicon.c
@@ -73,38 +73,38 @@ struct hisi_event {
};
#ifdef HAVE_SQLITE3
-void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder,
enum hisi_oem_data_type data_type,
int id, int64_t data, const char *text)
{
switch (data_type) {
case HISI_OEM_DATA_TYPE_INT:
- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data);
+ sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data);
break;
case HISI_OEM_DATA_TYPE_INT64:
- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data);
+ sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data);
break;
case HISI_OEM_DATA_TYPE_TEXT:
- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL);
+ sqlite3_bind_text(ev_decoder->stmt_dec_record, id, text, -1, NULL);
break;
}
}
-int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
{
int rc;
- rc = sqlite3_step(dec_tab->stmt_dec_record);
+ rc = sqlite3_step(ev_decoder->stmt_dec_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed to do %s step on sqlite: error = %d\n", name, rc);
- rc = sqlite3_reset(dec_tab->stmt_dec_record);
+ rc = sqlite3_reset(ev_decoder->stmt_dec_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed to reset %s on sqlite: error = %d\n", name, rc);
- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record);
+ rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed to clear bindings %s on sqlite: error = %d\n",
@@ -113,12 +113,12 @@ int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
return rc;
}
#else
-void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder,
enum hisi_oem_data_type data_type,
int id, int64_t data, const char *text)
{ }
-int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
{
return 0;
}
@@ -197,7 +197,7 @@ static void decode_module(struct hisi_event *event, uint8_t module_id)
HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
}
-static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab,
+static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
const struct hisi_common_error_section *err,
struct hisi_event *event)
{
@@ -244,7 +244,7 @@ static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab,
}
static int decode_hisi_common_section(struct ras_events *ras,
- struct ras_ns_dec_tab *dec_tab,
+ struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
struct ras_non_standard_event *event)
{
@@ -253,8 +253,8 @@ static int decode_hisi_common_section(struct ras_events *ras,
struct hisi_event hevent;
#ifdef HAVE_SQLITE3
- if (ras->record_events && !dec_tab->stmt_dec_record) {
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
+ if (ras->record_events && !ev_decoder->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
&hisi_common_section_tab) != SQLITE_OK) {
trace_seq_printf(s, "create sql hisi_common_section_tab fail\n");
return -1;
@@ -264,7 +264,7 @@ static int decode_hisi_common_section(struct ras_events *ras,
memset(&hevent, 0, sizeof(struct hisi_event));
trace_seq_printf(s, "\nHisilicon Common Error Section:\n");
- decode_hisi_common_section_hdr(dec_tab, err, &hevent);
+ decode_hisi_common_section_hdr(ev_decoder, err, &hevent);
trace_seq_printf(s, "%s\n", hevent.error_msg);
if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) {
@@ -280,28 +280,30 @@ static int decode_hisi_common_section(struct ras_events *ras,
}
if (ras->record_events) {
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HISI_COMMON_FIELD_TIMESTAMP,
0, event->timestamp);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
- step_vendor_data_tab(dec_tab, "hisi_common_section_tab");
+ step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
}
return 0;
}
-struct ras_ns_dec_tab hisi_section_ns_tab[] = {
+static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = {
{
.sec_type = "c8b328a899174af69a132e08ab2e7586",
.decode = decode_hisi_common_section,
},
- { /* sentinel */ }
};
static void __attribute__((constructor)) hisi_ns_init(void)
{
- register_ns_dec_tab(hisi_section_ns_tab);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++)
+ register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]);
}
diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h
index 1ce210a..75b911e 100644
--- a/non-standard-hisilicon.h
+++ b/non-standard-hisilicon.h
@@ -41,9 +41,9 @@ static inline char *err_severity(uint8_t err_sev)
return "unknown";
}
-void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder,
enum hisi_oem_data_type data_type,
int id, int64_t data, const char *text);
-int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name);
+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name);
#endif
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
index d92fd42..1862335 100644
--- a/ras-non-standard-handler.c
+++ b/ras-non-standard-handler.c
@@ -22,46 +22,7 @@
#include "ras-logger.h"
#include "ras-report.h"
-static p_ns_dec_tab * ns_dec_tab;
-static size_t dec_tab_count;
-
-int register_ns_dec_tab(const p_ns_dec_tab tab)
-{
- ns_dec_tab = (p_ns_dec_tab *)realloc(ns_dec_tab,
- (dec_tab_count + 1) * sizeof(tab));
- if (ns_dec_tab == NULL) {
- printf("%s p_ns_dec_tab malloc failed", __func__);
- return -1;
- }
- ns_dec_tab[dec_tab_count] = tab;
- dec_tab_count++;
- return 0;
-}
-
-void unregister_ns_dec_tab(void)
-{
- if (ns_dec_tab) {
-#ifdef HAVE_SQLITE3
- p_ns_dec_tab dec_tab;
- int i, count;
-
- for (count = 0; count < dec_tab_count; count++) {
- dec_tab = ns_dec_tab[count];
- for (i = 0; dec_tab[i].decode; i++) {
- if (dec_tab[i].stmt_dec_record) {
- ras_mc_finalize_vendor_table(
- dec_tab[i].stmt_dec_record);
- dec_tab[i].stmt_dec_record = NULL;
- }
- }
- }
-#endif
-
- free(ns_dec_tab);
- ns_dec_tab = NULL;
- dec_tab_count = 0;
- }
-}
+static struct ras_ns_ev_decoder *ras_ns_ev_dec_list;
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
@@ -105,18 +66,75 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2)
return strncmp(uuid1, uuid2, 32);
}
+int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder)
+{
+ struct ras_ns_ev_decoder *list;
+
+ if (!ns_ev_decoder)
+ return -1;
+
+ ns_ev_decoder->next = NULL;
+ ns_ev_decoder->stmt_dec_record = NULL;
+ if (!ras_ns_ev_dec_list) {
+ ras_ns_ev_dec_list = ns_ev_decoder;
+ } else {
+ list = ras_ns_ev_dec_list;
+ while (list->next)
+ list = list->next;
+ list->next = ns_ev_decoder;
+ }
+
+ return 0;
+}
+
+static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p_ns_ev_dec)
+{
+ struct ras_ns_ev_decoder *ns_ev_decoder;
+ int match = 0;
+
+ ns_ev_decoder = ras_ns_ev_dec_list;
+ while (ns_ev_decoder) {
+ if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) {
+ *p_ns_ev_dec = ns_ev_decoder;
+ match = 1;
+ break;
+ }
+ ns_ev_decoder = ns_ev_decoder->next;
+ }
+
+ if (!match)
+ return -1;
+
+ return 0;
+}
+
+static void unregister_ns_ev_decoder(void)
+{
+#ifdef HAVE_SQLITE3
+ struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list;
+
+ while (ns_ev_decoder) {
+ if (ns_ev_decoder->stmt_dec_record) {
+ ras_mc_finalize_vendor_table(ns_ev_decoder->stmt_dec_record);
+ ns_ev_decoder->stmt_dec_record = NULL;
+ }
+ ns_ev_decoder = ns_ev_decoder->next;
+ }
+#endif
+ ras_ns_ev_dec_list = NULL;
+}
+
int ras_non_standard_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context)
{
- int len, i, line_count, count;
+ int len, i, line_count;
unsigned long long val;
struct ras_events *ras = context;
time_t now;
struct tm *tm;
struct ras_non_standard_event ev;
- p_ns_dec_tab dec_tab;
- bool dec_done = false;
+ struct ras_ns_ev_decoder *ns_ev_decoder;
/*
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
@@ -177,19 +195,9 @@ int ras_non_standard_event_handler(struct trace_seq *s,
if(!ev.error)
return -1;
- for (count = 0; count < dec_tab_count && !dec_done; count++) {
- dec_tab = ns_dec_tab[count];
- for (i = 0; dec_tab[i].decode; i++) {
- if (uuid_le_cmp(ev.sec_type,
- dec_tab[i].sec_type) == 0) {
- dec_tab[i].decode(ras, &dec_tab[i], s, &ev);
- dec_done = true;
- break;
- }
- }
- }
-
- if (!dec_done) {
+ if (!find_ns_ev_decoder(ev.sec_type, &ns_ev_decoder)) {
+ ns_ev_decoder->decode(ras, ns_ev_decoder, s, &ev);
+ } else {
len = ev.length;
i = 0;
line_count = 0;
@@ -222,5 +230,5 @@ int ras_non_standard_event_handler(struct trace_seq *s,
__attribute__((destructor))
static void ns_exit(void)
{
- unregister_ns_dec_tab();
+ unregister_ns_ev_decoder();
}
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
index 2b9bf40..57d4cb5 100644
--- a/ras-non-standard-handler.h
+++ b/ras-non-standard-handler.h
@@ -20,15 +20,16 @@
#define BIT(nr) (1UL << (nr))
#define BIT_ULL(nr) (1ULL << (nr))
-typedef struct ras_ns_dec_tab {
+struct ras_ns_ev_decoder {
+ struct ras_ns_ev_decoder *next;
const char *sec_type;
- int (*decode)(struct ras_events *ras, struct ras_ns_dec_tab *dec_tab,
+ int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s, struct ras_non_standard_event *event);
#ifdef HAVE_SQLITE3
#include <sqlite3.h>
sqlite3_stmt *stmt_dec_record;
#endif
-} *p_ns_dec_tab;
+};
int ras_non_standard_event_handler(struct trace_seq *s,
struct pevent_record *record,
@@ -37,11 +38,9 @@ int ras_non_standard_event_handler(struct trace_seq *s,
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index);
#ifdef HAVE_NON_STANDARD
-int register_ns_dec_tab(const p_ns_dec_tab tab);
-void unregister_ns_dec_tab(void);
+int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder);
#else
-static inline int register_ns_dec_tab(const p_ns_dec_tab tab) { return 0; };
-static inline void unregister_ns_dec_tab(void) { return; };
+static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
#endif
#endif
--
2.33.0

View File

@ -1,63 +0,0 @@
From b98880e2cf5fd15e4261676760b719963b956a0e Mon Sep 17 00:00:00 2001
From: Xiaofei Tan <tanxiaofei@huawei.com>
Date: Mon, 27 Jul 2020 15:38:37 +0800
Subject: [PATCH 1/3] rasdaemon: delete the duplicate code about the definition
of hip08 DB fields
Delete the duplicate definition of the DB fields for hip08 OEM
event format1 and format2, because the two OEM event formats are the same.
Signed-off-By: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
non-standard-hisi_hip08.c | 23 +++++------------------
1 file changed, 5 insertions(+), 18 deletions(-)
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
index 8bf10c1..7fc6939 100644
--- a/non-standard-hisi_hip08.c
+++ b/non-standard-hisi_hip08.c
@@ -504,7 +504,7 @@ static char *pcie_local_sub_module_name(uint8_t id)
}
#ifdef HAVE_SQLITE3
-static const struct db_fields hip08_oem_type1_event_fields[] = {
+static const struct db_fields hip08_oem_event_fields[] = {
{ .name = "id", .type = "INTEGER PRIMARY KEY" },
{ .name = "timestamp", .type = "TEXT" },
{ .name = "version", .type = "INTEGER" },
@@ -519,27 +519,14 @@ static const struct db_fields hip08_oem_type1_event_fields[] = {
static const struct db_table_descriptor hip08_oem_type1_event_tab = {
.name = "hip08_oem_type1_event_v2",
- .fields = hip08_oem_type1_event_fields,
- .num_fields = ARRAY_SIZE(hip08_oem_type1_event_fields),
-};
-
-static const struct db_fields hip08_oem_type2_event_fields[] = {
- { .name = "id", .type = "INTEGER PRIMARY KEY" },
- { .name = "timestamp", .type = "TEXT" },
- { .name = "version", .type = "INTEGER" },
- { .name = "soc_id", .type = "INTEGER" },
- { .name = "socket_id", .type = "INTEGER" },
- { .name = "nimbus_id", .type = "INTEGER" },
- { .name = "module_id", .type = "TEXT" },
- { .name = "sub_module_id", .type = "TEXT" },
- { .name = "err_severity", .type = "TEXT" },
- { .name = "regs_dump", .type = "TEXT" },
+ .fields = hip08_oem_event_fields,
+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields),
};
static const struct db_table_descriptor hip08_oem_type2_event_tab = {
.name = "hip08_oem_type2_event_v2",
- .fields = hip08_oem_type2_event_fields,
- .num_fields = ARRAY_SIZE(hip08_oem_type2_event_fields),
+ .fields = hip08_oem_event_fields,
+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields),
};
static const struct db_fields hip08_pcie_local_event_fields[] = {
--
2.7.4

View File

@ -1,44 +0,0 @@
From c329012ce4b44af08217f2a8f2b3b9b1b4b1c0d3 Mon Sep 17 00:00:00 2001
From: lvying6 <lvying6@huawei.com>
Date: Sat, 31 Oct 2020 17:57:15 +0800
Subject: [PATCH 2/2] ras-page-isolation: page which is PAGE_OFFLINE_FAILED can
be offlined again
The OS may have failed to offline a page on a previous attempt. After some time,
the page's state may change so that the page can be offlined by the OS.
If the correctable errors on this page then reach the threshold,
rasdaemon should try to offline this page again.
Signed-off-by: lvying6 <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-page-isolation.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
index dc07545..fd7bd70 100644
--- a/ras-page-isolation.c
+++ b/ras-page-isolation.c
@@ -237,12 +237,17 @@ static void page_offline(struct page_record *pr)
int ret;
/* Offlining page is not required */
- if (offline <= OFFLINE_ACCOUNT)
+ if (offline <= OFFLINE_ACCOUNT) {
+ log(TERM, LOG_INFO, "PAGE_CE_ACTION=%s, ignore to offline page at %#llx\n",
+ offline_choice[offline].name, addr);
return;
+ }
/* Ignore offlined pages */
- if (pr->offlined != PAGE_ONLINE)
+ if (pr->offlined == PAGE_OFFLINE) {
+ log(TERM, LOG_INFO, "page at %#llx is already offlined, ignore\n", addr);
return;
+ }
/* Time to silence this noisy page */
if (offline == OFFLINE_SOFT_THEN_HARD) {
--
2.18.4

View File

@ -1,190 +0,0 @@
From 6ee76565274f31052868e970bce8768c314f6bb7 Mon Sep 17 00:00:00 2001
From: Xiaofei Tan <tanxiaofei@huawei.com>
Date: Mon, 27 Jul 2020 15:38:38 +0800
Subject: [PATCH 2/3] rasdaemon: delete the code of non-standard error decoder
for hip07
Delete the non-standard error decoder code for hip07, which was never
used because the corresponding code in the Linux kernel wasn't accepted.
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 2 +-
non-standard-hisi_hip07.c | 151 ----------------------------------------------
2 files changed, 1 insertion(+), 152 deletions(-)
delete mode 100644 non-standard-hisi_hip07.c
diff --git a/Makefile.am b/Makefile.am
index 51ef4de..23b4d60 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
if WITH_HISI_NS_DECODE
- rasdaemon_SOURCES += non-standard-hisi_hip07.c non-standard-hisi_hip08.c
+ rasdaemon_SOURCES += non-standard-hisi_hip08.c
endif
if WITH_MEMORY_CE_PFA
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
diff --git a/non-standard-hisi_hip07.c b/non-standard-hisi_hip07.c
deleted file mode 100644
index 09ddcb2..0000000
--- a/non-standard-hisi_hip07.c
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2017 Hisilicon Limited.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "ras-record.h"
-#include "ras-logger.h"
-#include "ras-report.h"
-#include "ras-non-standard-handler.h"
-
-/* common definitions */
-
-/* HISI SAS definitions */
-#define HISI_SAS_VALID_PA BIT(0)
-#define HISI_SAS_VALID_MB_ERR BIT(1)
-#define HISI_SAS_VALID_ERR_TYPE BIT(2)
-#define HISI_SAS_VALID_AXI_ERR_INFO BIT(3)
-
-struct hisi_sas_err_sec {
- uint64_t val_bits;
- uint64_t physical_addr;
- uint32_t mb;
- uint32_t type;
- uint32_t axi_err_info;
-};
-
-/* Common Functions */
-static char *err_bit_type(int etype)
-{
- switch (etype) {
- case 0x0: return "single-bit ecc";
- case 0x1: return "multi-bit ecc";
- }
- return "unknown error";
-}
-
-/* SAS Functions */
-static char *sas_err_type(int etype)
-{
- switch (etype) {
- case 0x0001: return "hgc_dqe ecc";
- case 0x0002: return "hgc_iost ecc";
- case 0x0004: return "hgc_itct ecc";
- case 0x0008: return "hgc_iostl ecc";
- case 0x0010: return "hgc_itctl ecc";
- case 0x0020: return "hgc_cqe ecc";
- case 0x0040: return "rxm_mem0 ecc";
- case 0x0080: return "rxm_mem1 ecc";
- case 0x0100: return "rxm_mem2 ecc";
- case 0x0200: return "rxm_mem3 ecc";
- case 0x0400: return "wp_depth";
- case 0x0800: return "iptt_slot_no_match";
- case 0x1000: return "rp_depth";
- case 0x2000: return "axi err";
- case 0x4000: return "fifo err";
- case 0x8000: return "lm_add_fetch_list";
- case 0x10000: return "hgc_abt_fetch_lm";
- }
- return "unknown error";
-}
-
-static char *sas_axi_err_type(int etype)
-{
- switch (etype) {
- case 0x0001: return "IOST_AXI_W_ERR";
- case 0x0002: return "IOST_AXI_R_ERR";
- case 0x0004: return "ITCT_AXI_W_ERR";
- case 0x0008: return "ITCT_AXI_R_ERR";
- case 0x0010: return "SATA_AXI_W_ERR";
- case 0x0020: return "SATA_AXI_R_ERR";
- case 0x0040: return "DQE_AXI_R_ERR";
- case 0x0080: return "CQE_AXI_W_ERR";
- case 0x0100: return "CQE_WINFO_FIFO";
- case 0x0200: return "CQE_MSG_FIFIO";
- case 0x0400: return "GETDQE_FIFO";
- case 0x0800: return "CMDP_FIFO";
- case 0x1000: return "AWTCTRL_FIFO";
- }
- return "unknown error";
-}
-
-static int decode_hip07_sas_error(struct ras_events *ras,
- struct ras_ns_dec_tab *dec_tab,
- struct trace_seq *s,
- struct ras_non_standard_event *event)
-{
- char buf[1024];
- char *p = buf;
- const struct hisi_sas_err_sec *err =
- (struct hisi_sas_err_sec *)event->error;
-
- if (err->val_bits == 0) {
- trace_seq_printf(s, "%s: no valid error data\n",
- __func__);
- return -1;
- }
- p += sprintf(p, "[");
- if (err->val_bits & HISI_SAS_VALID_PA)
- p += sprintf(p, "phy addr = 0x%p: ",
- (void *)err->physical_addr);
-
- if (err->val_bits & HISI_SAS_VALID_MB_ERR)
- p += sprintf(p, "%s: ", err_bit_type(err->mb));
-
- if (err->val_bits & HISI_SAS_VALID_ERR_TYPE)
- p += sprintf(p, "error type = %s: ",
- sas_err_type(err->type));
-
- if (err->val_bits & HISI_SAS_VALID_AXI_ERR_INFO)
- p += sprintf(p, "axi error type = %s",
- sas_axi_err_type(err->axi_err_info));
-
- p += sprintf(p, "]");
-
- trace_seq_printf(s, "\nHISI HIP07: SAS error: %s\n", buf);
- return 0;
-}
-
-static int decode_hip07_hns_error(struct ras_events *ras,
- struct ras_ns_dec_tab *dec_tab,
- struct trace_seq *s,
- struct ras_non_standard_event *event)
-{
- return 0;
-}
-
-struct ras_ns_dec_tab hisi_ns_dec_tab[] = {
- {
- .sec_type = "daffd8146eba4d8c8a91bc9bbf4aa301",
- .decode = decode_hip07_sas_error,
- },
- {
- .sec_type = "fbc2d923ea7a453dab132949f5af9e53",
- .decode = decode_hip07_hns_error,
- },
- { /* sentinel */ }
-};
-
-__attribute__((constructor))
-static void hip07_init(void)
-{
- register_ns_dec_tab(hisi_ns_dec_tab);
-}
--
2.7.4

View File

@ -1,527 +0,0 @@
From 8c30a852493a6204ded59872bb3a0f0e43537713 Mon Sep 17 00:00:00 2001
From: Xiaofei Tan <tanxiaofei@huawei.com>
Date: Mon, 27 Jul 2020 15:38:39 +0800
Subject: [PATCH 3/3] rasdaemon: add support for hisilicon common section
decoder
Add a new non-standard error section, Hisilicon common section.
It is defined for the next generation SoC Kunpeng930. It also supports
Kunpeng920 and some modules of Kunpeng920 could be changed to use
this section.
We put the code in a new source file, as it supports multiple hardware
platforms. Some hip08 code can be shared, so it is moved to this new file.
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 2 +-
non-standard-hisi_hip08.c | 79 +-----------
non-standard-hisilicon.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++
non-standard-hisilicon.h | 49 ++++++++
4 files changed, 358 insertions(+), 79 deletions(-)
create mode 100644 non-standard-hisilicon.c
create mode 100644 non-standard-hisilicon.h
diff --git a/Makefile.am b/Makefile.am
index 23b4d60..18d1a92 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
if WITH_HISI_NS_DECODE
- rasdaemon_SOURCES += non-standard-hisi_hip08.c
+ rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
endif
if WITH_MEMORY_CE_PFA
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
index 7fc6939..2197f81 100644
--- a/non-standard-hisi_hip08.c
+++ b/non-standard-hisi_hip08.c
@@ -15,6 +15,7 @@
#include "ras-logger.h"
#include "ras-report.h"
#include "ras-non-standard-handler.h"
+#include "non-standard-hisilicon.h"
/* HISI OEM error definitions */
/* HISI OEM format1 error definitions */
@@ -83,11 +84,6 @@
#define HISI_PCIE_LOCAL_ERR_MISC_MAX 33
#define HISI_BUF_LEN 1024
-#define HISI_ERR_SEVERITY_NFE 0
-#define HISI_ERR_SEVERITY_FE 1
-#define HISI_ERR_SEVERITY_CE 2
-#define HISI_ERR_SEVERITY_NONE 3
-
struct hisi_oem_type1_err_sec {
uint32_t val_bits;
uint8_t version;
@@ -145,12 +141,6 @@ struct hisi_pcie_local_err_sec {
uint32_t err_misc[HISI_PCIE_LOCAL_ERR_MISC_MAX];
};
-enum hisi_oem_data_type {
- HISI_OEM_DATA_TYPE_INT,
- HISI_OEM_DATA_TYPE_INT64,
- HISI_OEM_DATA_TYPE_TEXT,
-};
-
enum {
HIP08_OEM_TYPE1_FIELD_ID,
HIP08_OEM_TYPE1_FIELD_TIMESTAMP,
@@ -199,20 +189,6 @@ struct hisi_module_info {
int sub_num;
};
-/* helper functions */
-static char *err_severity(uint8_t err_sev)
-{
- switch (err_sev) {
- case HISI_ERR_SEVERITY_NFE: return "recoverable";
- case HISI_ERR_SEVERITY_FE: return "fatal";
- case HISI_ERR_SEVERITY_CE: return "corrected";
- case HISI_ERR_SEVERITY_NONE: return "none";
- default:
- break;
- }
- return "unknown";
-}
-
static const char *pll_submodule_name[] = {
"TB_PLL0",
"TB_PLL1",
@@ -549,59 +525,6 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = {
.fields = hip08_pcie_local_event_fields,
.num_fields = ARRAY_SIZE(hip08_pcie_local_event_fields),
};
-
-static void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
- enum hisi_oem_data_type data_type,
- int id, int64_t data, const char *text)
-{
- switch (data_type) {
- case HISI_OEM_DATA_TYPE_INT:
- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data);
- break;
- case HISI_OEM_DATA_TYPE_INT64:
- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data);
- break;
- case HISI_OEM_DATA_TYPE_TEXT:
- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL);
- break;
- default:
- break;
- }
-}
-
-static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab,
- const char *name)
-{
- int rc;
-
- rc = sqlite3_step(dec_tab->stmt_dec_record);
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
- log(TERM, LOG_ERR,
- "Failed to do %s step on sqlite: error = %d\n", name, rc);
-
- rc = sqlite3_reset(dec_tab->stmt_dec_record);
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
- log(TERM, LOG_ERR,
- "Failed to reset %s on sqlite: error = %d\n", name, rc);
-
- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record);
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
- log(TERM, LOG_ERR,
- "Failed to clear bindings %s on sqlite: error = %d\n",
- name, rc);
-
- return rc;
-}
-#else
-static void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
- enum hisi_oem_data_type data_type,
- int id, int64_t data, const char *text)
-{ }
-
-static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, char *name)
-{
- return 0;
-}
#endif
#define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end))
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
new file mode 100644
index 0000000..c9e1fa9
--- /dev/null
+++ b/non-standard-hisilicon.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2020 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+#include "non-standard-hisilicon.h"
+
+#define HISI_BUF_LEN 2048
+
+struct hisi_common_error_section {
+ uint32_t val_bits;
+ uint8_t version;
+ uint8_t soc_id;
+ uint8_t socket_id;
+ uint8_t totem_id;
+ uint8_t nimbus_id;
+ uint8_t subsystem_id;
+ uint8_t module_id;
+ uint8_t submodule_id;
+ uint8_t core_id;
+ uint8_t port_id;
+ uint16_t err_type;
+ struct {
+ uint8_t function;
+ uint8_t device;
+ uint16_t segment;
+ uint8_t bus;
+ uint8_t reserved[3];
+ } pcie_info;
+ uint8_t err_severity;
+ uint8_t reserved[3];
+ uint32_t reg_array_size;
+ uint32_t reg_array[];
+};
+
+enum {
+ HISI_COMMON_VALID_SOC_ID,
+ HISI_COMMON_VALID_SOCKET_ID,
+ HISI_COMMON_VALID_TOTEM_ID,
+ HISI_COMMON_VALID_NIMBUS_ID,
+ HISI_COMMON_VALID_SUBSYSTEM_ID,
+ HISI_COMMON_VALID_MODULE_ID,
+ HISI_COMMON_VALID_SUBMODULE_ID,
+ HISI_COMMON_VALID_CORE_ID,
+ HISI_COMMON_VALID_PORT_ID,
+ HISI_COMMON_VALID_ERR_TYPE,
+ HISI_COMMON_VALID_PCIE_INFO,
+ HISI_COMMON_VALID_ERR_SEVERITY,
+ HISI_COMMON_VALID_REG_ARRAY_SIZE,
+};
+
+enum {
+ HISI_COMMON_FIELD_ID,
+ HISI_COMMON_FIELD_TIMESTAMP,
+ HISI_COMMON_FIELD_ERR_INFO,
+ HISI_COMMON_FIELD_REGS_DUMP,
+};
+
+struct hisi_event {
+ char error_msg[HISI_BUF_LEN];
+ char reg_msg[HISI_BUF_LEN];
+};
+
+#ifdef HAVE_SQLITE3
+void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
+ enum hisi_oem_data_type data_type,
+ int id, int64_t data, const char *text)
+{
+ switch (data_type) {
+ case HISI_OEM_DATA_TYPE_INT:
+ sqlite3_bind_int(dec_tab->stmt_dec_record, id, data);
+ break;
+ case HISI_OEM_DATA_TYPE_INT64:
+ sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data);
+ break;
+ case HISI_OEM_DATA_TYPE_TEXT:
+ sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL);
+ break;
+ }
+}
+
+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
+{
+ int rc;
+
+ rc = sqlite3_step(dec_tab->stmt_dec_record);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed to do %s step on sqlite: error = %d\n", name, rc);
+
+ rc = sqlite3_reset(dec_tab->stmt_dec_record);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed to reset %s on sqlite: error = %d\n", name, rc);
+
+ rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed to clear bindings %s on sqlite: error = %d\n",
+ name, rc);
+
+ return rc;
+}
+#else
+void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
+ enum hisi_oem_data_type data_type,
+ int id, int64_t data, const char *text)
+{ }
+
+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
+{
+ return 0;
+}
+#endif
+
+#ifdef HAVE_SQLITE3
+static const struct db_fields hisi_common_section_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "err_info", .type = "TEXT" },
+ { .name = "regs_dump", .type = "TEXT" },
+};
+
+static const struct db_table_descriptor hisi_common_section_tab = {
+ .name = "hisi_common_section",
+ .fields = hisi_common_section_fields,
+ .num_fields = ARRAY_SIZE(hisi_common_section_fields),
+};
+#endif
+
+static const char* soc_desc[] = {
+ "Kunpeng916",
+ "Kunpeng920",
+ "Kunpeng930",
+};
+
+static const char* module_name[] = {
+ "MN",
+ "PLL",
+ "SLLC",
+ "AA",
+ "SIOE",
+ "POE",
+ "CPA",
+ "DISP",
+ "GIC",
+ "ITS",
+ "AVSBUS",
+ "CS",
+ "PPU",
+ "SMMU",
+ "PA",
+ "HLLC",
+ "DDRC",
+ "L3TAG",
+ "L3DATA",
+ "PCS",
+ "MATA",
+ "PCIe Local",
+ "SAS",
+ "SATA",
+ "NIC",
+ "RoCE",
+ "USB",
+ "ZIP",
+ "HPRE",
+ "SEC",
+ "RDE",
+ "MEE",
+ "HHA",
+};
+
+static const char* get_soc_desc(uint8_t soc_id)
+{
+ if (soc_id >= sizeof(soc_desc)/sizeof(char *))
+ return "unknown";
+
+ return soc_desc[soc_id];
+}
+
+static void decode_module(struct hisi_event *event, uint8_t module_id)
+{
+ if (module_id >= sizeof(module_name)/sizeof(char *))
+ HISI_SNPRINTF(event->error_msg, "module=unknown(id=%d) ", module_id);
+ else
+ HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
+}
+
+static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab,
+ const struct hisi_common_error_section *err,
+ struct hisi_event *event)
+{
+ HISI_SNPRINTF(event->error_msg, "[ table_version=%d", err->version);
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID))
+ HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id));
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID))
+ HISI_SNPRINTF(event->error_msg, "socket_id=%d", err->socket_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID))
+ HISI_SNPRINTF(event->error_msg, "totem_id=%d", err->totem_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID))
+ HISI_SNPRINTF(event->error_msg, "nimbus_id=%d", err->nimbus_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID))
+ HISI_SNPRINTF(event->error_msg, "subsystem_id=%d", err->subsystem_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID))
+ decode_module(event, err->module_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID))
+ HISI_SNPRINTF(event->error_msg, "submodule_id=%d", err->submodule_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID))
+ HISI_SNPRINTF(event->error_msg, "core_id=%d", err->core_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID))
+ HISI_SNPRINTF(event->error_msg, "port_id=%d", err->port_id);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE))
+ HISI_SNPRINTF(event->error_msg, "err_type=%d", err->err_type);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO))
+ HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x",
+ err->pcie_info.segment, err->pcie_info.bus,
+ err->pcie_info.device, err->pcie_info.function);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY))
+ HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity));
+
+ HISI_SNPRINTF(event->error_msg, "]");
+}
+
+static int decode_hisi_common_section(struct ras_events *ras,
+ struct ras_ns_dec_tab *dec_tab,
+ struct trace_seq *s,
+ struct ras_non_standard_event *event)
+{
+ const struct hisi_common_error_section *err =
+ (struct hisi_common_error_section *)event->error;
+ struct hisi_event hevent;
+
+#ifdef HAVE_SQLITE3
+ if (ras->record_events && !dec_tab->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
+ &hisi_common_section_tab) != SQLITE_OK) {
+ trace_seq_printf(s, "create sql hisi_common_section_tab fail\n");
+ return -1;
+ }
+ }
+#endif
+
+ memset(&hevent, 0, sizeof(struct hisi_event));
+ trace_seq_printf(s, "\nHisilicon Common Error Section:\n");
+ decode_hisi_common_section_hdr(dec_tab, err, &hevent);
+ trace_seq_printf(s, "%s\n", hevent.error_msg);
+
+ if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) {
+ int i;
+
+ trace_seq_printf(s, "Register Dump:\n");
+ for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) {
+ trace_seq_printf(s, "reg%02d=0x%08x\n", i,
+ err->reg_array[i]);
+ HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x",
+ i, err->reg_array[i]);
+ }
+ }
+
+ if (ras->record_events) {
+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_TIMESTAMP,
+ 0, event->timestamp);
+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
+ HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
+ step_vendor_data_tab(dec_tab, "hisi_common_section_tab");
+ }
+
+ return 0;
+}
+
+struct ras_ns_dec_tab hisi_section_ns_tab[] = {
+ {
+ .sec_type = "c8b328a899174af69a132e08ab2e7586",
+ .decode = decode_hisi_common_section,
+ },
+ { /* sentinel */ }
+};
+
+static void __attribute__((constructor)) hisi_ns_init(void)
+{
+ register_ns_dec_tab(hisi_section_ns_tab);
+}
diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h
new file mode 100644
index 0000000..1ce210a
--- /dev/null
+++ b/non-standard-hisilicon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef __NON_STANDARD_HISILICON_H
+#define __NON_STANDARD_HISILICON_H
+
+#include "ras-non-standard-handler.h"
+#include "ras-mc-handler.h"
+
+#define HISI_SNPRINTF mce_snprintf
+
+#define HISI_ERR_SEVERITY_NFE 0
+#define HISI_ERR_SEVERITY_FE 1
+#define HISI_ERR_SEVERITY_CE 2
+#define HISI_ERR_SEVERITY_NONE 3
+
+enum hisi_oem_data_type {
+ HISI_OEM_DATA_TYPE_INT,
+ HISI_OEM_DATA_TYPE_INT64,
+ HISI_OEM_DATA_TYPE_TEXT,
+};
+
+/* helper functions */
+static inline char *err_severity(uint8_t err_sev)
+{
+ switch (err_sev) {
+ case HISI_ERR_SEVERITY_NFE: return "recoverable";
+ case HISI_ERR_SEVERITY_FE: return "fatal";
+ case HISI_ERR_SEVERITY_CE: return "corrected";
+ case HISI_ERR_SEVERITY_NONE: return "none";
+ default:
+ break;
+ }
+ return "unknown";
+}
+
+void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
+ enum hisi_oem_data_type data_type,
+ int id, int64_t data, const char *text);
+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name);
+
+#endif
--
2.7.4

View File

@ -0,0 +1,37 @@
From 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4 Mon Sep 17 00:00:00 2001
From: Matt Whitlock <whitslack@users.noreply.github.com>
Date: Wed, 9 Jun 2021 10:25:18 -0400
Subject: [PATCH] configure.ac: fix SYSCONFDEFDIR default value
configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like:
# Check whether --with-sysconfdefdir was given.
if test "${with_sysconfdefdir+set}" = set; then :
withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval
else
"/etc/sysconfig"
fi
This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
configure.ac | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/configure.ac b/configure.ac
index f7d1947..33b81fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR])
AC_ARG_WITH(sysconfdefdir,
AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]),
[SYSCONFDEFDIR=$withval],
- ["/etc/sysconfig"])
+ [SYSCONFDEFDIR=/etc/sysconfig])
AC_SUBST([SYSCONFDEFDIR])
AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database])
--
2.27.0

View File

@ -1,29 +0,0 @@
From 00115dda854f4a50681ccc6c017daa991234411b Mon Sep 17 00:00:00 2001
From: Liguang Zhang <zhangliguang@linux.alibaba.com>
Date: Mon, 10 Aug 2020 11:07:43 +0800
Subject: [PATCH] rasdaemon: Fix error print
Fix the error print in handle_ras_events.
Signed-off-by: Liguang Zhang <zhangliguang@linux.alibaba.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-events.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ras-events.c b/ras-events.c
index a99fd29..c797b20 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -874,7 +874,7 @@ int handle_ras_events(int record_events)
num_events++;
} else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "aer_event");
+ "ras", "extlog_mem_event");
#endif
#ifdef HAVE_DEVLINK
--
2.18.4

View File

@ -1,634 +0,0 @@
From efb2a994b1e24c1e6645ec0dee27d8b3a7deae92 Mon Sep 17 00:00:00 2001
From: Xiaofei Tan <tanxiaofei@huawei.com>
Date: Tue, 30 Nov 2021 19:50:06 +0800
Subject: [PATCH] rasdaemon: add support for memory_failure events
Add support to log the memory_failure kernel trace
events.
Example rasdaemon log and SQLite DB output for the
memory_failure event,
=================================================
rasdaemon: memory_failure_event store: 0x126ce8f8
rasdaemon: register inserted at db
<...>-785 [000] 0.000024: memory_failure_event: 2020-10-02
13:27:13 -0400 pfn=0x204000000 page_type=free buddy page
action_result=Delayed
CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp
TEXT, pfn TEXT, page_type TEXT, action_result TEXT);
INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13
-0400','0x204000000','free buddy page','Delayed');
==================================================
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
.travis.yml | 2 +-
Makefile.am | 5 +-
configure.ac | 11 +++
ras-events.c | 15 +++
ras-events.h | 1 +
ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++
ras-memory-failure-handler.h | 25 +++++
ras-record.c | 70 ++++++++++++++
ras-record.h | 13 +++
ras-report.c | 68 +++++++++++++
ras-report.h | 2 +
11 files changed, 389 insertions(+), 2 deletions(-)
create mode 100644 ras-memory-failure-handler.c
create mode 100644 ras-memory-failure-handler.h
diff --git a/.travis.yml b/.travis.yml
index 5ab3957..b865e1d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,7 +20,7 @@ before_install:
- sudo apt-get install -y sqlite3
install:
- autoreconf -vfi
-- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa --enable-cpu-fault-isolation
+- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa --enable-cpu-fault-isolation --enable-memory-failure
script:
- make && sudo make install
diff --git a/Makefile.am b/Makefile.am
index 61dc2cc..a032352 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -48,6 +48,9 @@ endif
if WITH_DISKERROR
rasdaemon_SOURCES += ras-diskerror-handler.c
endif
+# The memory_failure handler is only compiled in when configure sets WITH_MEMORY_FAILURE.
+if WITH_MEMORY_FAILURE
+ rasdaemon_SOURCES += ras-memory-failure-handler.c
+endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
@@ -66,7 +69,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
- ras-cpu-isolation.h queue.h
+ ras-cpu-isolation.h queue.h non-standard-hisilicon.h ras-memory-failure-handler.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index a682bb9..fd67be8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -111,6 +111,16 @@ AS_IF([test "x$enable_diskerror" = "xyes" || test "x$enable_all" == "xyes"], [
AM_CONDITIONAL([WITH_DISKERROR], [test x$enable_diskerror = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_DISKERROR], [USE_DISKERROR="yes"], [USE_DISKERROR="no"])
+dnl Optional support for the ras:memory_failure_event trace event.
+dnl It is enabled when either enable_memory_failure or enable_all is "yes".
+dnl The comparisons use "=", because "==" is not a POSIX test(1) operator
+dnl and fails when /bin/sh is a strict shell such as dash.
+AC_ARG_ENABLE([memory_failure],
+    AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)]))
+
+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"], [
+  AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect")
+  AC_SUBST([WITH_MEMORY_FAILURE])
+])
+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes])
+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
@@ -182,6 +192,7 @@ compile time options summary
ARM events : $USE_ARM
DEVLINK : $USE_DEVLINK
Disk I/O errors : $USE_DISKERROR
+ Memory Failure : $USE_MEMORY_FAILURE
Memory CE PFA : $USE_MEMORY_CE_PFA
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
EOF
diff --git a/ras-events.c b/ras-events.c
index 31c4170..92ae2c8 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -37,6 +37,7 @@
#include "ras-extlog-handler.h"
#include "ras-devlink-handler.h"
#include "ras-diskerror-handler.h"
+#include "ras-memory-failure-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
@@ -256,6 +257,10 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
+#endif
+
free_ras:
free(ras);
return rc;
@@ -938,6 +943,16 @@ int handle_ras_events(int record_events)
}
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
+ ras_memory_failure_event_handler, NULL, MF_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "memory_failure_event");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");
diff --git a/ras-events.h b/ras-events.h
index f028741..dfd690c 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -38,6 +38,7 @@ enum {
EXTLOG_EVENT,
DEVLINK_EVENT,
DISKERROR_EVENT,
+ MF_EVENT,
NR_EVENTS
};
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
new file mode 100644
index 0000000..9941e68
--- /dev/null
+++ b/ras-memory-failure-handler.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libtrace/kbuffer.h"
+#include "ras-memory-failure-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+/*
+ * Memory failure - various types of pages.
+ *
+ * The handler in this file passes the "type" field of a
+ * memory_failure_event record to get_page_type(), which looks the value up
+ * among these constants via mf_page_type[].
+ * NOTE(review): presumably the order has to match the kernel-side enum that
+ * produces the "type" field - confirm before adding or reordering entries.
+ */
+enum mf_action_page_type {
+ MF_MSG_KERNEL,
+ MF_MSG_KERNEL_HIGH_ORDER,
+ MF_MSG_SLAB,
+ MF_MSG_DIFFERENT_COMPOUND,
+ MF_MSG_POISONED_HUGE,
+ MF_MSG_HUGE,
+ MF_MSG_FREE_HUGE,
+ MF_MSG_NON_PMD_HUGE,
+ MF_MSG_UNMAP_FAILED,
+ MF_MSG_DIRTY_SWAPCACHE,
+ MF_MSG_CLEAN_SWAPCACHE,
+ MF_MSG_DIRTY_MLOCKED_LRU,
+ MF_MSG_CLEAN_MLOCKED_LRU,
+ MF_MSG_DIRTY_UNEVICTABLE_LRU,
+ MF_MSG_CLEAN_UNEVICTABLE_LRU,
+ MF_MSG_DIRTY_LRU,
+ MF_MSG_CLEAN_LRU,
+ MF_MSG_TRUNCATED_LRU,
+ MF_MSG_BUDDY,
+ MF_MSG_BUDDY_2ND,
+ MF_MSG_DAX,
+ MF_MSG_UNSPLIT_THP,
+ MF_MSG_UNKNOWN,
+};
+
+/*
+ * Action results for various types of pages.
+ *
+ * get_action_result() looks up the "result" field of a
+ * memory_failure_event record among these constants.
+ */
+enum mf_action_result {
+ MF_IGNORED, /* Error: cannot be handled */
+ MF_FAILED, /* Error: handling failed */
+ MF_DELAYED, /* Will be handled later */
+ MF_RECOVERED, /* Successfully recovered */
+};
+
+/*
+ * memory failure page types: pairs each mf_action_page_type constant with
+ * the text that get_page_type() returns for it.
+ */
+static const struct {
+ int type;
+ const char *page_type;
+} mf_page_type[] = {
+ { MF_MSG_KERNEL, "reserved kernel page" },
+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"},
+ { MF_MSG_SLAB, "kernel slab page"},
+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"},
+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"},
+ { MF_MSG_HUGE, "huge page"},
+ { MF_MSG_FREE_HUGE, "free huge page"},
+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"},
+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"},
+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"},
+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"},
+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"},
+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"},
+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"},
+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page"},
+ { MF_MSG_DIRTY_LRU, "dirty LRU page"},
+ { MF_MSG_CLEAN_LRU, "clean LRU page"},
+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"},
+ { MF_MSG_BUDDY, "free buddy page"},
+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"},
+ { MF_MSG_DAX, "dax page"},
+ { MF_MSG_UNSPLIT_THP, "unsplit thp"},
+ { MF_MSG_UNKNOWN, "unknown page"},
+};
+
+/*
+ * memory failure action results: pairs each mf_action_result constant with
+ * the text that get_action_result() returns for it.
+ */
+static const struct {
+ int result;
+ const char *action_result;
+} mf_action_result[] = {
+ { MF_IGNORED, "Ignored" },
+ { MF_FAILED, "Failed" },
+ { MF_DELAYED, "Delayed" },
+ { MF_RECOVERED, "Recovered" },
+};
+
+/*
+ * get_page_type - translate a memory_failure "type" value into text.
+ * @page_type: numeric page type read from the trace record
+ *
+ * Returns the string paired with @page_type in mf_page_type[], or
+ * "unknown page" when the value is not listed there.
+ */
+static const char *get_page_type(int page_type)
+{
+	/* Unsigned index: avoids a signed/unsigned comparison if ARRAY_SIZE() is sizeof-based. */
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(mf_page_type); i++) {
+		if (mf_page_type[i].type == page_type)
+			return mf_page_type[i].page_type;
+	}
+
+	return "unknown page";
+}
+
+/*
+ * get_action_result - translate a memory_failure "result" value into text.
+ * @result: numeric action result read from the trace record
+ *
+ * Returns the string paired with @result in mf_action_result[], or
+ * "unknown" when the value is not listed there.
+ */
+static const char *get_action_result(int result)
+{
+	/* Unsigned index: avoids a signed/unsigned comparison if ARRAY_SIZE() is sizeof-based. */
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(mf_action_result); i++) {
+		if (mf_action_result[i].result == result)
+			return mf_action_result[i].action_result;
+	}
+
+	return "unknown";
+}
+
+
+/*
+ * ras_memory_failure_event_handler - decode one memory_failure_event record.
+ * @s:       trace sequence receiving the human-readable line
+ * @record:  raw record; record->ts supplies the time when ras->use_uptime
+ *           is set
+ * @event:   event format used to read the "pfn", "type" and "result" fields
+ * @context: used as the struct ras_events pointer
+ *
+ * Appends "<timestamp> pfn=0x... page_type=... action_result=... " to @s,
+ * then stores the event in SQLite (HAVE_SQLITE3) and reports it to ABRT
+ * (HAVE_ABRT_REPORT).
+ *
+ * Returns 0 on success, -1 when a field cannot be read from the record.
+ */
+int ras_memory_failure_event_handler(struct trace_seq *s,
+				     struct pevent_record *record,
+				     struct event_format *event, void *context)
+{
+	unsigned long long val;
+	struct ras_events *ras = context;
+	time_t now;
+	struct tm *tm;
+	struct ras_mf_event ev;
+
+	/*
+	 * Clear the whole event first: if localtime() fails below,
+	 * ev.timestamp must still be a valid (empty) string, because it is
+	 * printed right away and later handed to the store/report helpers.
+	 */
+	memset(&ev, 0, sizeof(ev));
+
+	/*
+	 * Newer kernels (3.10-rc1 or later) provide an uptime clock.
+	 * On previous kernels, the way to properly generate an event would
+	 * be to inject a fake one, measure its timestamp and diff it against
+	 * gettimeofday. We won't do it here. Instead, let's use uptime,
+	 * falling back to the current time if the "uptime" clock is
+	 * not available (legacy kernels).
+	 */
+	if (ras->use_uptime)
+		now = record->ts/user_hz + ras->uptime_diff;
+	else
+		now = time(NULL);
+
+	tm = localtime(&now);
+	if (tm)
+		strftime(ev.timestamp, sizeof(ev.timestamp),
+			 "%Y-%m-%d %H:%M:%S %z", tm);
+	trace_seq_printf(s, "%s ", ev.timestamp);
+
+	if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)
+		return -1;
+	/* Bounded write: ev.pfn is a fixed-size array. */
+	snprintf(ev.pfn, sizeof(ev.pfn), "0x%llx", val);
+	trace_seq_printf(s, "pfn=0x%llx ", val);
+
+	if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0)
+		return -1;
+	ev.page_type = get_page_type((int)val);
+	trace_seq_printf(s, "page_type=%s ", ev.page_type);
+
+	if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0)
+		return -1;
+	ev.action_result = get_action_result((int)val);
+	trace_seq_printf(s, "action_result=%s ", ev.action_result);
+
+	/* Store data into the SQLite DB */
+#ifdef HAVE_SQLITE3
+	ras_store_mf_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_mf_event(ras, &ev);
+#endif
+
+	return 0;
+}
diff --git a/ras-memory-failure-handler.h b/ras-memory-failure-handler.h
new file mode 100644
index 0000000..b9e9971
--- /dev/null
+++ b/ras-memory-failure-handler.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H
+#define __RAS_MEMORY_FAILURE_HANDLER_H
+
+#include "ras-events.h"
+#include "libtrace/event-parse.h"
+
+/*
+ * Handler for the ras:memory_failure_event trace event (implemented in
+ * ras-memory-failure-handler.c). Returns 0 on success and -1 when a field
+ * of the record cannot be read.
+ */
+int ras_memory_failure_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context);
+
+#endif
diff --git a/ras-record.c b/ras-record.c
index 33d4741..27863c7 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -506,6 +506,56 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
}
#endif
+/*
+ * Table and functions to handle ras:memory_failure
+ */
+
+#ifdef HAVE_MEMORY_FAILURE
+/*
+ * Columns of the memory_failure_event table. ras_store_mf_event() binds
+ * statement parameters 1-4 to timestamp, pfn, page_type and action_result,
+ * in that order.
+ */
+static const struct db_fields mf_event_fields[] = {
+ { .name="id", .type="INTEGER PRIMARY KEY" },
+ { .name="timestamp", .type="TEXT" },
+ { .name="pfn", .type="TEXT" },
+ { .name="page_type", .type="TEXT" },
+ { .name="action_result", .type="TEXT" },
+};
+
+/*
+ * Descriptor of the memory_failure_event table; ras_mc_event_opendb() passes
+ * it to ras_mc_create_table() and ras_mc_prepare_stmt().
+ */
+static const struct db_table_descriptor mf_event_tab = {
+ .name = "memory_failure_event",
+ .fields = mf_event_fields,
+ .num_fields = ARRAY_SIZE(mf_event_fields),
+};
+
+/*
+ * ras_store_mf_event - insert one memory_failure event into the database.
+ * @ras: daemon state; ras->db_priv holds the prepared statement
+ * @ev:  decoded event whose four text fields are bound to the statement
+ *
+ * Returns 0 when there is no database handle or no prepared statement,
+ * otherwise the result of sqlite3_reset() on the statement.
+ */
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	struct sqlite3_priv *db = ras->db_priv;
+	int ret;
+
+	/* Nothing to do when the database or the statement is unavailable. */
+	if (!db)
+		return 0;
+	if (!db->stmt_mf_event)
+		return 0;
+
+	log(TERM, LOG_INFO, "memory_failure_event store: %p\n", db->stmt_mf_event);
+
+	/* Parameter order follows the columns after "id". */
+	sqlite3_bind_text(db->stmt_mf_event, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(db->stmt_mf_event, 2, ev->pfn, -1, NULL);
+	sqlite3_bind_text(db->stmt_mf_event, 3, ev->page_type, -1, NULL);
+	sqlite3_bind_text(db->stmt_mf_event, 4, ev->action_result, -1, NULL);
+
+	ret = sqlite3_step(db->stmt_mf_event);
+	if (!(ret == SQLITE_OK || ret == SQLITE_DONE)) {
+		log(TERM, LOG_ERR,
+		    "Failed to do memory_failure_event step on sqlite: error = %d\n", ret);
+	}
+
+	/* Make the statement reusable for the next event. */
+	ret = sqlite3_reset(db->stmt_mf_event);
+	if (!(ret == SQLITE_OK || ret == SQLITE_DONE)) {
+		log(TERM, LOG_ERR,
+		    "Failed reset memory_failure_event on sqlite: error = %d\n",
+		    ret);
+	}
+
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return ret;
+}
+#endif
+
/*
* Generic code
*/
@@ -818,6 +868,16 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
}
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = ras_mc_create_table(priv, &mf_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event,
+ &mf_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
+#endif
+
ras->db_priv = priv;
return 0;
@@ -920,6 +980,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
}
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ if (priv->stmt_mf_event) {
+ rc = sqlite3_finalize(priv->stmt_mf_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize mf_event sqlite: error = %d\n",
+ cpu, rc);
+ }
+#endif
+
rc = sqlite3_close_v2(db);
if (rc != SQLITE_OK)
log(TERM, LOG_ERR,
diff --git a/ras-record.h b/ras-record.h
index b453f83..830202f 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -103,6 +103,13 @@ struct diskerror_event {
const char *cmd;
};
+/*
+ * One decoded memory_failure event, as filled in by
+ * ras_memory_failure_event_handler():
+ *   timestamp     - strftime() output in "%Y-%m-%d %H:%M:%S %z" form
+ *   pfn           - the "pfn" field formatted as "0x%llx"
+ *   page_type     - string returned by get_page_type()
+ *   action_result - string returned by get_action_result()
+ */
+struct ras_mf_event {
+ char timestamp[64];
+ char pfn[30];
+ const char *page_type;
+ const char *action_result;
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -111,6 +118,7 @@ struct ras_arm_event;
struct mce_event;
struct devlink_event;
struct diskerror_event;
+struct ras_mf_event;
#ifdef HAVE_SQLITE3
@@ -140,6 +148,9 @@ struct sqlite3_priv {
#ifdef HAVE_DISKERROR
sqlite3_stmt *stmt_diskerror_event;
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ sqlite3_stmt *stmt_mf_event;
+#endif
};
struct db_fields {
@@ -166,6 +177,7 @@ int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standar
int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -178,6 +190,7 @@ static inline int ras_store_non_standard_record(struct ras_events *ras, struct r
static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 2710eac..ea3a9b6 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -309,6 +309,28 @@ static int set_diskerror_event_backtrace(char *buf, struct diskerror_event *ev)
return 0;
}
+/*
+ * set_mf_event_backtrace - append the BACKTRACE item of a memory_failure event.
+ * @buf: NUL-terminated string the item is appended to
+ * @ev:  event to describe
+ *
+ * Returns 0 on success, -1 when an argument is NULL or the formatted item
+ * does not fit into the MAX_BACKTRACE_SIZE scratch buffer.
+ */
+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+	int len;
+
+	if (!buf || !ev)
+		return -1;
+
+	/* Bounded formatting: never write past the end of bt_buf. */
+	len = snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		       "timestamp=%s\n"
+		       "pfn=%s\n"
+		       "page_type=%s\n"
+		       "action_result=%s\n",
+		       ev->timestamp,
+		       ev->pfn,
+		       ev->page_type,
+		       ev->action_result);
+	if (len < 0 || (size_t)len >= sizeof(bt_buf))
+		return -1;
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -343,6 +365,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case DISKERROR_EVENT:
rc = set_diskerror_event_backtrace(buf, (struct diskerror_event *)ev);
break;
+ case MF_EVENT:
+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev);
+ break;
default:
return -1;
}
@@ -708,3 +733,46 @@ diskerror_fail:
return -1;
}
}
+
+/*
+ * ras_report_mf_event - send a memory_failure event over the report socket.
+ * @ras: daemon state (not used by this function)
+ * @ev:  decoded event; forwarded to commit_report_backtrace()
+ *
+ * Writes the basic report, the backtrace item, and the ANALYZER and REASON
+ * items. Each of the last two is written together with its terminating NUL.
+ *
+ * Returns 0 when everything was written, -1 on any failure.
+ */
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd = 0;
+	int done = 0;
+	int rc = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto mf_fail;
+
+	rc = commit_report_backtrace(sockfd, MF_EVENT, ev);
+	if (rc < 0)
+		goto mf_fail;
+
+	/*
+	 * Test rc < 0 before comparing with strlen(): comparing the int
+	 * directly against a size_t would convert -1 to a huge unsigned
+	 * value and hide a failed write().
+	 */
+	sprintf(buf, "ANALYZER=%s", "rasdaemon-memory_failure");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto mf_fail;
+
+	sprintf(buf, "REASON=%s", "memory failure problem");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto mf_fail;
+
+	done = 1;
+
+mf_fail:
+	/* Descriptor 0 is valid too; only negative values mean "no socket". */
+	if (sockfd >= 0)
+		close(sockfd);
+
+	return done ? 0 : -1;
+}
diff --git a/ras-report.h b/ras-report.h
index 1d911de..e605eb1 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -38,6 +38,7 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar
int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
@@ -48,6 +49,7 @@ static inline int ras_report_non_standard_event(struct ras_events *ras, struct r
static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--
2.33.0

View File

@ -15,7 +15,7 @@ index e73a08a..04a0489 100644
@@ -4,7 +4,7 @@ After=syslog.target
[Service]
EnvironmentFile=/etc/sysconfig/rasdaemon
EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon
-ExecStart=@sbindir@/rasdaemon -f -r
+ExecStart=@sbindir@/rasdaemon -f
ExecStartPost=@sbindir@/rasdaemon --enable

View File

@ -1,18 +0,0 @@
From d59e4d224b3271cf7a7fe53cd7c5d539b58eac32 Mon Sep 17 00:00:00 2001
From: lvying <lvying6@huawei.com>
Date: Sat, 26 Jan 2019 15:54:17 +0800
Subject: [PATCH] rasdaemon:fix ras events memory leak
reason:fix ras events memory leak
diff -uprN a/ras-events.c b/ras-events.c
--- a/ras-events.c 2018-06-22 14:20:42.880878700 +0800
+++ b/ras-events.c 2018-06-22 14:38:24.420726900 +0800
@@ -314,6 +314,7 @@ static void parse_ras_data(struct pthrea
trace_seq_init(&s);
pevent_print_event(pdata->ras->pevent, &s, &record);
trace_seq_do_printf(&s);
+ trace_seq_destroy(&s);
printf("\n");
fflush(stdout);
}

Binary file not shown.

BIN
rasdaemon-0.6.7.tar.gz Normal file

Binary file not shown.

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.6
Release: 10
Version: 0.6.7
Release: 1
License: GPLv2
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
URL: https://github.com/mchehab/rasdaemon.git
@ -19,25 +19,17 @@ Requires(post): systemd
Requires(preun): systemd
Requires(postun): systemd
Patch1: bugfix-ras-events-memory-leak.patch
Patch2: bugfix-rasdaemon-wait-for-file-access.patch
Patch3: bugfix-fix-fd-check.patch
Patch4: backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch
Patch5: backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch
Patch6: backport-rasdaemon-Fix-error-print.patch
Patch7: bugfix-fix-disk-error-log-storm.patch
Patch8: backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch
Patch9: backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch
Patch10: backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch
Patch11: backport-0001-rasdaemon-Modify-non-standard-error-decoding-interfa.patch
Patch12: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
Patch13: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
Patch14: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
Patch15: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
Patch16: 0006-add-cpu-online-fault-isolation.patch
Patch17: 0007-add-trace-print-and-add-sqlite-store.patch
Patch18: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
Patch19: backport-rasdaemon-add-support-for-memory_failure-events.patch
Patch1: bugfix-rasdaemon-wait-for-file-access.patch
Patch2: bugfix-fix-fd-check.patch
Patch3: bugfix-fix-disk-error-log-storm.patch
Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch
Patch5: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
Patch6: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
Patch7: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
Patch8: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
Patch9: 0006-add-cpu-online-fault-isolation.patch
Patch10: 0007-add-trace-print-and-add-sqlite-store.patch
Patch11: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
%description
The rasdaemon program is a daemon which monitors the platform
@ -76,7 +68,6 @@ rm INSTALL %{buildroot}/usr/include/*.h
%{_sbindir}/ras-mc-ctl
%{_mandir}/*/*
%{_unitdir}/*.service
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%config(noreplace) %{_sysconfdir}/sysconfig/%{name}
@ -84,14 +75,17 @@ rm INSTALL %{buildroot}/usr/include/*.h
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
%changelog
* Thurs Dec 9 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-10
* Mon Jan 17 2022 xujing<xujing99@huawei.com> - 0.6.7-1
- DESC: Update software to v0.6.7
* Thu Dec 9 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-10
- Type:feature
- ID:NA
- SUG:NA
- DESC: Enable compilation of the feature memory fault prediction based on
corrected error.
* Thurs Dec 2 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-9
* Thu Dec 2 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-9
- Type:feature
- ID:NA
- SUG:NA