rasdaemon: backport bugfix patches from community
1. ras-page-isolation: do_page_offline always considers page offline was successful (e4d27840e1)
2. ras-page-isolation: page which is PAGE_OFFLINE_FAILED can be offlined again (c329012ce4)
This commit is contained in:
parent
dcbdd195f2
commit
d708bdf82b
@ -0,0 +1,104 @@
|
|||||||
|
From e4d27840e173491ab29c2d97017da9344e2c2526 Mon Sep 17 00:00:00 2001
|
||||||
|
From: lvying <lvying6@huawei.com>
|
||||||
|
Date: Sat, 31 Oct 2020 17:57:14 +0800
|
||||||
|
Subject: [PATCH 1/2] ras-page-isolation: do_page_offline always considers page
|
||||||
|
offline was successful
|
||||||
|
|
||||||
|
do_page_offline always considers page offline was successful even if
|
||||||
|
kernel soft/hard offline page failed.
|
||||||
|
|
||||||
|
Calling rasdaemon with:
|
||||||
|
|
||||||
|
/etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1"
|
||||||
|
|
||||||
|
i.e. when a Corrected Error occurs at a page's address, rasdaemon should
|
||||||
|
trigger this page soft offline.
|
||||||
|
|
||||||
|
However, after adding a livepatch into kernel's
|
||||||
|
store_soft_offline_page to observe this function's return value,
|
||||||
|
when injecting a CE into address 0x3f7ec30000, the Kernel
|
||||||
|
log reports:
|
||||||
|
|
||||||
|
soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 ()
|
||||||
|
[store_soft_offline_page]return from soft_offline_page: -5
|
||||||
|
|
||||||
|
While rasdaemon log reports:
|
||||||
|
|
||||||
|
rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold
|
||||||
|
rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined
|
||||||
|
|
||||||
|
using strace to record rasdaemon's system call, it reports:
|
||||||
|
|
||||||
|
strace -p 73711
|
||||||
|
openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page",
|
||||||
|
O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28
|
||||||
|
fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0
|
||||||
|
write(28, "0x3f7ec30000", 12) = -1 EIO (Input/output error)
|
||||||
|
close(28) = 0
|
||||||
|
|
||||||
|
So the kernel actually failed to soft offline pfn 0x3f7ec30, and
|
||||||
|
store_soft_offline_page returned -EIO. However, rasdaemon always
|
||||||
|
considers the page offline to be successful.
|
||||||
|
|
||||||
|
According to the strace output, ferror was unable to detect the
|
||||||
|
failure of the write syscall.
|
||||||
|
|
||||||
|
This patch changes fopen-fprintf-ferror-fclose process to use
|
||||||
|
the lower I/O level, by using instead open-write-close, which
|
||||||
|
can detect such syscall failure.
|
||||||
|
|
||||||
|
Signed-off-by: lvying <lvying6@huawei.com>
|
||||||
|
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||||
|
---
|
||||||
|
ras-page-isolation.c | 25 ++++++++++++++++---------
|
||||||
|
1 file changed, 16 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
|
||||||
|
index 50e4406..dc07545 100644
|
||||||
|
--- a/ras-page-isolation.c
|
||||||
|
+++ b/ras-page-isolation.c
|
||||||
|
@@ -17,6 +17,9 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
+#include <sys/stat.h>
|
||||||
|
+#include <fcntl.h>
|
||||||
|
+#include <errno.h>
|
||||||
|
#include "ras-logger.h"
|
||||||
|
#include "ras-page-isolation.h"
|
||||||
|
|
||||||
|
@@ -210,18 +213,22 @@ void ras_page_account_init(void)
|
||||||
|
|
||||||
|
static int do_page_offline(unsigned long long addr, enum otype type)
|
||||||
|
{
|
||||||
|
- FILE *offline_file;
|
||||||
|
- int err;
|
||||||
|
+ int fd, rc;
|
||||||
|
+ char buf[20];
|
||||||
|
|
||||||
|
- offline_file = fopen(kernel_offline[type], "w");
|
||||||
|
- if (!offline_file)
|
||||||
|
+ fd = open(kernel_offline[type], O_WRONLY);
|
||||||
|
+ if (fd == -1) {
|
||||||
|
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]);
|
||||||
|
return -1;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
- fprintf(offline_file, "%#llx", addr);
|
||||||
|
- err = ferror(offline_file) ? -1 : 0;
|
||||||
|
- fclose(offline_file);
|
||||||
|
-
|
||||||
|
- return err;
|
||||||
|
+ sprintf(buf, "%#llx", addr);
|
||||||
|
+ rc = write(fd, buf, strlen(buf));
|
||||||
|
+ if (rc < 0) {
|
||||||
|
+ log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno);
|
||||||
|
+ }
|
||||||
|
+ close(fd);
|
||||||
|
+ return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void page_offline(struct page_record *pr)
|
||||||
|
--
|
||||||
|
2.18.4
|
||||||
|
|
||||||
@ -0,0 +1,44 @@
|
|||||||
|
From c329012ce4b44af08217f2a8f2b3b9b1b4b1c0d3 Mon Sep 17 00:00:00 2001
|
||||||
|
From: lvying6 <lvying6@huawei.com>
|
||||||
|
Date: Sat, 31 Oct 2020 17:57:15 +0800
|
||||||
|
Subject: [PATCH 2/2] ras-page-isolation: page which is PAGE_OFFLINE_FAILED can
|
||||||
|
be offlined again
|
||||||
|
|
||||||
|
The OS may have failed to offline the page on a previous attempt. After some time,
|
||||||
|
this page's state changed, and the page can be offlined by OS.
|
||||||
|
At this time, Correctable errors on this page reached the threshold.
|
||||||
|
Rasdaemon should try to offline this page again.
|
||||||
|
|
||||||
|
Signed-off-by: lvying6 <lvying6@huawei.com>
|
||||||
|
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||||
|
---
|
||||||
|
ras-page-isolation.c | 9 +++++++--
|
||||||
|
1 file changed, 7 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
|
||||||
|
index dc07545..fd7bd70 100644
|
||||||
|
--- a/ras-page-isolation.c
|
||||||
|
+++ b/ras-page-isolation.c
|
||||||
|
@@ -237,12 +237,17 @@ static void page_offline(struct page_record *pr)
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/* Offlining page is not required */
|
||||||
|
- if (offline <= OFFLINE_ACCOUNT)
|
||||||
|
+ if (offline <= OFFLINE_ACCOUNT) {
|
||||||
|
+ log(TERM, LOG_INFO, "PAGE_CE_ACTION=%s, ignore to offline page at %#llx\n",
|
||||||
|
+ offline_choice[offline].name, addr);
|
||||||
|
return;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
/* Ignore offlined pages */
|
||||||
|
- if (pr->offlined != PAGE_ONLINE)
|
||||||
|
+ if (pr->offlined == PAGE_OFFLINE) {
|
||||||
|
+ log(TERM, LOG_INFO, "page at %#llx is already offlined, ignore\n", addr);
|
||||||
|
return;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
/* Time to silence this noisy page */
|
||||||
|
if (offline == OFFLINE_SOFT_THEN_HARD) {
|
||||||
|
--
|
||||||
|
2.18.4
|
||||||
|
|
||||||
@ -1,6 +1,6 @@
|
|||||||
Name: rasdaemon
|
Name: rasdaemon
|
||||||
Version: 0.6.6
|
Version: 0.6.6
|
||||||
Release: 2
|
Release: 3
|
||||||
License: GPLv2
|
License: GPLv2
|
||||||
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
||||||
URL: https://github.com/mchehab/rasdaemon.git
|
URL: https://github.com/mchehab/rasdaemon.git
|
||||||
@ -22,6 +22,8 @@ Requires(postun): systemd
|
|||||||
Patch1: bugfix-ras-events-memory-leak.patch
|
Patch1: bugfix-ras-events-memory-leak.patch
|
||||||
Patch2: bugfix-rasdaemon-wait-for-file-access.patch
|
Patch2: bugfix-rasdaemon-wait-for-file-access.patch
|
||||||
Patch3: bugfix-fix-fd-check.patch
|
Patch3: bugfix-fix-fd-check.patch
|
||||||
|
Patch4: backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch
|
||||||
|
Patch5: backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch
|
||||||
|
|
||||||
%description
|
%description
|
||||||
The rasdaemon program is a daemon which monitors the platform
|
The rasdaemon program is a daemon which monitors the platform
|
||||||
@ -68,6 +70,11 @@ rm INSTALL %{buildroot}/usr/include/*.h
|
|||||||
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
|
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Wed Mar 31 2021 Lv Ying <lvying6@huawei.com> - 0.6.6-3
|
||||||
|
- backport bugfix patches from community:
|
||||||
|
1. ras-page-isolation: do_page_offline always considers page offline was successful
|
||||||
|
2. ras-page-isolation: page which is PAGE_OFFLINE_FAILED can be offlined again
|
||||||
|
|
||||||
* Fri Sep 25 2020 openEuler Buildteam <buildteam@openeuler.org> - 0.6.6-2
|
* Fri Sep 25 2020 openEuler Buildteam <buildteam@openeuler.org> - 0.6.6-2
|
||||||
- Update software source URL
|
- Update software source URL
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user