From 35b8410151f253f2c924ebd15feb1b601da9167c Mon Sep 17 00:00:00 2001
From: lvying
Date: Sat, 26 Jan 2019 15:22:29 +0800
Subject: [PATCH] rasdaemon:mc support page isolation

reason: mc support page isolation
---
 Makefile.am               |   4 +-
 misc/rasdaemon.env        |  29 ++++
 misc/rasdaemon.service.in |   1 +
 ras-events.c              |   4 +
 ras-mc-handler.c          |   5 +
 ras-page-isolation.c      | 363 ++++++++++++++++++++++++++++++++++++++
 ras-page-isolation.h      |  76 ++++++++
 7 files changed, 480 insertions(+), 2 deletions(-)
 create mode 100644 misc/rasdaemon.env
 create mode 100644 ras-page-isolation.c
 create mode 100644 ras-page-isolation.h

diff --git a/Makefile.am b/Makefile.am
index 2ff742d..6fc39f2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,7 +17,7 @@ all-local: $(SYSTEMD_SERVICES)
 
 sbin_PROGRAMS = rasdaemon
 rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
-		    bitfield.c rbtree.c
+		    bitfield.c rbtree.c ras-page-isolation.c
 if WITH_SQLITE3
    rasdaemon_SOURCES += ras-record.c
 endif
@@ -59,7 +59,7 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
 include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
 		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
 		  ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
-		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h
+		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h
 
 # This rule can't be called with more than one Makefile job (like make -j8)
 # I can't figure out a way to fix that
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
new file mode 100644
index 0000000..c327353
--- /dev/null
+++ b/misc/rasdaemon.env
@@ -0,0 +1,29 @@
+# Page Isolation
+# Note: Run-time configuration is unsupported, service restart needed.
+
+# Specify the threshold for isolating buggy pages.
+#
+# Format:
+#   [0-9]+[unit]
+# WARNING: the value must match this format exactly.
+#
+# Supported units:
+# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is hour
+# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none
+#
+# These two settings are ignored only when PAGE_CE_ACTION is "off".
+PAGE_CE_REFRESH_CYCLE="24h"
+PAGE_CE_THRESHOLD="50"
+
+# Specify the action rasdaemon takes when a page exceeds the error threshold.
+#
+# off      no action
+# account  only account errors
+# soft     try to soft-offline page without killing any processes
+#          This requires an up-to-date kernel. Might not be successful.
+# hard     try to hard-offline page by killing processes
+#          Requires an up-to-date kernel. Might not be successful.
+# soft-then-hard   First try to soft offline, then try hard offlining.
+# Note: default offline choice is "soft".
+PAGE_CE_ACTION="soft"
+
diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in
index be9ad5a..e73a08a 100644
--- a/misc/rasdaemon.service.in
+++ b/misc/rasdaemon.service.in
@@ -3,6 +3,7 @@ Description=RAS daemon to log the RAS events
 After=syslog.target
 
 [Service]
+EnvironmentFile=-/etc/sysconfig/rasdaemon
 ExecStart=@sbindir@/rasdaemon -f -r
 ExecStartPost=@sbindir@/rasdaemon --enable
 ExecStop=@sbindir@/rasdaemon --disable
diff --git a/ras-events.c b/ras-events.c
index 4cc2cee..70b02e5 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -37,6 +37,7 @@
 #include "ras-diskerror-handler.h"
 #include "ras-record.h"
 #include "ras-logger.h"
+#include "ras-page-isolation.h"
 
 /*
  * Polling time, if read() doesn't block. Currently, trace_pipe_raw never
@@ -760,6 +761,9 @@ int handle_ras_events(int record_events)
 	ras->page_size = page_size;
 	ras->record_events = record_events;
 
+	/* FIXME: enable memory isolation unconditionally */
+	ras_page_account_init();
+
 	rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event",
 			       ras_mc_event_handler, NULL, MC_EVENT);
 	if (!rc)
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
index deb7e05..bfbe1ef 100644
--- a/ras-mc-handler.c
+++ b/ras-mc-handler.c
@@ -23,6 +23,7 @@
 #include "ras-mc-handler.h"
 #include "ras-record.h"
 #include "ras-logger.h"
+#include "ras-page-isolation.h"
 #include "ras-report.h"
 
 int ras_mc_event_handler(struct trace_seq *s,
@@ -183,6 +184,10 @@ int ras_mc_event_handler(struct trace_seq *s,
 
 	ras_store_mc_event(ras, &ev);
 
+	/* Account page corrected errors */
+	if (!strcmp(ev.error_type, "Corrected"))
+		ras_record_page_error(ev.address, ev.error_count, now);
+
 #ifdef HAVE_ABRT_REPORT
 	/* Report event to ABRT */
 	ras_report_mc_event(ras, &ev);
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
new file mode 100644
index 0000000..1bd04e4
--- /dev/null
+++ b/ras-page-isolation.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2015 Yun Wu (Abel)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "ras-logger.h"
+#include "ras-page-isolation.h"
+
+/*
+ * Unit tables are ordered from the largest unit to the smallest one.
+ * parse_isolation_env() multiplies the configured value by the factor of
+ * the matched unit and of every entry that follows it, so "m" in
+ * threshold_units scales by 1000 * 1000 and "d" in cycle_units by
+ * 24 * 60 * 60.
+ */
+static const struct config threshold_units[] = {
+	{ "m",	1000 },
+	{ "k",	1000 },
+	{ "",	1 },
+	{}
+};
+
+static const struct config cycle_units[] = {
+	{ "d",	24 },
+	{ "h",	60 },
+	{ "m",	60 },
+	{}
+};
+
+/* Built-in defaults; .env and .unit are replaced once the environment is parsed. */
+static struct isolation threshold = {
+	.name = "PAGE_CE_THRESHOLD",
+	.units = threshold_units,
+	.env = "50",
+	.unit = "",
+};
+
+static struct isolation cycle = {
+	.name = "PAGE_CE_REFRESH_CYCLE",
+	.units = cycle_units,
+	.env = "24h",
+	.unit = "h",
+};
+
+/* sysfs files that accept the address of a page to offline, indexed by enum otype. */
+static const char *kernel_offline[] = {
+	[OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page",
+	[OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page",
+	[OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page",
+};
+
+/* Accepted PAGE_CE_ACTION values; entries must stay in enum otype order. */
+static const struct config offline_choice[] = {
+	{ "off", OFFLINE_OFF },
+	{ "account", OFFLINE_ACCOUNT },
+	{ "soft", OFFLINE_SOFT },
+	{ "hard", OFFLINE_HARD },
+	{ "soft-then-hard", OFFLINE_SOFT_THEN_HARD },
+	{}
+};
+
+/* Printable names of enum pstate, used when logging offline results. */
+static const char *page_state[] = {
+	[PAGE_ONLINE] = "online",
+	[PAGE_OFFLINE] = "offlined",
+	[PAGE_OFFLINE_FAILED] = "offline-failed",
+};
+
+static enum otype offline = OFFLINE_SOFT;
+static struct rb_root page_records;
+
+/*
+ * Select the offline action from PAGE_CE_ACTION (case-insensitive). An unset
+ * or unknown value keeps the default "soft". When the chosen action needs a
+ * sysfs offline file that is not writable, fall back to accounting only.
+ */
+static void page_offline_init(void)
+{
+	const char *env = "PAGE_CE_ACTION";
+	char *choice = getenv(env);
+	const struct config *c = NULL;
+	int matched = 0;
+
+	if (choice) {
+		for (c = offline_choice; c->name; c++) {
+			if (!strcasecmp(choice, c->name)) {
+				offline = c->val;
+				matched = 1;
+				break;
+			}
+		}
+	}
+
+	if (!matched)
+		log(TERM, LOG_INFO, "Improper %s, set to default soft\n", env);
+
+	if (offline > OFFLINE_ACCOUNT && access(kernel_offline[offline], W_OK)) {
+		log(TERM, LOG_INFO, "Kernel does not support page offline interface\n");
+		offline = OFFLINE_ACCOUNT;
+	}
+
+	log(TERM, LOG_INFO, "Page offline choice on Corrected Errors is %s\n",
+	    offline_choice[offline].name);
+}
+
+/*
+ * Parse "<digits>[unit]" from the environment variable config->name into
+ * config->val, scaled by the matching entry of config->units. A missing,
+ * malformed or zero value is replaced by the built-in default in
+ * config->env, which is then parsed the same way.
+ */
+static void parse_isolation_env(struct isolation *config)
+{
+	char *env = getenv(config->name), *unit = NULL;
+	const struct config *units = NULL;
+	unsigned long value;
+	int no_unit, unit_matched;
+	int last, i;
+
+reparse:
+	/* Start a new round */
+	no_unit = unit_matched = 0;
+
+	/* Environments could be un-configured */
+	if (!env || !strlen(env))
+		goto use_default;
+
+	/* Index of the last char of environment */
+	last = strlen(env) - 1;
+	unit = env + last;
+	if (isdigit((unsigned char)*unit)) {
+		unit = config->unit;
+		no_unit = 1;
+	}
+
+	/* Only decimal digit can be accepted */
+	for (i = 0; i < last; i++) {
+		if (!isdigit((unsigned char)env[i]))
+			goto use_default;
+	}
+
+	/* Check if value is valid or not */
+	if (sscanf(env, "%lu", &value) < 1 || !value)
+		goto use_default;
+
+	/* Apply the matched unit and every smaller unit listed after it */
+	for (units = config->units; units->name; units++) {
+		if (!strcasecmp(unit, units->name))
+			unit_matched = 1;
+		if (unit_matched)
+			value *= units->val;
+	}
+
+	/* Improper unit */
+	if (!unit_matched)
+		goto use_default;
+
+	config->env = env;
+	config->val = value;
+	/* The unit is logged separately only when env itself carries none */
+	config->unit = no_unit ? unit : "";
+	return;
+
+use_default:
+	log(TERM, LOG_INFO, "Improper %s, set to default %s.\n",
+	    config->name, config->env);
+
+	env = config->env;
+	goto reparse;
+}
+
+/* Parse the threshold and the refresh cycle unless page accounting is off. */
+static void page_isolation_init(void)
+{
+	/**
+	 * It's unnecessary to parse threshold configuration when offline
+	 * choice is off.
+	 */
+	if (offline == OFFLINE_OFF)
+		return;
+
+	parse_isolation_env(&threshold);
+	parse_isolation_env(&cycle);
+	log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s%s / %s%s\n",
+	    threshold.env, threshold.unit, cycle.env, cycle.unit);
+}
+
+/*
+ * Read the page isolation settings from the environment. threshold.val and
+ * cycle.val are only set here, so call it before ras_record_page_error().
+ */
+void ras_page_account_init(void)
+{
+	page_offline_init();
+	page_isolation_init();
+}
+
+/*
+ * Write addr to the sysfs file of the given offline type. Returns 0 on
+ * success and -1 when the file cannot be opened or the write fails.
+ */
+static int do_page_offline(unsigned long long addr, enum otype type)
+{
+	FILE *offline_file;
+	int err;
+
+	offline_file = fopen(kernel_offline[type], "w");
+	if (!offline_file)
+		return -1;
+
+	err = fprintf(offline_file, "%#llx", addr) < 0 ? -1 : 0;
+
+	/*
+	 * stdio buffers the output, so the write() that the kernel may reject
+	 * only happens when the stream is flushed by fclose().
+	 */
+	if (fclose(offline_file) == EOF)
+		err = -1;
+
+	return err;
+}
+
+/*
+ * Offline the page according to the configured action and remember the
+ * outcome in pr->offlined, so that each page is attempted only once.
+ */
+static void page_offline(struct page_record *pr)
+{
+	unsigned long long addr = pr->addr;
+	int ret;
+
+	/* Offlining page is not required */
+	if (offline <= OFFLINE_ACCOUNT)
+		return;
+
+	/* Ignore offlined pages */
+	if (pr->offlined != PAGE_ONLINE)
+		return;
+
+	/* Time to silence this noisy page */
+	if (offline == OFFLINE_SOFT_THEN_HARD) {
+		ret = do_page_offline(addr, OFFLINE_SOFT);
+		if (ret < 0)
+			ret = do_page_offline(addr, OFFLINE_HARD);
+	} else {
+		ret = do_page_offline(addr, offline);
+	}
+
+	pr->offlined = ret < 0 ? PAGE_OFFLINE_FAILED : PAGE_OFFLINE;
+
+	log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n",
+	    addr, page_state[pr->offlined]);
+}
+
+/*
+ * Add count corrected errors seen at time to the record and offline the
+ * page once the per-cycle threshold is reached.
+ */
+static void page_record(struct page_record *pr, unsigned count, time_t time)
+{
+	unsigned long period = time - pr->start;
+	unsigned long tolerate;
+
+	if (period >= cycle.val) {
+		/**
+		 * Since we don't refresh automatically, it is possible that the period
+		 * between two occurrences is longer than the pre-configured refresh
+		 * cycle. In this case, we tolerate the frequency of the whole period
+		 * up to the pre-configured threshold.
+		 */
+		tolerate = (period / (double)cycle.val) * threshold.val;
+		pr->count -= (tolerate > pr->count) ? pr->count : tolerate;
+		pr->start = time;
+		pr->excess = 0;
+	}
+
+	pr->count += count;
+	if (pr->count >= threshold.val) {
+		log(TERM, LOG_INFO, "Corrected Errors at %#llx exceed threshold\n", pr->addr);
+
+		/**
+		 * Backup ce count of current cycle to enable next round, which actually
+		 * should never happen if we can disable overflow completely in the same
+		 * time unit (but sadly we can't).
+		 */
+		pr->excess += pr->count;
+		pr->count = 0;
+		page_offline(pr);
+	}
+}
+
+/*
+ * Return the record of the page at addr, inserting a zeroed one into
+ * page_records when none exists. Returns NULL if the allocation fails.
+ */
+static struct page_record *page_lookup_insert(unsigned long long addr)
+{
+	struct rb_node **entry = &page_records.rb_node;
+	struct rb_node *parent = NULL;
+	struct page_record *pr = NULL, *find = NULL;
+
+	while (*entry) {
+		parent = *entry;
+		pr = rb_entry(parent, struct page_record, entry);
+		if (addr == pr->addr) {
+			return pr;
+		} else if (addr < pr->addr) {
+			entry = &(*entry)->rb_left;
+		} else {
+			entry = &(*entry)->rb_right;
+		}
+	}
+
+	find = calloc(1, sizeof(*find));
+	if (!find) {
+		log(TERM, LOG_ERR, "No memory for page records\n");
+		return NULL;
+	}
+
+	find->addr = addr;
+	rb_link_node(&find->entry, parent, entry);
+	rb_insert_color(&find->entry, &page_records);
+
+	return find;
+}
+
+/*
+ * Account count corrected errors reported at address addr at time. The
+ * address is rounded down to its page; nothing is done when the configured
+ * action is "off".
+ */
+void ras_record_page_error(unsigned long long addr, unsigned count, time_t time)
+{
+	struct page_record *pr = NULL;
+
+	if (offline == OFFLINE_OFF)
+		return;
+
+	pr = page_lookup_insert(addr & PAGE_MASK);
+	if (pr) {
+		if (!pr->start)
+			pr->start = time;
+		page_record(pr, count, time);
+	}
+}
diff --git a/ras-page-isolation.h b/ras-page-isolation.h
new file mode 100644
index 0000000..6aefa1e
--- /dev/null
+++ b/ras-page-isolation.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2015 Yun Wu (Abel)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef __RAS_PAGE_ISOLATION_H
+#define __RAS_PAGE_ISOLATION_H
+
+#include <time.h>
+#include "rbtree.h"
+
+/* Accounting granularity: errors are grouped per 4 KiB (1 << 12) page. */
+#define PAGE_SHIFT		12
+#define PAGE_SIZE		(1 << PAGE_SHIFT)
+#define PAGE_MASK		(~(PAGE_SIZE-1))
+
+/* A name/value pair: a unit with its factor, or an action with its enum otype. */
+struct config {
+	char	*name;
+	int	val;
+};
+
+/* Action selected by PAGE_CE_ACTION; ras-page-isolation.c relies on this order. */
+enum otype {
+	OFFLINE_OFF,
+	OFFLINE_ACCOUNT,
+	OFFLINE_SOFT,
+	OFFLINE_HARD,
+	OFFLINE_SOFT_THEN_HARD,
+};
+
+/* Offline state of an accounted page. */
+enum pstate {
+	PAGE_ONLINE,
+	PAGE_OFFLINE,
+	PAGE_OFFLINE_FAILED,
+};
+
+/* Per-page accounting record, kept in an rbtree keyed by addr. */
+struct page_record {
+	struct rb_node		entry;		/* rbtree linkage */
+	unsigned long long	addr;		/* page-aligned address */
+	time_t			start;		/* start of the current cycle */
+	enum pstate		offlined;	/* result of the offline attempt */
+	unsigned long		count;		/* errors in the current cycle */
+	unsigned long		excess;		/* errors moved out of count at threshold */
+};
+
+/* One tunable read from the environment. */
+struct isolation {
+	char			*name;		/* environment variable name */
+	char			*env;		/* value string in use (default until parsed) */
+	const struct config	*units;		/* accepted units, largest first */
+	unsigned long		val;		/* parsed value after unit scaling */
+	char			*unit;		/* unit to log after env, "" if env has one */
+};
+
+/* Read the PAGE_CE_* settings from the environment; call once at start-up. */
+void ras_page_account_init(void);
+/* Account count corrected errors at address addr seen at time. */
+void ras_record_page_error(unsigned long long addr, unsigned count, time_t time);
+
+#endif
-- 
2.19.1