diff --git a/Makefile.am b/Makefile.am index d4872a9..a64a1c9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -69,9 +69,7 @@ endif if WITH_HISI_NS_DECODE rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c endif -if WITH_MEMORY_CE_PFA - rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -endif +rasdaemon_SOURCES += rbtree.c ras-page-isolation.c if WITH_AMP_NS_DECODE rasdaemon_SOURCES += non-standard-ampere.c endif diff --git a/configure.ac b/configure.ac index 1059f3c..3668672 100644 --- a/configure.ac +++ b/configure.ac @@ -182,6 +182,17 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"], AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all = xyes]) AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) +AC_ARG_ENABLE([memory_row_ce_pfa], + AS_HELP_STRING([--enable-memory-row-ce-pfa], [enable memory row Corrected Error predictive failure analysis])) + +dnl POSIX test only defines the = operator; == is a bashism that breaks configure under dash. +AS_IF([test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"], [ + AC_DEFINE(HAVE_MEMORY_ROW_CE_PFA,1,"have memory row corrected error predictive failure analysis") + AC_SUBST([WITH_MEMORY_ROW_CE_PFA]) +]) +AM_CONDITIONAL([WITH_MEMORY_ROW_CE_PFA], [test x$enable_memory_row_ce_pfa = xyes || test x$enable_all = xyes]) +AM_COND_IF([WITH_MEMORY_ROW_CE_PFA], [USE_MEMORY_ROW_CE_PFA="yes"], [USE_MEMORY_ROW_CE_PFA="no"]) + AC_ARG_ENABLE([amp_ns_decode], AS_HELP_STRING([--enable-amp-ns-decode], [enable AMP_NS_DECODE events (currently experimental)])) @@ -262,6 +273,7 @@ compile time options summary Memory Failure : $USE_MEMORY_FAILURE CXL events : $USE_CXL Memory CE PFA : $USE_MEMORY_CE_PFA + Memory ROW CE PFA : $USE_MEMORY_ROW_CE_PFA AMP RAS errors : $USE_AMP_NS_DECODE CPU fault isolation : $USE_CPU_FAULT_ISOLATION YITIAN RAS errors : $USE_YITIAN_NS_DECODE diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env index 975eafe..963aaa0 100644 --- a/misc/rasdaemon.env +++ 
b/misc/rasdaemon.env @@ -18,6 +18,32 @@ PAGE_CE_REFRESH_CYCLE="24h" PAGE_CE_THRESHOLD="50" +# Specify the threshold for isolating faulty memory rows. +# +# Format: +# [0-9]+[unit] +# Notice: make sure the value matches this format; rasdaemon falls back to the default value for malformed input. +# +# Supported units: +# ROW_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour +# ROW_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none +# +# These two settings have no effect when ROW_CE_ACTION is "off". +ROW_CE_REFRESH_CYCLE="24h" +ROW_CE_THRESHOLD="50" + +# Specify the action rasdaemon takes when a row exceeds its error threshold. +# +# off no action +# account only account errors +# soft try to soft-offline row without killing any processes +# This requires an up-to-date kernel. Might not be successful. +# hard try to hard-offline row by killing processes +# Requires an up-to-date kernel. Might not be successful. +# soft-then-hard First try to soft offline, then try hard offlining. +# Note: default offline choice is "off". +ROW_CE_ACTION="off" + # Specify the internal action in rasdaemon to exceeding a page error threshold. 
# # off no action diff --git a/ras-events.c b/ras-events.c index b6e80b2..f58ccea 100644 --- a/ras-events.c +++ b/ras-events.c @@ -953,6 +953,10 @@ int handle_ras_events(int record_events) ras->page_size = page_size; ras->record_events = record_events; +#ifdef HAVE_MEMORY_ROW_CE_PFA + ras_row_account_init(); +#endif + #ifdef HAVE_MEMORY_CE_PFA /* FIXME: enable memory isolation unconditionally */ ras_page_account_init(); @@ -1215,5 +1219,9 @@ int handle_ras_events(int record_events) #ifdef HAVE_CPU_FAULT_ISOLATION cpu_infos_free(); #endif + +#ifdef HAVE_MEMORY_ROW_CE_PFA + row_record_infos_free(); +#endif return rc; } diff --git a/ras-mc-handler.c b/ras-mc-handler.c index 191c045..8f10744 100644 --- a/ras-mc-handler.c +++ b/ras-mc-handler.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -265,6 +266,21 @@ int ras_mc_event_handler(struct trace_seq *s, ras_record_page_error(ev.address, ev.error_count, now); #endif +#ifdef HAVE_MEMORY_ROW_CE_PFA + /* Account row corrected errors */ + struct timespec ts; + clockid_t clk_id = CLOCK_MONOTONIC; + // A fault occurs, but the fault error_count BIOS reports sometimes is 0. + // This is a bug in the BIOS. + // We set the value to 1 + // even if the error_count is reported 0. 
+ if (ev.error_count == 0) + ev.error_count = 1; + if (clock_gettime(clk_id, &ts) == 0 && !strcmp(ev.error_type, "Corrected")) { + ras_record_row_error(ev.driver_detail, ev.error_count, ts.tv_sec, ev.address); + } +#endif + #ifdef HAVE_ABRT_REPORT /* Report event to ABRT */ ras_report_mc_event(ras, &ev); diff --git a/ras-page-isolation.c b/ras-page-isolation.c index bb6b777..0ffebbf 100644 --- a/ras-page-isolation.c +++ b/ras-page-isolation.c @@ -17,6 +17,9 @@ #include "ras-page-isolation.h" #define PARSED_ENV_LEN 50 +#define ROW_ID_MAX_LEN 200 +#define SAME_PAGE_IN_ROW 200 + static const struct config threshold_units[] = { { "m", 1000 }, { "k", 1000 }, @@ -39,6 +42,13 @@ static struct isolation threshold = { .unit = "", }; +static struct isolation row_threshold = { + .name = "ROW_CE_THRESHOLD", + .units = threshold_units, + .env = "50", + .unit = "", +}; + static struct isolation cycle = { .name = "PAGE_CE_REFRESH_CYCLE", .units = cycle_units, @@ -46,6 +56,13 @@ static struct isolation cycle = { .unit = "h", }; +static struct isolation row_cycle = { + .name = "ROW_CE_REFRESH_CYCLE", + .units = cycle_units, + .env = "24h", + .unit = "h", +}; + static const char * const kernel_offline[] = { [OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page", [OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page", @@ -68,7 +85,9 @@ static const char * const page_state[] = { }; static enum otype offline = OFFLINE_SOFT; +static enum otype row_offline_action = OFFLINE_OFF; static struct rb_root page_records; +LIST_HEAD(row_listhead, row_record) row_head; static void page_offline_init(void) { @@ -95,6 +114,11 @@ static void page_offline_init(void) offline = OFFLINE_ACCOUNT; } + if (row_offline_action != OFFLINE_OFF) { + log(TERM, LOG_INFO, "row threshold is open, so turn off page threshold\n"); + offline = OFFLINE_OFF; + } + log(TERM, LOG_INFO, "Page offline choice on Corrected Errors is %s\n", offline_choice[offline].name); } @@ -199,6 +223,63 @@ static void 
page_isolation_init(void) threshold_string, cycle_string); } +static void row_offline_init(void) +{ + const char *env = "ROW_CE_ACTION"; + char *choice = getenv(env); + const struct config *c = NULL; + int matched = 0; + + if (choice) { + for (c = offline_choice; c->name; c++) { + if (!strcasecmp(choice, c->name)) { + row_offline_action = c->val; + matched = 1; + break; + } + } + } + + if (!matched){ + log(TERM, LOG_INFO, "Improper %s, set to default off\n", env); + } + + if (row_offline_action > OFFLINE_ACCOUNT && access(kernel_offline[row_offline_action], W_OK)) { + log(TERM, LOG_INFO, "Kernel does not support row offline interface\n"); + row_offline_action = OFFLINE_ACCOUNT; + } + + log(TERM, LOG_INFO, "Row offline choice on Corrected Errors is %s\n", + offline_choice[row_offline_action].name); +} + +static void row_isolation_init(void) +{ + char threshold_string[PARSED_ENV_LEN]; + char cycle_string[PARSED_ENV_LEN]; + /** + * It's unnecessary to parse threshold configuration when offline + * choice is off. 
+ */ + if (row_offline_action == OFFLINE_OFF) + return; + + parse_isolation_env(&row_threshold); + parse_isolation_env(&row_cycle); + parse_env_string(&row_threshold, threshold_string, sizeof(threshold_string)); + parse_env_string(&row_cycle, cycle_string, sizeof(cycle_string)); + log(TERM, LOG_INFO, "Threshold of memory row Corrected Errors is %s / %s\n", + threshold_string, cycle_string); +} + +void ras_row_account_init(void) +{ + row_offline_init(); + row_isolation_init(); + log(TERM, LOG_INFO, "ras_row_account_init done\n"); +} + + void ras_page_account_init(void) { page_offline_init(); @@ -338,3 +419,324 @@ void ras_record_page_error(unsigned long long addr, unsigned int count, time_t t page_record(pr, count, time); } } +/* memory page CE threshold policy ends */ + +/* memory row CE threshold policy starts */ +const struct memory_location_field apei_fields[] = { + [APEI_NODE] = {.name = "node", .anchor_str = "node:", .value_base = 10}, + [APEI_CARD] = {.name = "card", .anchor_str = "card:", .value_base = 10}, + [APEI_MODULE] = {.name = "module", .anchor_str = "module:", .value_base = 10}, + [APEI_RANK] = {.name = "rank", .anchor_str = "rank:", .value_base = 10}, + [APEI_DEVICE] = {.name = "device", .anchor_str = "device:", .value_base = 10}, + [APEI_BANK] = {.name = "bank", .anchor_str = "bank:", .value_base = 10}, + [APEI_ROW] = {.name = "row", .anchor_str = "row:", .value_base = 10}, +}; + +const struct memory_location_field dsm_fields[] = { + [DSM_ProcessorSocketId] = {.name = "ProcessorSocketId", .anchor_str = "ProcessorSocketId:", .value_base = 16}, + [DSM_MemoryControllerId] = {.name = "MemoryControllerId", .anchor_str = "MemoryControllerId:", .value_base = 16}, + [DSM_ChannelId] = {.name = "ChannelId", .anchor_str = "ChannelId:", .value_base = 16}, + [DSM_DimmSlotId] = {.name = "DimmSlotId", .anchor_str = "DimmSlotId:", .value_base = 16}, + [DSM_PhysicalRankId] = {.name = "PhysicalRankId", .anchor_str = "PhysicalRankId:", .value_base = 16}, + 
[DSM_ChipId] = {.name = "ChipId", .anchor_str = "ChipId:", .value_base = 16}, + [DSM_BankGroup] = {.name = "BankGroup", .anchor_str = "BankGroup:", .value_base = 16}, + [DSM_Bank] = {.name = "Bank", .anchor_str = "Bank:", .value_base = 16}, + [DSM_Row] = {.name = "Row", .anchor_str = "Row:", .value_base = 16}, +}; + +void row_record_get_id(struct row_record *rr, char *buffer) +{ + if (!rr || !buffer) + return; + + int len = 0, field_num = 0; + const struct memory_location_field *fields; + if (rr->type == GHES) { + field_num = APEI_FIELD_NUM_CONST; + fields = apei_fields; + } else { + field_num = DSM_FIELD_NUM_CONST; + fields = dsm_fields; + } + /* + * The id buffer is ROW_ID_MAX_LEN bytes at every call site; bound each + * append so nine wide DSM values cannot run past it (output truncates). + */ + len = snprintf(buffer, ROW_ID_MAX_LEN, "{"); + for (int idx = 0; idx < field_num && len < ROW_ID_MAX_LEN; idx++) + len += snprintf(buffer + len, ROW_ID_MAX_LEN - len, "%s:%d%s", fields[idx].name, + rr->location_fields[idx], idx == field_num - 1 ? "" : ","); + if (len < ROW_ID_MAX_LEN) + snprintf(buffer + len, ROW_ID_MAX_LEN - len, "}"); +} + +bool row_record_is_same_row(struct row_record *rr1, struct row_record *rr2) +{ + if (!rr1 || !rr2 || rr1->type != rr2->type) + return false; + + int field_num = 0; + if (rr1->type == GHES) { + field_num = APEI_FIELD_NUM_CONST; + } else { + field_num = DSM_FIELD_NUM_CONST; + } + for (int idx = 0; idx < field_num; idx++) { + if (rr1->location_fields[idx] != rr2->location_fields[idx]) + return false; + } + return true; +} + +void row_record_copy(struct row_record *dst, struct row_record *src) +{ + if (!dst || !src) + return; + + for (int i = 0; i < ROW_LOCATION_FIELDS_NUM; i++) { + dst->location_fields[i] = src->location_fields[i]; + } +} + +static int parse_value(const char* str, const char *anchor_str, int value_base, int *value) { + char *start, *endptr; + int tmp; + + if (!str || !anchor_str || !value) + return 1; + + char *pos = strstr(str, anchor_str); + if (!pos) + return 1; + + errno = 0; + start = pos + strlen(anchor_str); + tmp = (int)strtol(start, &endptr, 
value_base); + + if (errno != 0) { + log(TERM, LOG_ERR, "parse_value error, start: %s, value_base: %d, errno: %d\n", start, value_base, errno); + return 1; + } + + if (endptr == start){ + log(TERM, LOG_ERR, "parse_value error, start: %s, value_base: %d\n", start, value_base); + return 1; + } + *value = tmp; + return 0; +} + +static int parse_row_info(const char *detail, struct row_record *r) { + const struct memory_location_field *fields = NULL; + int field_num; + + if (!detail || !r) + return 1; + + if (strstr(detail, "APEI location")) { + fields = apei_fields; + field_num = APEI_FIELD_NUM_CONST; + r->type = GHES; + } else if (strstr(detail, "ProcessorSocketId:")) { + fields = dsm_fields; + field_num = DSM_FIELD_NUM_CONST; + r->type = DSM; + } else { + return 1; + } + + for (int idx = 0; idx < field_num; idx++) { + if (parse_value(detail, fields[idx].anchor_str, fields[idx].value_base, &r->location_fields[idx])) { + log(TERM, LOG_INFO, "Cannot parse memory row info from CE detail: %s missing\n", fields[idx].name); + return 1; + } + } + return 0; +} + +static void row_offline(struct row_record *rr, time_t time) +{ + int ret; + char row_id[ROW_ID_MAX_LEN] = {0}; + + if (!rr) + return; + row_record_get_id(rr, row_id); + /* Offlining row is not required */ + if (row_offline_action <= OFFLINE_ACCOUNT) { + log(TERM, LOG_INFO, "ROW_CE_ACTION=%s, ignore to offline row at %s\n", + offline_choice[row_offline_action].name, row_id); + return; + } + + struct page_addr *page_info = NULL; + // do offline + unsigned long long addr_list[SAME_PAGE_IN_ROW]; + int addr_list_size = 0; + LIST_FOREACH(page_info, &rr->page_head, entry) { + if (page_info->offlined == PAGE_OFFLINE) { /* always skip offlined pages, even when the dedup list is full */ + if (addr_list_size < SAME_PAGE_IN_ROW) + addr_list[addr_list_size++] = page_info->addr; + continue; + } + + int found = 0; + for (int i = 0; i < addr_list_size; i++) { + if (addr_list[i] == page_info->addr) { + found = 1; + break; + } + } + + if(found){ + page_info->offlined = 
PAGE_OFFLINE; + continue; + } + + /* Time to silence this noisy page */ + if (row_offline_action == OFFLINE_SOFT_THEN_HARD) { + ret = do_page_offline(page_info->addr, OFFLINE_SOFT); + if (ret < 0) + ret = do_page_offline(page_info->addr, OFFLINE_HARD); + } else { + ret = do_page_offline(page_info->addr, row_offline_action); + } + + page_info->offlined = ret < 0 ? PAGE_OFFLINE_FAILED : PAGE_OFFLINE; + + log(TERM, LOG_INFO, "Result of offlining page at %#llx of row %s: %s\n", + page_info->addr, row_id, page_state[page_info->offlined ]); + + if (page_info->offlined == PAGE_OFFLINE && (addr_list_size < SAME_PAGE_IN_ROW)) + addr_list[addr_list_size++] = page_info->addr; + } +} + +static void row_record(struct row_record *rr, time_t time) +{ + if (!rr) + return; + + if (time - rr->start > row_cycle.val) { + struct page_addr *page_info = NULL, *tmp_page_info = NULL; + page_info = LIST_FIRST(&rr->page_head); + while (page_info) { + // delete exceeds row_cycle.val + if (time - page_info->start <= row_cycle.val) + break; + tmp_page_info = LIST_NEXT(page_info, entry); + rr->count -= page_info->count; + LIST_REMOVE(page_info, entry); + free(page_info); + page_info = tmp_page_info; + } + rr->start = page_info ? 
page_info->start : time; + } + + char row_id[ROW_ID_MAX_LEN] = {0}; + row_record_get_id(rr, row_id); + if (rr->count >= row_threshold.val) { + log(TERM, LOG_INFO, "Corrected Errors of row %s exceeded row CE threshold, count=%lu\n", row_id, rr->count); + row_offline(rr, time); + } +} + +static struct row_record *row_lookup_insert(struct row_record *r, unsigned count, unsigned long long addr, time_t time) +{ + struct row_record *rr = NULL, *new_row_record = NULL; + struct page_addr *new_page_addr = NULL, *tail_page_addr = NULL;; + int found = 0; + + if (!r) + return NULL; + // look same row record + LIST_FOREACH(rr, &row_head, entry) { + if (row_record_is_same_row(rr, r)) { + found = 1; + new_row_record = rr; + break; + } + } + + // new row + if (!found){ + new_row_record = calloc(1, sizeof(struct row_record)); + if (!new_row_record) { + log(TERM, LOG_ERR, "No memory for new row record\n"); + return NULL; + } + new_row_record->start = time; + new_row_record->count = 0; + new_row_record->type = r->type; + + LIST_INSERT_HEAD(&row_head, new_row_record, entry); + row_record_copy(new_row_record, r); + } + + // new page + new_page_addr = calloc(1, sizeof(struct page_addr)); + if (!new_page_addr) { + log(TERM, LOG_ERR, "No memory for new page addr\n"); + return NULL; + } + new_page_addr->addr = addr & PAGE_MASK; + new_page_addr->start = time; + new_page_addr->count = count; + + struct page_addr *record = NULL; + int not_empty = 0; + LIST_FOREACH(record, &new_row_record->page_head, entry) { + tail_page_addr = record; + not_empty = 1; + } + if (not_empty) + LIST_INSERT_AFTER(tail_page_addr, new_page_addr, entry); + else + LIST_INSERT_HEAD(&new_row_record->page_head, new_page_addr, entry); + + new_row_record->count += new_page_addr->count; + + return new_row_record; +} + +void ras_record_row_error(const char *detail, unsigned count, time_t time, unsigned long long addr) +{ + struct row_record *pr = NULL; + struct row_record r = {0}; + + if (row_offline_action == OFFLINE_OFF) + 
return; + + if (parse_row_info(detail, &r)) + return; + + pr = row_lookup_insert(&r, count, addr, time); + if (!pr){ + log(TERM, LOG_ERR, "insert CE page structure into CE row structure failed\n"); + return; + } + + row_record(pr, time); +} + +void row_record_infos_free(void) +{ + struct row_record *row_record = NULL, *tmp_row_record = NULL; + struct page_addr *page_addr = NULL, *tmp_page_addr = NULL; + + row_record = LIST_FIRST(&row_head); + while (row_record) { + page_addr = LIST_FIRST(&row_record->page_head); + while (page_addr) { + tmp_page_addr = LIST_NEXT(page_addr, entry); + LIST_REMOVE(page_addr, entry); + free(page_addr); + page_addr = tmp_page_addr; + } + tmp_row_record = LIST_NEXT(row_record, entry); + LIST_REMOVE(row_record, entry); + free(row_record); + row_record = tmp_row_record; + } +} +/* memory row CE threshold policy ends */ diff --git a/ras-page-isolation.h b/ras-page-isolation.h index 73c9157..d5fd3e7 100644 --- a/ras-page-isolation.h +++ b/ras-page-isolation.h @@ -7,6 +7,8 @@ #ifndef __RAS_PAGE_ISOLATION_H #define __RAS_PAGE_ISOLATION_H +#include +#include #include #include @@ -45,6 +47,63 @@ struct page_record { unsigned long excess; }; +enum row_location_type{ + GHES, + DSM +}; +enum apei_location_field_index { + APEI_NODE, + APEI_CARD, + APEI_MODULE, + APEI_RANK, + APEI_DEVICE, + APEI_BANK, + APEI_ROW, + APEI_FIELD_NUM +}; + +enum dsm_location_field_index { + DSM_ProcessorSocketId, + DSM_MemoryControllerId, + DSM_ChannelId, + DSM_DimmSlotId, + DSM_PhysicalRankId, + DSM_ChipId, + DSM_BankGroup, + DSM_Bank, + DSM_Row, + DSM_FIELD_NUM +}; + +#define APEI_FIELD_NUM_CONST (int)APEI_FIELD_NUM +#define DSM_FIELD_NUM_CONST (int)DSM_FIELD_NUM + +struct memory_location_field { + const char *name; + const char *anchor_str; + const int value_base; +}; +extern const struct memory_location_field apei_fields[]; +extern const struct memory_location_field dsm_fields[]; + +struct page_addr { + LIST_ENTRY(page_addr) entry; + unsigned long long addr; + enum 
pstate offlined; + int count; + time_t start; +}; + +#define ROW_LOCATION_FIELDS_NUM (DSM_FIELD_NUM_CONST > APEI_FIELD_NUM_CONST ? DSM_FIELD_NUM_CONST : APEI_FIELD_NUM_CONST) /* larger of the APEI and DSM field counts, so either layout fits in location_fields[] */ +struct row_record { + LIST_ENTRY(row_record) entry; + LIST_HEAD(page_listhead, page_addr) page_head; + enum row_location_type type; + int location_fields[ROW_LOCATION_FIELDS_NUM]; + time_t start; + unsigned long count; +}; + struct isolation { char *name; char *env; @@ -57,5 +116,13 @@ struct isolation { void ras_page_account_init(void); void ras_record_page_error(unsigned long long addr, unsigned int count, time_t time); +void ras_row_account_init(void); +void ras_record_row_error(const char *detail, unsigned count, time_t time, unsigned long long addr); + +void row_record_get_id(struct row_record *rr, char *buffer); +bool row_record_is_same_row(struct row_record *rr1, struct row_record *rr2); +void row_record_copy(struct row_record *dst, struct row_record *src); +void row_record_free(struct row_record *rr); +void row_record_infos_free(void); #endif