Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New feature: support memory row CE threshold policy #150

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
endif
if WITH_MEMORY_CE_PFA
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
endif
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
if WITH_AMP_NS_DECODE
rasdaemon_SOURCES += non-standard-ampere.c
endif
Expand Down
11 changes: 11 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"],
AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"])

dnl Optional feature: memory row Corrected Error predictive failure analysis.
dnl Turned on when enable_memory_row_ce_pfa or enable_all is set to yes.
AC_ARG_ENABLE([memory_row_ce_pfa],
AS_HELP_STRING([--enable-memory-row-ce-pfa], [enable memory row Corrected Error predictive failure analysis]))

dnl String comparison uses a single equals sign: the double form is a shell
dnl extension that strict POSIX shells reject inside test.
AS_IF([test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_all" = "xyes"], [
AC_DEFINE(HAVE_MEMORY_ROW_CE_PFA,1,"have memory row corrected error predictive failure analysis")
AC_SUBST([WITH_MEMORY_ROW_CE_PFA])
])
AM_CONDITIONAL([WITH_MEMORY_ROW_CE_PFA], [test x$enable_memory_row_ce_pfa = xyes || test x$enable_all = xyes])
dnl USE_MEMORY_ROW_CE_PFA holds yes or no, mirroring the conditional above.
AM_COND_IF([WITH_MEMORY_ROW_CE_PFA], [USE_MEMORY_ROW_CE_PFA="yes"], [USE_MEMORY_ROW_CE_PFA="no"])

AC_ARG_ENABLE([amp_ns_decode],
AS_HELP_STRING([--enable-amp-ns-decode], [enable AMP_NS_DECODE events (currently experimental)]))

Expand Down Expand Up @@ -262,6 +272,7 @@ compile time options summary
Memory Failure : $USE_MEMORY_FAILURE
CXL events : $USE_CXL
Memory CE PFA : $USE_MEMORY_CE_PFA
Memory ROW CE PFA : $USE_MEMORY_ROW_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
YITIAN RAS errors : $USE_YITIAN_NS_DECODE
Expand Down
26 changes: 26 additions & 0 deletions misc/rasdaemon.env
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,32 @@
PAGE_CE_REFRESH_CYCLE="24h"
PAGE_CE_THRESHOLD="50"

# Specify the threshold of isolating buggy memory rows.
#
# Format:
# [0-9]+[unit]
# Note: make sure the value matches this format; rasdaemon falls back to the default value for malformed input.
#
# Supported units:
# ROW_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour
# ROW_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none
#
# These two settings have no effect when ROW_CE_ACTION is "off".
ROW_CE_REFRESH_CYCLE="24h"
ROW_CE_THRESHOLD="50"

# Specify the action rasdaemon takes internally when a row error threshold is exceeded.
#
# off no action
# account only account errors
# soft try to soft-offline row without killing any processes
# This requires an up-to-date kernel. Might not be successful.
# hard try to hard-offline row by killing processes
# Requires an up-to-date kernel. Might not be successful.
# soft-then-hard First try to soft offline, then try hard offlining.
# Note: default offline choice is "off".
ROW_CE_ACTION="off"

# Specify the action rasdaemon takes internally when a page error threshold is exceeded.
#
# off no action
Expand Down
8 changes: 8 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,10 @@ int handle_ras_events(int record_events)
ras->page_size = page_size;
ras->record_events = record_events;

#ifdef HAVE_MEMORY_ROW_CE_PFA
ras_row_account_init();
#endif

#ifdef HAVE_MEMORY_CE_PFA
/* FIXME: enable memory isolation unconditionally */
ras_page_account_init();
Expand Down Expand Up @@ -1215,5 +1219,9 @@ int handle_ras_events(int record_events)
#ifdef HAVE_CPU_FAULT_ISOLATION
cpu_infos_free();
#endif

#ifdef HAVE_MEMORY_ROW_CE_PFA
row_record_infos_free();
#endif
return rc;
}
16 changes: 16 additions & 0 deletions ras-mc-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <traceevent/kbuffer.h>
#include <unistd.h>

Expand Down Expand Up @@ -265,6 +266,21 @@ int ras_mc_event_handler(struct trace_seq *s,
ras_record_page_error(ev.address, ev.error_count, now);
#endif

#ifdef HAVE_MEMORY_ROW_CE_PFA
/* Account row corrected errors */
struct timespec ts;
clockid_t clk_id = CLOCK_MONOTONIC;
// Some BIOSes report an error_count of 0 even though a fault did occur.
// This is a BIOS bug, so treat a reported count of 0 as a single error.
if (ev.error_count == 0)
ev.error_count = 1;
if (clock_gettime(clk_id, &ts) == 0 && !strcmp(ev.error_type, "Corrected")) {
ras_record_row_error(ev.driver_detail, ev.error_count, ts.tv_sec, ev.address);
}
#endif

#ifdef HAVE_ABRT_REPORT
/* Report event to ABRT */
ras_report_mc_event(ras, &ev);
Expand Down
Loading