From 93d2518d8dd29b83106c722bb2a3e4c9597ae8eb Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 23 Nov 2023 17:47:25 +0800 Subject: [PATCH] rasdaemon: add mc_event trigger Allow users to run a trigger when a RAS mc_event occurs. The mc_event trigger is separated into a CE trigger and a UE trigger because CE is more frequent than UE, and the CE trigger will lead to more performance hits. Users can choose different triggers for CE/UE to reduce this effect. Users can configure triggers in /etc/sysconfig/rasdaemon: TRIGGER_DIR: The trigger directory MC_CE_TRIGGER: The script executed when a corrected error occurs. MC_UE_TRIGGER: The script executed when an uncorrected error occurs. No script will be executed if MC_CE_TRIGGER/MC_UE_TRIGGER is unset or empty. Signed-off-by: Ruidong Tian --- Makefile.am | 6 +-- contrib/mc_event_trigger | 24 ++++++++++++ misc/rasdaemon.env | 18 ++++++++- ras-events.c | 17 +++++++++ ras-mc-handler.c | 81 ++++++++++++++++++++++++++++++++++++++++ ras-mc-handler.h | 2 + trigger.c | 61 ++++++++++++++++++++++++++++++ trigger.h | 13 +++++++ 8 files changed, 217 insertions(+), 5 deletions(-) create mode 100755 contrib/mc_event_trigger create mode 100644 trigger.c create mode 100644 trigger.h diff --git a/Makefile.am b/Makefile.am index 9dd42c9..8b35cc5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ all-local: $(SYSTEMD_SERVICES) sbin_PROGRAMS = rasdaemon rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ - bitfield.c + bitfield.c trigger.c if WITH_SQLITE3 rasdaemon_SOURCES += ras-record.c endif @@ -93,7 +93,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ - non-standard-jaguarmicro.h + non-standard-jaguarmicro.h trigger.h # This rule can't be called with more than one Makefile job 
(like make -j8) # I can't figure out a way to fix that @@ -120,6 +120,4 @@ upload: # custom target install-data-local: $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" -if WITH_MEMORY_CE_PFA $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" -endif diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger new file mode 100755 index 0000000..5c6ccfa --- /dev/null +++ b/contrib/mc_event_trigger @@ -0,0 +1,24 @@ +#!/bin/sh +# This shell script can be executed by rasdaemon in daemon mode when an +# mc_event occurs; environment variables include all information +# reported by the tracepoint. +# +# environment: +# TIMESTAMP Timestamp when error occurred +# COUNT Number of errors of the same type +# TYPE Error type from Corrected/Uncorrected +# MESSAGE Error message +# LABEL Label of the affected DIMM(s) +# MC_INDEX DIMM identifier from DMI/SMBIOS if available +# TOP_LAYER Top layer of the error +# MIDDLE_LAYER Middle layer of the error +# LOWER_LAYER Low layer of the error +# ADDRESS Error address +# GRAIN Minimum granularity for an error report, in bytes +# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable) +# DRIVER_DETAIL Other driver-specific detail about the error +# + +[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local + +exit 0 diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env index 7cb18e8..3389a73 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env @@ -43,4 +43,20 @@ CPU_CE_THRESHOLD="18" CPU_ISOLATION_CYCLE="24h" # Prevent excessive isolation from causing an avalanche effect -CPU_ISOLATION_LIMIT="10" \ No newline at end of file +CPU_ISOLATION_LIMIT="10" + +# Event Trigger + +# Event trigger will be executed when the specified event occurs. +# +# Execute triggers path +# For example: TRIGGER_DIR=/etc/ras/triggers +TRIGGER_DIR= + +# Execute these triggers when an mc_event occurs; the triggers will not +# be executed if the trigger is not specified. 
+# For example: +# MC_CE_TRIGGER=mc_event_trigger +# MC_UE_TRIGGER=mc_event_trigger +MC_CE_TRIGGER= +MC_UE_TRIGGER= diff --git a/ras-events.c b/ras-events.c index a097238..4c8dd8c 100644 --- a/ras-events.c +++ b/ras-events.c @@ -45,6 +45,7 @@ #include "ras-logger.h" #include "ras-page-isolation.h" #include "ras-cpu-isolation.h" +#include "trigger.h" /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never @@ -62,6 +63,10 @@ extern char *choices_disable; +const static struct event_trigger event_triggers[] = { + { "mc_event", &mc_event_trigger_setup }, +}; + static int get_debugfs_dir(char *tracing_dir, size_t len) { FILE *fp; @@ -277,6 +282,16 @@ int toggle_ras_mc_event(int enable) return rc; } +static void setup_event_trigger(char *event) +{ + struct event_trigger trigger; + for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) { + trigger = event_triggers[i]; + if (!strcmp(event, trigger.name)) + trigger.setup(); + } +} + #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0) /* * Set kernel filter. 
libtrace doesn't provide an API for setting filters @@ -871,6 +886,8 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, return EINVAL; } + setup_event_trigger(event); + log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event); return 0; diff --git a/ras-mc-handler.c b/ras-mc-handler.c index d93ba57..2f06a01 100644 --- a/ras-mc-handler.c +++ b/ras-mc-handler.c @@ -15,16 +15,91 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#define _GNU_SOURCE #include #include #include #include #include +#include #include "ras-mc-handler.h" #include "ras-record.h" #include "ras-logger.h" #include "ras-page-isolation.h" #include "ras-report.h" +#include "trigger.h" + +#define MAX_ENV 30 +static char *mc_ce_trigger; +static char *mc_ue_trigger; + +void mc_event_trigger_setup(void) +{ + mc_ce_trigger = getenv("MC_CE_TRIGGER"); + if (!mc_ce_trigger || !strcmp(mc_ce_trigger, "") + || trigger_check(mc_ce_trigger) < 0) { + log(SYSLOG, LOG_ERR, "Cannot access mc_event ce trigger `%s`\n", + mc_ce_trigger); + } else + log(SYSLOG, LOG_INFO, "Setup mc_event ce trigger `%s`\n", + mc_ce_trigger); + + mc_ue_trigger = getenv("MC_UE_TRIGGER"); + if (!mc_ue_trigger || !strcmp(mc_ue_trigger, "") + || trigger_check(mc_ue_trigger) < 0) { + log(SYSLOG, LOG_ERR, "Cannot access mc_event ue trigger `%s`\n", + mc_ue_trigger); + } else + log(SYSLOG, LOG_INFO, "Setup mc_event ue trigger `%s`\n", + mc_ue_trigger); +} + +static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger) +{ + char *env[MAX_ENV]; + int ei = 0; + int i; + + if (!mc_trigger || !strcmp(mc_trigger, "")) + return; + + if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) + goto free; + if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) + goto free; + if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0) + goto free; + if (asprintf(&env[ei++], 
"TYPE=%s", ev->error_type) < 0) + goto free; + if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) + goto free; + if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) + goto free; + if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) + goto free; + if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) + goto free; + if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) + goto free; + if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) + goto free; + if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) + goto free; + if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) + goto free; + if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0) + goto free; + if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) + goto free; + env[ei] = NULL; + assert(ei < MAX_ENV); + + run_trigger(mc_trigger, NULL, env, "mc_event"); + +free: + for (i = 0; i < ei; i++) + free(env[i]); +} int ras_mc_event_handler(struct trace_seq *s, struct tep_record *record, @@ -194,6 +269,12 @@ int ras_mc_event_handler(struct trace_seq *s, ras_report_mc_event(ras, &ev); #endif + if (!strcmp(ev.error_type, "Corrected")) + run_mc_trigger(&ev, mc_ce_trigger); + + if (!strcmp(ev.error_type, "Uncorrected")) + run_mc_trigger(&ev, mc_ue_trigger); + return 0; parse_error: diff --git a/ras-mc-handler.h b/ras-mc-handler.h index afc0005..a7637b2 100644 --- a/ras-mc-handler.h +++ b/ras-mc-handler.h @@ -22,6 +22,8 @@ #include "ras-events.h" #include +void mc_event_trigger_setup(void); + int ras_mc_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); diff --git a/trigger.c b/trigger.c new file mode 100644 index 0000000..48c88ea --- /dev/null +++ b/trigger.c @@ -0,0 +1,61 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include "ras-logger.h" +#include "trigger.h" + +void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter) +{ + pid_t child; + char *path; + 
int status; + char *trigger_dir = getenv("TRIGGER_DIR"); + + + log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); + + if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0) + return; + + child = fork(); + if (child < 0) { + log(SYSLOG, LOG_ERR, "Cannot create process for trigger"); + return; + } + + if (child == 0) { + execve(path, argv, env); + _exit(127); + } else { + waitpid(child, &status, 0); + if (WIFEXITED(status) && WEXITSTATUS(status)) { + log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d", + trigger, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d", + trigger, WTERMSIG(status)); + } + } +} + +int trigger_check(char *s) +{ + char *name; + int rc; + char *trigger_dir = getenv("TRIGGER_DIR"); + + if (trigger_dir) { + if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) + return -1; + } else + name = s; + + rc = access(name, R_OK|X_OK); + + if (trigger_dir) + free(name); + + return rc; +} diff --git a/trigger.h b/trigger.h new file mode 100644 index 0000000..556a7f2 --- /dev/null +++ b/trigger.h @@ -0,0 +1,13 @@ +#ifndef __TRIGGER_H__ +#define __TRIGGER_H__ + +struct event_trigger { + const char *name; + void (*setup)(void); +}; + +int trigger_check(char *s); +void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter); + + +#endif