diff --git a/Makefile.am b/Makefile.am index 37a1283..37ed0c1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,9 @@ rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ if WITH_SQLITE3 rasdaemon_SOURCES += ras-record.c endif +if WITH_BROADCAST + rasdaemon_SOURCES += ras-server.c +endif if WITH_AER rasdaemon_SOURCES += ras-aer-handler.c endif diff --git a/configure.ac b/configure.ac index 318b1e6..5477cb7 100644 --- a/configure.ac +++ b/configure.ac @@ -39,6 +39,16 @@ AM_COND_IF([WITH_SQLITE3], [USE_SQLITE3="yes"], [USE_SQLITE3="no"]) AC_SUBST([SQLITE3_LIBS]) +AC_ARG_ENABLE([broadcast], + AS_HELP_STRING([--enable-broadcast], [enable broadcast of events using local sockets (currently experimental)])) + +AS_IF([test "x$enable_broadcast" = "xyes" || test "x$enable_all" == "xyes"], [ + AC_DEFINE(HAVE_BROADCAST,1,"broadcast RAS events") + AC_SUBST([WITH_BROADCAST]) +]) + +AM_CONDITIONAL([WITH_BROADCAST], [test x$enable_broadcast = xyes || test x$enable_all == xyes]) +AM_COND_IF([WITH_BROADCAST], [USE_BROADCAST="yes"], [USE_BROADCAST="no"]) AC_ARG_ENABLE([aer], AS_HELP_STRING([--enable-aer], [enable PCIe AER events (currently experimental)])) @@ -169,6 +179,7 @@ compile time options summary ============================ Sqlite3 : $USE_SQLITE3 + broadcast : $USE_BROADCAST AER : $USE_AER MCE : $USE_MCE EXTLOG : $USE_EXTLOG diff --git a/ras-aer-handler.c b/ras-aer-handler.c index 8ddd439..6edcd70 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c @@ -25,6 +25,7 @@ #include "ras-logger.h" #include "bitfield.h" #include "ras-report.h" +#include "ras-server.h" /* bit field meaning for correctable error */ static const char *aer_cor_errors[32] = { @@ -151,5 +152,10 @@ int ras_aer_event_handler(struct trace_seq *s, ras_report_aer_event(ras, &ev); #endif +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(AER_EVENT, &ev); +#endif + return 0; } diff --git a/ras-arm-handler.c b/ras-arm-handler.c index 2f170e2..dccd36e 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -20,6 +20,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" +#include "ras-server.h" int ras_arm_event_handler(struct trace_seq *s, struct pevent_record *record, @@ -88,5 +89,10 @@ int ras_arm_event_handler(struct trace_seq *s, ras_report_arm_event(ras, &ev); #endif +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(ARM_EVENT, &ev); +#endif + return 0; } diff --git a/ras-devlink-handler.c b/ras-devlink-handler.c index e52d66e..197d613 100644 --- a/ras-devlink-handler.c +++ b/ras-devlink-handler.c @@ -25,6 +25,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" +#include "ras-server.h" int ras_net_xmit_timeout_handler(struct trace_seq *s, struct pevent_record *record, @@ -148,5 +149,10 @@ int ras_devlink_event_handler(struct trace_seq *s, ras_report_devlink_event(ras, &ev); #endif +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(DEVLINK_EVENT, &ev); +#endif + return 0; } diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c index b16319f..51f1a11 100644 --- a/ras-diskerror-handler.c +++ b/ras-diskerror-handler.c @@ -29,6 +29,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" +#include "ras-server.h" static const struct { @@ -127,6 +128,11 @@ int ras_diskerror_event_handler(struct trace_seq *s, /* Report event to ABRT */ ras_report_diskerror_event(ras, &ev); #endif + +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(DISKERROR_EVENT, &ev); +#endif free(ev.dev); return 0; } diff --git a/ras-events.c b/ras-events.c index c797b20..27ef509 100644 --- a/ras-events.c +++ b/ras-events.c @@ -38,6 +38,7 @@ #include "ras-devlink-handler.h" #include "ras-diskerror-handler.h" #include "ras-record.h" +#include "ras-server.h" #include "ras-logger.h" #include "ras-page-isolation.h" @@ -761,7 +762,7 @@ static int add_event_handler(struct ras_events *ras, struct pevent *pevent, return 0; } -int handle_ras_events(int record_events) +int handle_ras_events(int record_events, int broadcast_events) { int rc, page_size, i; int num_events = 0; @@ -809,6 +810,9 @@ int handle_ras_events(int record_events) ras_page_account_init(); #endif + if (broadcast_events) + ras->broadcast_events = ras_server_start() == 0; + rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", ras_mc_event_handler, NULL, MC_EVENT); if (!rc) @@ -958,6 +962,9 @@ int handle_ras_events(int record_events) if (pevent) pevent_free(pevent); + if(ras->broadcast_events) + ras_server_stop(); + if (ras) { for (i = 0; i < NR_EVENTS; i++) { if (ras->filters[i]) diff --git a/ras-events.h b/ras-events.h index f028741..0183f79 100644 --- a/ras-events.h +++ b/ras-events.h @@ -50,6 +50,7 @@ struct ras_events { /* Booleans */ unsigned use_uptime: 1; unsigned record_events: 1; + unsigned broadcast_events: 1; /* For timestamp */ time_t uptime_diff; @@ -99,6 +100,6 @@ enum ghes_severity { /* Function prototypes */ int toggle_ras_mc_event(int enable); -int handle_ras_events(int record_events); +int handle_ras_events(int record_events, int broadcast_events); #endif diff --git a/ras-mc-handler.c b/ras-mc-handler.c index 42b05cd..a24b8f9 100644 --- a/ras-mc-handler.c +++ b/ras-mc-handler.c @@ -25,6 +25,7 @@ #include "ras-logger.h" #include "ras-page-isolation.h" #include "ras-report.h" +#include "ras-server.h" int ras_mc_event_handler(struct trace_seq *s, struct pevent_record *record, @@ -195,6 +196,10 @@ int ras_mc_event_handler(struct trace_seq *s, ras_report_mc_event(ras, &ev); #endif +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(MC_EVENT, &ev); +#endif return 0; parse_error: diff --git a/ras-mce-handler.c b/ras-mce-handler.c index 016acae..945fcaf 100644 --- a/ras-mce-handler.c +++ b/ras-mce-handler.c @@ -27,6 +27,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" +#include "ras-server.h" /* * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code, @@ -468,5 +469,10 @@ int ras_mce_event_handler(struct trace_seq *s, ras_report_mce_event(ras, &e); #endif +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(MCE_EVENT, &e); +#endif + return 0; } diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c index 1862335..c7a2699 100644 --- a/ras-non-standard-handler.c +++ b/ras-non-standard-handler.c @@ -21,6 +21,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" +#include "ras-server.h" static struct ras_ns_ev_decoder *ras_ns_ev_dec_list; @@ -224,6 +225,11 @@ int ras_non_standard_event_handler(struct trace_seq *s, ras_report_non_standard_event(ras, &ev); #endif +#ifdef HAVE_BROADCAST + if(ras->broadcast_events) + ras_server_broadcast(NON_STANDARD_EVENT, &ev); +#endif + return 0; } diff --git a/ras-server.c b/ras-server.c new file mode 100644 index 0000000..7e3cbc3 --- /dev/null +++ b/ras-server.c @@ -0,0 +1,363 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ras-server.h" +#include "ras-logger.h" + +static struct ras_server server; + +static void ras_server_cleanup(void* arg) { + // Wait until main thread finishes current broadcast + pthread_mutex_lock(&server.fd_mutex); + server.nclients = 0; // Prevent future broadcasts + pthread_mutex_unlock(&server.fd_mutex); + // De-initialize the remainder of the server attributes + pthread_mutex_destroy(&server.fd_mutex); + if(server.socketfd > 0) { + shutdown(server.socketfd, SHUT_RDWR); + close(server.socketfd); + } + if(server.fds) + free(server.fds); + memset(&server, 0, sizeof(struct ras_server)); + signal(SIGPIPE, SIG_DFL); + log(ALL, LOG_INFO, "RAS server has stoped\n"); +} + +static void cleanup_unlock_mutex(void *p) +{ + pthread_mutex_unlock(p); +} + +static void server_loop(void) { + int next_slot, server_full, rv, i; + + pthread_cleanup_push(ras_server_cleanup, NULL); + server_full = 0; + next_slot = 0; + + while(1) { + rv = poll(server.fds, server.nclients+1, -1); + if(rv < 0) { + log(ALL, LOG_ERR, "Can't poll the connection file descriptors\n"); + break; + } + + pthread_mutex_lock(&server.fd_mutex); + pthread_cleanup_push(cleanup_unlock_mutex, &server.fd_mutex); + for(i = 0; i <= server.nclients; ++i) { + struct pollfd *p = &server.fds[i]; + int clifd = 0; + + // Check if all poll() notifications have been served + if(!rv) + break; + // If a connection closed, release resources + if(p->revents & POLLHUP) { + --rv; + close(p->fd); + p->fd = -1; // stop tracking in poll() + next_slot = i; // This slot is free + // Poll the socket if it there are connection slots + if(server_full) { + server_full = 0; + server.fds[server.nclients].fd = server.socketfd; + server.fds[server.nclients].events = POLLIN; + } + log(ALL, LOG_INFO, "Client %d disconnected from RAS server\n", + next_slot); + } + // If a connection opened, set-up context and invoke the handler + else if(p->revents & POLLIN) { + --rv; + // Find the next connection slot + if(next_slot < 0) { + for(int w = 0; w < server.nclients; ++w) { + if(server.fds[w].fd < 0) { + next_slot = w; + break; + } + } + } + clifd = accept(server.socketfd, NULL, NULL); + if(clifd < 0) { + log(ALL, LOG_WARNING, "Can't accept client connection\n"); + continue; + } + // No more connection slots, stop polling the server socket + if(next_slot < 0) { + server.fds[server.nclients].fd = -1; + server_full = 1; + close(clifd); + } else { + server.fds[next_slot].fd = clifd; + next_slot = -1; + log(ALL, LOG_INFO, "Client %d connected to RAS server\n", + next_slot); + } + } + } + pthread_cleanup_pop(1); // Call pthread_mutex_unlock + } + pthread_cleanup_pop(1); // Call ras_server_cleanup +} + +void ras_server_stop(void) { + if(server.tid > 0) + pthread_cancel(server.tid); +} + +int ras_server_start(void) { + struct sockaddr_un addr; + + server.socketfd = socket(AF_UNIX, SOCK_STREAM, 0); + if(server.socketfd == -1) { + log(ALL, LOG_WARNING, "Can't create local socket for broadcasting\n"); + goto create_err; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + addr.sun_path[0] = '\0'; + strncpy(addr.sun_path+1, SOCKET_NAME, sizeof(addr.sun_path)-2); + + if(bind(server.socketfd, (struct sockaddr*)&addr, sizeof(addr))){ + log(ALL, LOG_WARNING, "Can't bind to local socket for broadcasting\n"); + goto create_err; + } + if(listen(server.socketfd, SERVER_MAX_CONN)) { + log(ALL, LOG_WARNING, "Can't listen on local socket for broadcasting\n"); + goto create_err; + } + + server.nclients = SERVER_MAX_CONN; + server.fds = calloc(server.nclients+1, sizeof(struct pollfd)); + // Set up socket watchers in poll() + server.fds[server.nclients].fd = server.socketfd; + server.fds[server.nclients].events = POLLIN; + // Set up client connection watchers in poll() + for(int i = 0; i < server.nclients; ++i) { + server.fds[i].fd = -1; + server.fds[i].events = POLLHUP; + } + + if(pthread_mutex_init(&server.fd_mutex, NULL)) { + log(ALL, LOG_WARNING, "Can't create server mutex\n"); + goto create_err; + } + if(pthread_create(&server.tid, NULL, (void*(*)(void*))server_loop, NULL)) { + log(ALL, LOG_WARNING, "Can't create server thread\n"); + goto create_err; + } + + signal(SIGPIPE, SIG_IGN); + log(ALL, LOG_INFO, "Server is listening to connections\n"); + return 0; + + create_err: + ras_server_stop(); + return -1; +} + +// Serialization function for MC events +static inline void ras_mc_msg(char *buf, struct ras_mc_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=mc," + "timestamp=%s," + "error_count=%d," + "error_type=%s," + "msg=%s," + "label=%s," + "mc_index=%c," + "top_layer=%c," + "middle_layer=%c," + "lower_layer=%c," + "address=%llu," + "grain=%llu," + "syndrome=%llu," + "driver_detail=%s", + ev->timestamp, + ev->error_count, + ev->error_type, + ev->msg, + ev->label, + ev->mc_index, + ev->top_layer, + ev->middle_layer, + ev->lower_layer, + ev->address, + ev->grain, + ev->syndrome, + ev->driver_detail); +} + +// Serialization function for AER events +static inline void ras_aer_msg(char *buf, struct ras_aer_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=aer," + "timestamp=%s," + "error_type=%s," + "dev_name=%s," + "msg=%s", + ev->timestamp, + ev->error_type, + ev->dev_name, + ev->msg); +} + +// Serialization function for MCE events +static inline void ras_mce_msg(char *buf, struct mce_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=mce," + "timestamp=%s," + "bank_name=%s," + "mc_location=%s," + "error_msg=%s", + ev->timestamp, + ev->bank_name, + ev->mc_location, + ev->error_msg); +} + +// Serialization function for non_standard events +static inline void ras_non_standard_msg(char *buf, + struct ras_non_standard_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=non_standard," + "timestamp=%s," + "severity=%s," + "length=%d", + ev->timestamp, + ev->severity, + ev->length); +} + +// Serialization function for ARM events +static inline void ras_arm_msg(char *buf, struct ras_arm_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=arm," + "timestamp=%s," + "error_count=%d," + "affinity=%d," + "mpidr=0x%lx," + "midr=0x%lx," + "running_state=%d," + "psci_state=%d", + ev->timestamp, + ev->error_count, + ev->affinity, + ev->mpidr, + ev->midr, + ev->running_state, + ev->psci_state); +} + +// Serialization function for devlink events +static inline void ras_devlink_msg(char *buf, struct devlink_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=devlink," + "timestamp=%s," + "bus_name=%s," + "dev_name=%s," + "driver_name=%s," + "reporter_name=%s," + "msg=%s", + ev->timestamp, + ev->bus_name, + ev->dev_name, + ev->driver_name, + ev->reporter_name, + ev->msg); +} + +// Serialization function for diskerror events +static inline void ras_diskerror_msg(char *buf, struct diskerror_event *ev) { + snprintf(buf, SRV_MSG_SIZE, "type=diskerror," + "timestamp=%s," + "dev=%s," + "sector=%llu," + "nr_sector=%u," + "error=%s," + "rwbs=%s," + "cmd=%s", + ev->timestamp, + ev->dev, + ev->sector, + ev->nr_sector, + ev->error, + ev->rwbs, + ev->cmd); +} + +void ras_server_broadcast(int type, void *ev) { + char msg[SRV_MSG_SIZE]; + size_t sent[SERVER_MAX_CONN]; + int conn_id[SERVER_MAX_CONN]; + int nconn, done, i, rv; + size_t size; + + if(!server.nclients) + return; + // Filter the connection slots to get only the active connections + nconn = 0; + done = 0; + pthread_mutex_lock(&server.fd_mutex); + for(i = 0; i < server.nclients; ++i) { + if(server.fds[i].fd != -1) { + conn_id[nconn] = i; + sent[nconn] = 0; + nconn++; + } + } + pthread_mutex_unlock(&server.fd_mutex); + // Nothing to do if no event or no connections + if(!nconn || !ev) + return; + + // Construct the notification message + switch (type) + { + case MC_EVENT: + ras_mc_msg(msg, (struct ras_mc_event *)ev); + break; + case AER_EVENT: + ras_aer_msg(msg, (struct ras_aer_event *)ev); + break; + case MCE_EVENT: + ras_mce_msg(msg, (struct mce_event *)ev); + break; + case NON_STANDARD_EVENT: + ras_non_standard_msg(msg, (struct ras_non_standard_event *)ev); + break; + case ARM_EVENT: + ras_arm_msg(msg, (struct ras_arm_event *)ev); + break; + case DEVLINK_EVENT: + ras_devlink_msg(msg, (struct devlink_event *)ev); + break; + case DISKERROR_EVENT: + ras_diskerror_msg(msg, (struct diskerror_event *)ev); + break; + default: + return; + } + // Send message to all clients + size = strlen(msg); + while(!done) { + done = 1; + for(i = 0; i < nconn; ++i) { + // Skip if the client already got the data or failed to get it + if(sent[i] >= size || conn_id[i] < 0) + continue; + // Write data to the socket + rv = write(server.fds[conn_id[i]].fd, msg+sent[i], size-sent[i]); + if(rv < 0) { + log(ALL, LOG_ERR, "Failed to write to client process %d\n", i); + conn_id[i] = -1; + continue; + } + sent[i] += rv; + done = done && sent[i] >= size; + } + } +} diff --git a/ras-server.h b/ras-server.h new file mode 100644 index 0000000..5b4842e --- /dev/null +++ b/ras-server.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2020 Alexandre de Limas Santana + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef __RAS_SERVER_H +#define __RAS_SERVER_H + +#include "config.h" +#include "ras-events.h" +#include "ras-mc-handler.h" +#include "ras-mce-handler.h" +#include "ras-aer-handler.h" + +/* RASdaemon Socket name (abstract namespace). */ +#define SOCKET_NAME "rasdaemon" +/* Maximum number of active clients. */ +#define SERVER_MAX_CONN 10 +/* Maximum message size in bytes. */ +#define SRV_MSG_SIZE (8192) + +struct ras_server { + pthread_t tid; // The server thread id + pthread_mutex_t fd_mutex; // Mutex for client file descriptors + int socketfd; // The local Unix socket file descriptor + struct pollfd *fds; // Connected socket and server socket fds + int nclients; // The maximum number of concurrent clients +}; + +#ifdef HAVE_BROADCAST + +int ras_server_start(void); +void ras_server_broadcast(int type, void *ev); +void ras_server_stop(void); + +#else + +inline int ras_server_start(void) { return 0; } +inline void ras_server_broadcast(int type, void *ev) {} +inline void ras_server_stop(void) {} + +#endif + +#endif diff --git a/rasdaemon.c b/rasdaemon.c index 66f4dea..a39b2bd 100644 --- a/rasdaemon.c +++ b/rasdaemon.c @@ -41,6 +41,7 @@ struct arguments { int record_events; int enable_ras; int foreground; + int broadcast_events; }; static error_t parse_opt(int k, char *arg, struct argp_state *state) @@ -58,6 +59,11 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state) case 'r': args->record_events++; break; +#endif +#ifdef HAVE_BROADCAST + case 'b': + args->broadcast_events++; + break; #endif case 'f': args->foreground++; @@ -81,7 +87,9 @@ int main(int argc, char *argv[]) {"record", 'r', 0, 0, "record events via sqlite3", 0}, #endif {"foreground", 'f', 0, 0, "run foreground, not daemonize"}, - +#ifdef HAVE_BROADCAST + {"broadcast", 'b', 0, 0, "broadcast events to other processes"}, +#endif { 0, 0, 0, 0, 0, 0 } }; const struct argp argp = { @@ -116,7 +124,7 @@ int main(int argc, char *argv[]) if (daemon(0,0)) exit(EXIT_FAILURE); - handle_ras_events(args.record_events); + handle_ras_events(args.record_events, args.broadcast_events); return 0; }