Skip to content

Commit

Permalink
rasdaemon: Add support for the CXL AER correctable errors
Browse files Browse the repository at this point in the history
Add support to log and record the CXL AER correctable errors.

The corresponding Kernel patches are here:
https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t
https://lore.kernel.org/linux-cxl/[email protected]/T/#t

Signed-off-by: Shiju Jose <[email protected]>
Reviewed-by: Jonathan Cameron <[email protected]>
Reviewed-by: Dave Jiang <[email protected]>
Signed-off-by: Mauro Carvalho Chehab <[email protected]>
  • Loading branch information
shijujose4 authored and mchehab committed Apr 30, 2023
1 parent a752491 commit a247baf
Show file tree
Hide file tree
Showing 8 changed files with 243 additions and 0 deletions.
81 changes: 81 additions & 0 deletions ras-cxl-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,14 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
#define CXL_AER_UE_IDE_TX_ERR BIT(15)
#define CXL_AER_UE_IDE_RX_ERR BIT(16)

/*
 * Bit masks for the correctable-error (CE) status value carried by the
 * cxl:cxl_aer_correctable_error trace event. Each mask is paired with a
 * human-readable description in the cxl_aer_ce[] table.
 */
#define CXL_AER_CE_CACHE_DATA_ECC BIT(0)
#define CXL_AER_CE_MEM_DATA_ECC BIT(1)
#define CXL_AER_CE_CRC_THRESH BIT(2)
#define CXL_AER_CE_RETRY_THRESH BIT(3)
#define CXL_AER_CE_CACHE_POISON BIT(4)
#define CXL_AER_CE_MEM_POISON BIT(5)
#define CXL_AER_CE_PHYS_LAYER_ERR BIT(6)

struct cxl_error_list {
uint32_t bit;
const char *error;
Expand All @@ -243,6 +251,16 @@ static const struct cxl_error_list cxl_aer_ue[] = {
{ .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" },
};

/*
 * Maps each CXL_AER_CE_* status bit to the text used when logging it.
 * ras_cxl_aer_ce_event_handler() passes this table, together with the
 * event's "status" value, to decode_cxl_error_status().
 */
static const struct cxl_error_list cxl_aer_ce[] = {
{ .bit = CXL_AER_CE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" },
{ .bit = CXL_AER_CE_MEM_DATA_ECC, .error = "Memory Data ECC Error" },
{ .bit = CXL_AER_CE_CRC_THRESH, .error = "CRC Threshold Hit" },
{ .bit = CXL_AER_CE_RETRY_THRESH, .error = "Retry Threshold" },
{ .bit = CXL_AER_CE_CACHE_POISON, .error = "Received Cache Poison From Peer" },
{ .bit = CXL_AER_CE_MEM_POISON, .error = "Received Memory Poison From Peer" },
{ .bit = CXL_AER_CE_PHYS_LAYER_ERR, .error = "Received Error From Physical Layer" },
};

static int decode_cxl_error_status(struct trace_seq *s, uint32_t status,
const struct cxl_error_list *cxl_error_list,
uint8_t num_elems)
Expand Down Expand Up @@ -351,3 +369,66 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s,

return 0;
}

/*
 * Trace handler for the cxl:cxl_aer_correctable_error event.
 *
 * Writes the formatted timestamp followed by the "memdev", "host",
 * "serial" and "status" fields of @record into @s, then hands the
 * collected event to the store/report back ends that are compiled in.
 *
 * @context is used as the struct ras_events instance.
 *
 * Returns 0 on success, or -1 when a field cannot be read from @record
 * or when writing to @s fails.
 */
int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
				 struct tep_record *record,
				 struct tep_event *event, void *context)
{
	struct ras_events *ras = context;
	struct ras_cxl_aer_ce_event ev;
	unsigned long long field_val;
	struct tm *local;
	time_t when;
	int field_len;

	/* Derive the time_t handed to localtime() from the record timestamp. */
	when = record->ts / user_hz + ras->uptime_diff;
	local = localtime(&when);
	if (!local)
		strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
	else
		strftime(ev.timestamp, sizeof(ev.timestamp),
			 "%Y-%m-%d %H:%M:%S %z", local);
	if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
		return -1;

	/* String fields: fetched as raw data and printed with %s. */
	ev.memdev = tep_get_field_raw(s, event, "memdev",
				      record, &field_len, 1);
	if (!ev.memdev || trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0)
		return -1;

	ev.host = tep_get_field_raw(s, event, "host",
				    record, &field_len, 1);
	if (!ev.host || trace_seq_printf(s, "host:%s ", ev.host) <= 0)
		return -1;

	/* Numeric fields. */
	if (tep_get_field_val(s, event, "serial", record, &field_val, 1) < 0)
		return -1;
	ev.serial = field_val;
	if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "status", record, &field_val, 1) < 0)
		return -1;
	ev.error_status = field_val;

	/* Print the label, then let the decoder append the text for the status bits. */
	if (trace_seq_printf(s, "error status:") <= 0 ||
	    decode_cxl_error_status(s, ev.error_status,
				    cxl_aer_ce, ARRAY_SIZE(cxl_aer_ce)) < 0)
		return -1;

	/* Insert data into the database */
#ifdef HAVE_SQLITE3
	ras_store_cxl_aer_ce_event(ras, &ev);
#endif

#ifdef HAVE_ABRT_REPORT
	/* Report event to ABRT */
	ras_report_cxl_aer_ce_event(ras, &ev);
#endif

	return 0;
}
4 changes: 4 additions & 0 deletions ras-cxl-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,8 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);

/*
 * Handler for the cxl:cxl_aer_correctable_error trace event.
 * @context is the struct ras_events instance. Returns 0 on success and
 * -1 if a field cannot be read or the output sequence cannot be written.
 */
int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
#endif
9 changes: 9 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ int toggle_ras_mc_event(int enable)
#ifdef HAVE_CXL
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable);
#endif

free_ras:
Expand Down Expand Up @@ -1001,6 +1002,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_aer_uncorrectable_error");

rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_correctable_error",
ras_cxl_aer_ce_event_handler, NULL, CXL_AER_CE_EVENT);
if (!rc)
num_events++;
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_aer_correctable_error");
#endif

if (!num_events) {
Expand Down
1 change: 1 addition & 0 deletions ras-events.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ enum {
MF_EVENT,
CXL_POISON_EVENT,
CXL_AER_UE_EVENT,
CXL_AER_CE_EVENT,
NR_EVENTS
};

Expand Down
63 changes: 63 additions & 0 deletions ras-record.c
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,53 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve

return rc;
}

/*
 * Table and functions to handle cxl:cxl_aer_correctable_error
 *
 * Column layout of the cxl_aer_ce_event table. The columns after "id"
 * are listed in the same order in which ras_store_cxl_aer_ce_event()
 * binds its parameters 1..5 (timestamp, memdev, host, serial,
 * error_status), so keep both in sync when adding a column.
 */
static const struct db_fields cxl_aer_ce_event_fields[] = {
{ .name = "id", .type = "INTEGER PRIMARY KEY" },
{ .name = "timestamp", .type = "TEXT" },
{ .name = "memdev", .type = "TEXT" },
{ .name = "host", .type = "TEXT" },
{ .name = "serial", .type = "INTEGER" },
{ .name = "error_status", .type = "INTEGER" },
};

/*
 * Descriptor for the "cxl_aer_ce_event" table; passed to
 * ras_mc_create_table() and ras_mc_prepare_stmt() when the database is
 * opened.
 */
static const struct db_table_descriptor cxl_aer_ce_event_tab = {
.name = "cxl_aer_ce_event",
.fields = cxl_aer_ce_event_fields,
.num_fields = ARRAY_SIZE(cxl_aer_ce_event_fields),
};

int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev)
{
int rc;
struct sqlite3_priv *priv = ras->db_priv;

if (!priv || !priv->stmt_cxl_aer_ce_event)
return 0;
log(TERM, LOG_INFO, "cxl_aer_ce_event store: %p\n", priv->stmt_cxl_aer_ce_event);

sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 1, ev->timestamp, -1, NULL);
sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 2, ev->memdev, -1, NULL);
sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 3, ev->host, -1, NULL);
sqlite3_bind_int64(priv->stmt_cxl_aer_ce_event, 4, ev->serial);
sqlite3_bind_int(priv->stmt_cxl_aer_ce_event, 5, ev->error_status);

rc = sqlite3_step(priv->stmt_cxl_aer_ce_event);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed to do cxl_aer_ce_event step on sqlite: error = %d\n", rc);
rc = sqlite3_reset(priv->stmt_cxl_aer_ce_event);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed reset cxl_aer_ce_event on sqlite: error = %d\n",
rc);
log(TERM, LOG_INFO, "register inserted at db\n");

return rc;
}
#endif

/*
Expand Down Expand Up @@ -1032,6 +1079,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}

rc = ras_mc_create_table(priv, &cxl_aer_ce_event_tab);
if (rc == SQLITE_OK) {
rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ce_event,
&cxl_aer_ce_event_tab);
if (rc != SQLITE_OK)
goto error;
}
#endif

ras->db_priv = priv;
Expand Down Expand Up @@ -1169,6 +1224,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n",
cpu, rc);
}

if (priv->stmt_cxl_aer_ce_event) {
rc = sqlite3_finalize(priv->stmt_cxl_aer_ce_event);
if (rc != SQLITE_OK)
log(TERM, LOG_ERR,
"cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n",
cpu, rc);
}
#endif

rc = sqlite3_close_v2(db);
Expand Down
12 changes: 12 additions & 0 deletions ras-record.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,14 @@ struct ras_cxl_aer_ue_event {
uint32_t *header_log;
};

/*
 * One decoded cxl:cxl_aer_correctable_error trace event, as filled in by
 * ras_cxl_aer_ce_event_handler() and consumed by the store/report code.
 */
struct ras_cxl_aer_ce_event {
char timestamp[64]; /* formatted as "%Y-%m-%d %H:%M:%S %z" by the handler */
/*
 * memdev and host are set from tep_get_field_raw() for the "memdev" and
 * "host" trace fields.
 * NOTE(review): presumably they point into the trace record and are not
 * owned by this struct - confirm.
 */
const char *memdev;
const char *host;
uint64_t serial; /* value of the "serial" trace field */
uint32_t error_status; /* value of the "status" trace field (CXL_AER_CE_* bits) */
};

struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
Expand All @@ -155,6 +163,7 @@ struct diskerror_event;
struct ras_mf_event;
struct ras_cxl_poison_event;
struct ras_cxl_aer_ue_event;
struct ras_cxl_aer_ce_event;

#ifdef HAVE_SQLITE3

Expand Down Expand Up @@ -190,6 +199,7 @@ struct sqlite3_priv {
#ifdef HAVE_CXL
sqlite3_stmt *stmt_cxl_poison_event;
sqlite3_stmt *stmt_cxl_aer_ue_event;
sqlite3_stmt *stmt_cxl_aer_ce_event;
#endif
};

Expand Down Expand Up @@ -220,6 +230,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);

#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
Expand All @@ -235,6 +246,7 @@ static inline int ras_store_diskerror_event(struct ras_events *ras, struct diske
static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };

#endif

Expand Down
71 changes: 71 additions & 0 deletions ras-report.c
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,30 @@ static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event
return 0;
}

/*
 * Append the "BACKTRACE=" item describing a CXL AER correctable error
 * event to @buf.
 *
 * @buf: NUL-terminated destination string. The caller visible in this
 *       file, commit_report_backtrace(), passes a buffer of
 *       MAX_BACKTRACE_SIZE bytes, and the append is limited to that size.
 * @ev:  event to format.
 *
 * Returns 0 on success, -1 if @buf or @ev is NULL. Text that does not fit
 * is truncated rather than written past the end of the buffers.
 */
static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event *ev)
{
	char bt_buf[MAX_BACKTRACE_SIZE];
	size_t used;

	if (!buf || !ev)
		return -1;

	/*
	 * memdev/host come from trace data, so bound the formatting with
	 * snprintf(). serial is a uint64_t: print it as unsigned long long
	 * so the conversion matches the argument on 32-bit targets as well,
	 * where "%lx" would expect a 32-bit value.
	 */
	snprintf(bt_buf, sizeof(bt_buf),
		 "BACKTRACE="
		 "timestamp=%s\n"
		 "memdev=%s\n"
		 "host=%s\n"
		 "serial=0x%llx\n"
		 "error_status=%u\n",
		 ev->timestamp,
		 ev->memdev,
		 ev->host,
		 (unsigned long long)ev->serial,
		 (unsigned int)ev->error_status);

	/* Append only what still fits in a MAX_BACKTRACE_SIZE destination. */
	used = strlen(buf);
	if (used < sizeof(bt_buf) - 1)
		strncat(buf, bt_buf, sizeof(bt_buf) - 1 - used);

	return 0;
}

static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
Expand Down Expand Up @@ -440,6 +464,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_AER_UE_EVENT:
rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev);
break;
case CXL_AER_CE_EVENT:
rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev);
break;
default:
return -1;
}
Expand Down Expand Up @@ -936,3 +963,47 @@ int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_ev
else
return -1;
}

/*
 * Send a CXL AER correctable error event over the report socket.
 *
 * Opens the socket with setup_report_socket(), emits the basic items and
 * the backtrace for @ev, then writes the ANALYZER and REASON items, each
 * including its terminating NUL byte. The socket is closed before
 * returning.
 *
 * @ras: not used by this function.
 * @ev:  event to report.
 *
 * Returns 0 when every item was written completely, -1 otherwise.
 */
int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev)
{
	char buf[MAX_MESSAGE_SIZE];
	size_t len;
	int sockfd;
	int done = 0;
	int rc;

	memset(buf, 0, sizeof(buf));

	sockfd = setup_report_socket();
	if (sockfd < 0)
		return -1;

	rc = commit_report_basic(sockfd);
	if (rc < 0)
		goto cxl_aer_ce_fail;

	rc = commit_report_backtrace(sockfd, CXL_AER_CE_EVENT, ev);
	if (rc < 0)
		goto cxl_aer_ce_fail;

	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl-aer-correctable-error");
	len = strlen(buf) + 1;
	rc = write(sockfd, buf, len);
	/*
	 * Test for failure explicitly: comparing a negative int against a
	 * size_t converts it to a huge unsigned value, so a plain
	 * "rc < len" check would treat write() == -1 as success.
	 */
	if (rc < 0 || (size_t)rc < len)
		goto cxl_aer_ce_fail;

	snprintf(buf, sizeof(buf), "REASON=%s", "CXL AER correctable error");
	len = strlen(buf) + 1;
	rc = write(sockfd, buf, len);
	if (rc < 0 || (size_t)rc < len)
		goto cxl_aer_ce_fail;

	done = 1;

cxl_aer_ce_fail:
	/* sockfd is always a valid descriptor here (checked right after setup). */
	close(sockfd);

	return done ? 0 : -1;
}
2 changes: 2 additions & 0 deletions ras-report.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e
int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);

#else

Expand All @@ -54,6 +55,7 @@ static inline int ras_report_diskerror_event(struct ras_events *ras, struct disk
static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };

#endif

Expand Down

0 comments on commit a247baf

Please sign in to comment.