Skip to content

Commit

Permalink
rasdaemon: Add support for the CXL memory module events
Browse files Browse the repository at this point in the history
Add support to log and record the CXL memory module events.

Signed-off-by: Shiju Jose <[email protected]>
Signed-off-by: Mauro Carvalho Chehab <[email protected]>
  • Loading branch information
shijujose4 authored and mchehab committed Oct 23, 2023
1 parent 9a2f618 commit f63b4c9
Show file tree
Hide file tree
Showing 8 changed files with 375 additions and 0 deletions.
156 changes: 156 additions & 0 deletions ras-cxl-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -1016,3 +1016,159 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,

return 0;
}

/*
 * Memory Module Event Record (MMER) - Device Event Type names
 *
 * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
 *
 * Indexed by the raw "event_type" value of the cxl_memory_module trace
 * event; the array index is the encoded event type.
 */
static const char *cxl_dev_evt_type[] = {
	[0] = "Health Status Change",
	[1] = "Media Status Change",
	[2] = "Life Used Change",
	[3] = "Temperature Change",
	[4] = "Data Path Error",
	[5] = "LSA Error",
};

/*
 * Device Health Information (DHI) - Health Status bits
 *
 * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
 */
#define CXL_DHI_HS_MAINTENANCE_NEEDED		BIT(0)
#define CXL_DHI_HS_PERFORMANCE_DEGRADED		BIT(1)
#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED	BIT(2)

/*
 * Health Status bit -> label table. The memory module event handler hands
 * this table to decode_cxl_event_flags() together with the raw
 * "health_status" value.
 */
static const struct cxl_event_flags cxl_health_status[] = {
	{
		.bit  = CXL_DHI_HS_MAINTENANCE_NEEDED,
		.flag = "MAINTENANCE_NEEDED",
	},
	{
		.bit  = CXL_DHI_HS_PERFORMANCE_DEGRADED,
		.flag = "PERFORMANCE_DEGRADED",
	},
	{
		.bit  = CXL_DHI_HS_HW_REPLACEMENT_NEEDED,
		.flag = "REPLACEMENT_NEEDED",
	},
};

/*
 * Media Status names, indexed by the raw "media_status" value of the
 * cxl_memory_module trace event.
 */
static const char *cxl_media_status[] = {
	[0] = "Normal",
	[1] = "Not Ready",
	[2] = "Write Persistency Lost",
	[3] = "All Data Lost",
	[4] = "Write Persistency Loss in the Event of Power Loss",
	[5] = "Write Persistency Loss in Event of Shutdown",
	[6] = "Write Persistency Loss Imminent",
	[7] = "All Data Loss in Event of Power Loss",
	[8] = "All Data loss in the Event of Shutdown",
	[9] = "All Data Loss Imminent",
};

/*
 * Names for the two-bit Additional Status sub-fields (life used and
 * device temperature). Only values 0-2 have an entry.
 * NOTE(review): value 3 has no name here; confirm get_cxl_type_str()
 * handles an index beyond the array size it is given.
 */
static const char *cxl_two_bit_status[] = {
	[0] = "Normal",
	[1] = "Warning",
	[2] = "Critical",
};

/*
 * Names for the one-bit Additional Status sub-fields (the two corrected
 * error count indicators).
 */
static const char *cxl_one_bit_status[] = {
	[0] = "Normal",
	[1] = "Warning",
};

/*
 * Additional Status (add_status) sub-field extractors:
 *   bits 1:0 - life used
 *   bits 3:2 - device temperature
 *   bit  4   - corrected volatile error count
 *   bit  5   - corrected persistent error count
 *
 * The argument is parenthesized so that a compound expression such as
 * (a | b) is masked as a whole rather than only its last operand.
 */
#define CXL_DHI_AS_LIFE_USED(as)	((as) & 0x3)
#define CXL_DHI_AS_DEV_TEMP(as)		(((as) & 0xC) >> 2)
#define CXL_DHI_AS_COR_VOL_ERR_CNT(as)	(((as) & 0x10) >> 4)
#define CXL_DHI_AS_COR_PER_ERR_CNT(as)	(((as) & 0x20) >> 5)

/*
 * ras_cxl_memory_module_event_handler - decode a cxl:cxl_memory_module event
 * @s:       trace sequence that receives the "key:value " text
 * @record:  raw trace record the fields are read from
 * @event:   libtraceevent descriptor of the event
 * @context: the struct ras_events instance, passed as void *
 *
 * Reads the common CXL event header, then every memory module field of the
 * record. Each field is appended to @s and cached in a local
 * struct ras_cxl_memory_module_event, which is finally stored in the
 * database (HAVE_SQLITE3) and reported to ABRT (HAVE_ABRT_REPORT).
 *
 * Return: 0 on success, -1 as soon as a field cannot be read or @s cannot
 * be written.
 */
int ras_cxl_memory_module_event_handler(struct trace_seq *s,
					struct tep_record *record,
					struct tep_event *event, void *context)
{
	unsigned long long val;
	struct ras_events *ras = context;
	struct ras_cxl_memory_module_event ev;

	memset(&ev, 0, sizeof(ev));
	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
		return -1;

	if (tep_get_field_val(s, event, "event_type", record, &val, 1) < 0)
		return -1;
	ev.event_type = val;
	if (trace_seq_printf(s, "event_type:%s ",
			     get_cxl_type_str(cxl_dev_evt_type,
					      ARRAY_SIZE(cxl_dev_evt_type),
					      ev.event_type)) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "health_status", record, &val, 1) < 0)
		return -1;
	ev.health_status = val;
	if (trace_seq_printf(s, "health_status:") <= 0)
		return -1;
	if (decode_cxl_event_flags(s, ev.health_status, cxl_health_status,
				   ARRAY_SIZE(cxl_health_status)) < 0)
		return -1;

	if (tep_get_field_val(s, event, "media_status", record, &val, 1) < 0)
		return -1;
	ev.media_status = val;
	if (trace_seq_printf(s, "media_status:%s ",
			     get_cxl_type_str(cxl_media_status,
					      ARRAY_SIZE(cxl_media_status),
					      ev.media_status)) <= 0)
		return -1;

	/* add_status packs four sub-fields; each one is printed separately. */
	if (tep_get_field_val(s, event, "add_status", record, &val, 1) < 0)
		return -1;
	ev.add_status = val;
	if (trace_seq_printf(s, "as_life_used:%s ",
			     get_cxl_type_str(cxl_two_bit_status,
					      ARRAY_SIZE(cxl_two_bit_status),
					      CXL_DHI_AS_LIFE_USED(ev.add_status))) <= 0)
		return -1;
	if (trace_seq_printf(s, "as_dev_temp:%s ",
			     get_cxl_type_str(cxl_two_bit_status,
					      ARRAY_SIZE(cxl_two_bit_status),
					      CXL_DHI_AS_DEV_TEMP(ev.add_status))) <= 0)
		return -1;
	if (trace_seq_printf(s, "as_cor_vol_err_cnt:%s ",
			     get_cxl_type_str(cxl_one_bit_status,
					      ARRAY_SIZE(cxl_one_bit_status),
					      CXL_DHI_AS_COR_VOL_ERR_CNT(ev.add_status))) <= 0)
		return -1;
	if (trace_seq_printf(s, "as_cor_per_err_cnt:%s ",
			     get_cxl_type_str(cxl_one_bit_status,
					      ARRAY_SIZE(cxl_one_bit_status),
					      CXL_DHI_AS_COR_PER_ERR_CNT(ev.add_status))) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "life_used", record, &val, 1) < 0)
		return -1;
	ev.life_used = val;
	if (trace_seq_printf(s, "life_used:%u ", ev.life_used) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "device_temp", record, &val, 1) < 0)
		return -1;
	ev.device_temp = val;
	/*
	 * device_temp is an int16_t, so print it as a signed value; "%u"
	 * would render a negative temperature as a huge unsigned number.
	 */
	if (trace_seq_printf(s, "device_temp:%d ", ev.device_temp) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "dirty_shutdown_cnt", record, &val, 1) < 0)
		return -1;
	ev.dirty_shutdown_cnt = val;
	if (trace_seq_printf(s, "dirty_shutdown_cnt:%u ", ev.dirty_shutdown_cnt) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "cor_vol_err_cnt", record, &val, 1) < 0)
		return -1;
	ev.cor_vol_err_cnt = val;
	if (trace_seq_printf(s, "cor_vol_err_cnt:%u ", ev.cor_vol_err_cnt) <= 0)
		return -1;

	if (tep_get_field_val(s, event, "cor_per_err_cnt", record, &val, 1) < 0)
		return -1;
	ev.cor_per_err_cnt = val;
	if (trace_seq_printf(s, "cor_per_err_cnt:%u ", ev.cor_per_err_cnt) <= 0)
		return -1;

	/* Insert data into the database */
#ifdef HAVE_SQLITE3
	ras_store_cxl_memory_module_event(ras, &ev);
#endif

#ifdef HAVE_ABRT_REPORT
	/* Report event to ABRT */
	ras_report_cxl_memory_module_event(ras, &ev);
#endif

	return 0;
}
3 changes: 3 additions & 0 deletions ras-cxl-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
int ras_cxl_dram_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
/*
 * Handler for the cxl:cxl_memory_module trace event. Decodes the record
 * into the trace sequence @s; @context carries the struct ras_events.
 * Returns 0 on success and -1 on failure.
 */
int ras_cxl_memory_module_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
#endif
9 changes: 9 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable);
#endif

free_ras:
Expand Down Expand Up @@ -1081,6 +1082,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_dram");

rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_memory_module",
ras_cxl_memory_module_event_handler, NULL, CXL_MEMORY_MODULE_EVENT);
if (!rc)
num_events++;
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "memory_module");
#endif

if (!num_events) {
Expand Down
1 change: 1 addition & 0 deletions ras-events.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ enum {
CXL_GENERIC_EVENT,
CXL_GENERAL_MEDIA_EVENT,
CXL_DRAM_EVENT,
CXL_MEMORY_MODULE_EVENT,
NR_EVENTS
};

Expand Down
84 changes: 84 additions & 0 deletions ras-record.c
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,74 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *

return rc;
}

/*
 * Table and functions to handle the cxl:cxl_memory_module trace event
 *
 * Column layout of the "cxl_memory_module_event" table. The columns that
 * follow "id" are listed in the order in which
 * ras_store_cxl_memory_module_event() binds its statement parameters.
 */
static const struct db_fields cxl_memory_module_event_fields[] = {
	{ .name = "id",			.type = "INTEGER PRIMARY KEY" },

	/* Common CXL event header columns */
	{ .name = "timestamp",		.type = "TEXT" },
	{ .name = "memdev",		.type = "TEXT" },
	{ .name = "host",		.type = "TEXT" },
	{ .name = "serial",		.type = "INTEGER" },
	{ .name = "log_type",		.type = "TEXT" },
	{ .name = "hdr_uuid",		.type = "TEXT" },
	{ .name = "hdr_flags",		.type = "INTEGER" },
	{ .name = "hdr_handle",		.type = "INTEGER" },
	{ .name = "hdr_related_handle",	.type = "INTEGER" },
	{ .name = "hdr_ts",		.type = "TEXT" },
	{ .name = "hdr_length",		.type = "INTEGER" },
	{ .name = "hdr_maint_op_class",	.type = "INTEGER" },

	/* Memory module specific columns (statement parameters 13-21) */
	{ .name = "event_type",		.type = "INTEGER" },
	{ .name = "health_status",	.type = "INTEGER" },
	{ .name = "media_status",	.type = "INTEGER" },
	{ .name = "life_used",		.type = "INTEGER" },
	{ .name = "dirty_shutdown_cnt",	.type = "INTEGER" },
	{ .name = "cor_vol_err_cnt",	.type = "INTEGER" },
	{ .name = "cor_per_err_cnt",	.type = "INTEGER" },
	{ .name = "device_temp",	.type = "INTEGER" },
	{ .name = "add_status",		.type = "INTEGER" },
};

/* Descriptor tying the "cxl_memory_module_event" table name to its columns. */
static const struct db_table_descriptor cxl_memory_module_event_tab = {
	.name		= "cxl_memory_module_event",
	.fields		= cxl_memory_module_event_fields,
	.num_fields	= ARRAY_SIZE(cxl_memory_module_event_fields),
};

int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev)
{
int rc;
struct sqlite3_priv *priv = ras->db_priv;

if (!priv || !priv->stmt_cxl_memory_module_event)
return 0;
log(TERM, LOG_INFO, "cxl_memory_module_event store: %p\n",
priv->stmt_cxl_memory_module_event);

ras_store_cxl_common_hdr(priv->stmt_cxl_memory_module_event, &ev->hdr);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 13, ev->event_type);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 14, ev->health_status);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 15, ev->media_status);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 16, ev->life_used);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 17, ev->dirty_shutdown_cnt);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 18, ev->cor_vol_err_cnt);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 19, ev->cor_per_err_cnt);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 20, ev->device_temp);
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 21, ev->add_status);

rc = sqlite3_step(priv->stmt_cxl_memory_module_event);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed to do stmt_cxl_memory_module_event step on sqlite: error = %d\n", rc);
rc = sqlite3_reset(priv->stmt_cxl_memory_module_event);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
log(TERM, LOG_ERR,
"Failed reset stmt_cxl_memory_module_event on sqlite: error = %d\n", rc);
log(TERM, LOG_INFO, "register inserted at db\n");

return rc;
}
#endif

/*
Expand Down Expand Up @@ -1391,6 +1459,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}

rc = ras_mc_create_table(priv, &cxl_memory_module_event_tab);
if (rc == SQLITE_OK) {
rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_memory_module_event,
&cxl_memory_module_event_tab);
if (rc != SQLITE_OK)
goto error;
}
#endif

ras->db_priv = priv;
Expand Down Expand Up @@ -1568,6 +1644,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n",
cpu, rc);
}

if (priv->stmt_cxl_memory_module_event) {
rc = sqlite3_finalize(priv->stmt_cxl_memory_module_event);
if (rc != SQLITE_OK)
log(TERM, LOG_ERR,
"cpu %u: Failed to finalize stmt_cxl_memory_module_event sqlite: error = %d\n",
cpu, rc);
}
#endif

rc = sqlite3_close_v2(db);
Expand Down
17 changes: 17 additions & 0 deletions ras-record.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,19 @@ struct ras_cxl_dram_event {
uint16_t validity_flags;
};

/*
 * Decoded cxl:cxl_memory_module trace event, as filled in by
 * ras_cxl_memory_module_event_handler() and passed to the store/report
 * routines.
 */
struct ras_cxl_memory_module_event {
	struct ras_cxl_event_common_hdr	hdr;	/* common CXL event header */
	uint8_t		event_type;		/* raw device event type code */
	uint8_t		health_status;		/* health status bit flags */
	uint8_t		media_status;		/* raw media status code */
	uint8_t		life_used;
	uint32_t	dirty_shutdown_cnt;
	uint32_t	cor_vol_err_cnt;
	uint32_t	cor_per_err_cnt;
	int16_t		device_temp;		/* signed; unit not shown here */
	uint8_t		add_status;		/* packed additional status bits */
};

struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
Expand All @@ -234,6 +247,7 @@ struct ras_cxl_overflow_event;
struct ras_cxl_generic_event;
struct ras_cxl_general_media_event;
struct ras_cxl_dram_event;
struct ras_cxl_memory_module_event;

#ifdef HAVE_SQLITE3

Expand Down Expand Up @@ -274,6 +288,7 @@ struct sqlite3_priv {
sqlite3_stmt *stmt_cxl_generic_event;
sqlite3_stmt *stmt_cxl_general_media_event;
sqlite3_stmt *stmt_cxl_dram_event;
sqlite3_stmt *stmt_cxl_memory_module_event;
#endif
};

Expand Down Expand Up @@ -309,6 +324,7 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow
int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
/* Store a decoded cxl:cxl_memory_module event in the SQLite database. */
int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev);

#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
Expand All @@ -329,6 +345,7 @@ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ra
static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
/* No-op fallback for builds in this #else branch: stores nothing, returns 0. */
static inline int ras_store_cxl_memory_module_event(struct ras_events *ras,
						    struct ras_cxl_memory_module_event *ev)
{
	return 0;
}

#endif

Expand Down
Loading

0 comments on commit f63b4c9

Please sign in to comment.