-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for fatal error monitoring #98
base: integ_sp7
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
{ | ||
"Configuration": [ | ||
{ | ||
"ApmlRetries": { | ||
"Description": "Number of APML retry count", | ||
"Value": 10, | ||
"MaxBoundLimit": "50" | ||
} | ||
}, | ||
{ | ||
"SystemRecovery": { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SystemRecoveryMode ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The RAS twiki page mentions SystemRecovery every wherehttps://twiki.amd.com/twiki/bin/view/SIG/Sp5OobRasSpec. |
||
"Description": "System recovery mode", | ||
"Value": "NO_RESET" | ||
} | ||
}, | ||
{ | ||
"HarvestMicrocode": { | ||
"Description": "Harvest microcode version", | ||
"Value": true | ||
} | ||
}, | ||
{ | ||
"HarvestPPIN": { | ||
"Description": "Harvest PPIN", | ||
"Value": true | ||
} | ||
}, | ||
{ | ||
"ResetSignal": { | ||
"Description": "Reset Signal Type", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it Type or Name ? if it is type it should support mutiple right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reset signal can be of 2 types - SYS_RST or RSMRST reset. By default it is SYS_RST |
||
"Value": "SYS_RST" | ||
} | ||
}, | ||
{ | ||
"SigIdOffset": { | ||
"Description": "List of Signature ID offsets", | ||
"Value": [ | ||
"0x30", | ||
"0x34", | ||
"0x28", | ||
"0x2c", | ||
"0x08", | ||
"0x0c", | ||
"null", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is null indicates here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are the deafult values [30h, 34h, 28h, 2Ch, 08h, 0Ch, null, null], which specifies an ordered list of 4-byte aligned offsets to compute the failure signature ID |
||
"null" | ||
] | ||
} | ||
}, | ||
{ | ||
"AifsArmed": { | ||
"Description": "If this field is true, AIFS flow is triggered", | ||
"Value": false | ||
} | ||
}, | ||
{ | ||
"AifsSignatureId": { | ||
"Description": "List of signature Id to check if Aifs is triggered", | ||
"Value": { | ||
"EX-WDT": "0xaea0000000000108000500b020009a00000000004d000000" | ||
} | ||
} | ||
}, | ||
{ | ||
"DisableAifsResetOnSyncfloodCounter": { | ||
"Description": "Disable AIFS Reset on syncfloow counter ", | ||
"Value": true | ||
} | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
#include "cper_generator.hpp" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will continue reveiw Only after addressing all the previous comments related coding style specific. |
||
#include "interface_manager_base.hpp" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. General: Enable Namespace based Code organization: Example: // File: sensor_manager.h
} // namespace sensor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ack |
||
|
||
class ApmlInterfaceManager : public RasManagerBase | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This class should use for implementing Interface function Logic using APML library API's |
||
{ | ||
public: | ||
/** | ||
* @brief Initializes the APML interface manager. | ||
* | ||
* This function performs any necessary initialization for the APML | ||
* interface manager. | ||
*/ | ||
virtual void init(); | ||
|
||
/** | ||
* @brief Configures the APML interface manager. | ||
* | ||
* This function configures the settings for the ADDC enablement. | ||
*/ | ||
virtual void configure(); | ||
|
||
/** | ||
* @brief Constructor for ApmlInterfaceManager. | ||
* | ||
* Initializes the ApmlInterfaceManager with the given object server, | ||
* system bus connection, and I/O service. | ||
* | ||
* @param[in] objectServer Reference to an object server for managing | ||
* D-Bus objects. | ||
* @param[in] systemBus Shared pointer to a D-Bus connection. | ||
* @param[in] io Reference to an I/O service for asynchronous operations. | ||
*/ | ||
ApmlInterfaceManager( | ||
sdbusplus::asio::object_server& objectServer, | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus, | ||
boost::asio::io_service& io) : | ||
RasManagerBase(objectServer, systemBus, io) | ||
{} | ||
|
||
protected: | ||
std::vector<uint8_t> blockId; // Vector to hold block IDs | ||
uint32_t familyId; // Family ID | ||
std::mutex harvest_in_progress_mtx; // Mutex for synchronization | ||
bool p0AlertProcessed = false; // Flag for P0 alert processing | ||
bool p1AlertProcessed = false; // Flag for P1 alert processing | ||
uint64_t recordId = 1; // Record ID | ||
uint16_t debugLogIdOffset; // Offset for debug log ID | ||
uint32_t SignatureID[8]; // Array to hold signature IDs | ||
|
||
/** | ||
* @brief Monitors if APML interface is up | ||
* | ||
* This function monitors the status of the APML interface. | ||
*/ | ||
virtual void interfaceActiveMonitor(); | ||
|
||
/** | ||
* @brief Retrieves the CPU ID. | ||
* | ||
* This function retrieves the CPU ID from the system. | ||
*/ | ||
virtual void getCpuId(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. General: Get should return some value , and also documet error scenarios. here . |
||
|
||
/** | ||
* @brief Finds the program ID. | ||
* | ||
* This function locates the program ID associated with the system. | ||
*/ | ||
virtual void findProgramId(); | ||
|
||
/** | ||
* @brief Harvests fatal error information. | ||
* | ||
* This function processes a fatal error based on its type. | ||
* | ||
* @param[in] errorType The type of fatal error to harvest. | ||
*/ | ||
virtual void harvestFatalError(uint8_t); | ||
|
||
/** | ||
* @brief Triggers a warm reset of the system. | ||
* | ||
* This function initiates a warm reset operation. | ||
*/ | ||
void triggerWarmReset() override; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Util function |
||
|
||
/** | ||
* @brief Clears the SBRMI alert mask. | ||
* | ||
* Requests de-assertion of APML_ALERT_L signal by clearing | ||
* SBRMI::Status[SwAlertSts] | ||
* | ||
* @param[in] soc_num - The socket number. | ||
*/ | ||
void clearSbrmiAlertMask(uint8_t soc_num); | ||
|
||
/** | ||
* @brief Performs platform initialization tasks. | ||
* | ||
* This function executes necessary initialization tasks specific to | ||
* the platform. | ||
*/ | ||
void performPlatformInitialization(); | ||
|
||
/** | ||
* @brief Reads a register from the specified address. | ||
* | ||
* This function reads a value from a register at a given address. | ||
*/ | ||
oob_status_t readRegister(uint8_t, uint32_t, uint8_t*); | ||
|
||
/** | ||
* @brief Writes a value to a register at a specified address. | ||
* | ||
* This function writes a value to a register at a given address. | ||
* | ||
*/ | ||
void writeRegister(uint8_t, uint32_t, uint32_t); | ||
|
||
/** | ||
* @brief Compares values using bitwise AND operation with expected values. | ||
* | ||
* This function checks if there is a match between values and expected | ||
* results using bitwise AND operation. | ||
* | ||
* @param[in] values Pointer to an array of values. | ||
* @param[in] expected The expected string representation of values. | ||
* | ||
* @return True if there is a match, false otherwise. | ||
*/ | ||
bool compare_with_bitwise_AND(const uint32_t* values, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move to utility There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ack |
||
const std::string& expected); | ||
|
||
/** | ||
* @brief Checks if the signature ID matches expected values. | ||
* | ||
* This function verifies if the stored signature ID matches expected | ||
* values. | ||
* | ||
* @return True if there is a match, false otherwise. | ||
*/ | ||
bool checkSignatureIdMatch(); | ||
|
||
/** | ||
* @brief Converts a hexadecimal string to a vector of uint32_t. | ||
* | ||
* This function takes a hexadecimal string and converts it into a | ||
* vector of unsigned 32-bit integers. | ||
* | ||
* @param[in] hexString The hexadecimal string to convert. | ||
* | ||
* @return A vector containing converted values. | ||
*/ | ||
std::vector<uint32_t> hexstring_to_vector(const std::string& hexString); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move to util There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ack |
||
|
||
/** | ||
* @brief MCA data harvesting. | ||
* | ||
*/ | ||
bool harvestMcaValidityCheck(uint8_t type, uint16_t* param1, | ||
uint16_t* param2); | ||
|
||
/** | ||
* @brief Harvests MCA data banks. | ||
* | ||
*/ | ||
template <typename T> | ||
void harvestMcaDataBanks(uint8_t bank, uint16_t, uint16_t, | ||
CperGenerator<T>&); | ||
|
||
/** | ||
* @brief Harvests last transaction address using command 5Ch | ||
* | ||
*/ | ||
void getLastTransAddr(EFI_AMD_FATAL_ERROR_DATA* errorData, uint8_t type); | ||
|
||
void dumpContextInfo(EFI_AMD_FATAL_ERROR_DATA* errorData, uint8_t type); | ||
|
||
void harvestDebugLogDump(EFI_AMD_FATAL_ERROR_DATA* errorData, uint8_t, | ||
uint8_t); | ||
}; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#pragma once | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This heeder file looks cleaner , align to modern c++. |
||
|
||
#include <com/amd/RAS/Configuration/common.hpp> | ||
#include <com/amd/RAS/Configuration/server.hpp> | ||
#include <sdbusplus/asio/object_server.hpp> | ||
#include <sdbusplus/server.hpp> | ||
static constexpr auto objectPath = "/com/amd/RAS"; | ||
|
||
// Type alias for attribute type from the D-Bus configuration. | ||
using AttributeType = | ||
sdbusplus::common::com::amd::ras::Configuration::AttributeType; | ||
|
||
// Type alias for attribute type from the D-Bus configuration. | ||
using Base = sdbusplus::com::amd::RAS::server::Configuration; | ||
|
||
// Type alias for attribute name and value. | ||
using AttributeName = std::string; | ||
using AttributeValue = | ||
std::variant<bool, std::string, int64_t, std::vector<std::string>, | ||
std::map<std::string, std::string>>; | ||
using ConfigTable = | ||
std::map<std::string, | ||
std::tuple<AttributeType, std::string, | ||
std::variant<bool, std::string, int64_t, | ||
std::vector<std::string>, | ||
std::map<std::string, std::string>>, | ||
int64_t>>; | ||
|
||
// Type alias for the configuration table structure. | ||
struct EventDeleter | ||
{ | ||
void operator()(sd_event* event) const | ||
{ | ||
event = sd_event_unref(event); | ||
} | ||
}; | ||
|
||
// Unique pointer type for sd_event with custom deleter. | ||
using EventPtr = std::unique_ptr<sd_event, EventDeleter>; | ||
|
||
/** | ||
* @brief Definition of the RasConfiguration class. | ||
* | ||
* @tparam AttributeName The type for attribute names (usually std::string). | ||
* @tparam AttributeValue The variant type for attribute values. | ||
* @tparam ConfigTable The map type for storing attribute information. | ||
*/ | ||
|
||
class RasConfiguration : public Base | ||
{ | ||
public: | ||
RasConfiguration(sdbusplus::asio::object_server& objectServer, | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus); | ||
|
||
void setAttribute(AttributeName attribute, AttributeValue value) override; | ||
|
||
AttributeValue getAttribute(AttributeName attribute) override; | ||
|
||
private: | ||
sdbusplus::asio::object_server& objServer; | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus; | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Really going to try 50 times?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should allow max bound limit as 50 as internal validation team may set that as high for some rare case failures and debug it.