diff --git a/README.md b/README.md index 2656029..68f066e 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,13 @@ This release supports Dell and HPE servers and collects: This feature uses [Redfish](https://www.dmtf.org/standards/redfish) protocol with both generic and OEM-specific endpoints. -For more details and usage, see the specific documentation. +If the server is connected to a [PDU](https://en.wikipedia.org/wiki/Power_distribution_unit), and only if the monitoring feature is enabled, +hwbench can collect power metrics from it. + +This release supports the following brands: + - Raritan + +For more details and usage, see the specific [documentation](./documentation/monitoring.md). # How can results be analyzed? **hwgraph** tool, bundled in the same repository, generates graphs from **hwbench** output files. diff --git a/documentation/monitoring.md b/documentation/monitoring.md new file mode 100644 index 0000000..725f8b3 --- /dev/null +++ b/documentation/monitoring.md @@ -0,0 +1,113 @@ +# Monitoring + +hwbench uses a dedicated monitoring engine to collect data from several sources: +- BMC +- PDU +- Turbostat + +Depending on the source, power, frequency, thermal or cooling metrics are collected. + +# Concept +During each benchmark, and if monitoring is enabled, metrics are collected every 2 seconds and aggregated every 10 seconds to get statistics over this period of time. + +At the end of the benchmark, the monitoring metrics are added to the result file. hwgraph will use them to plot how these components behave during the benchmark. + +# Usage +To enable the monitoring feature, just set the `monitor` directive in the configuration file. + +As of this release, only `monitor=all` is supported. In future releases, it will be possible to list the sources to be considered. + +# Configuration file +When monitoring is enabled, the `-m` option must be used to provide the monitoring configuration file that describes the server's configuration. 
+This file is separated from the job file as it could be specific to each host. + +Each source is defined like the following : + +``` + [section_name] + username= + password= + type= + url= + ``` + + ## BMC + When defining a BMC, the `type` must be set to `BMC`. The `section_name` could be the hardware vendor name like `DELL`, `HPE`, or `default`. + + A typical example looks like : + +``` +[HPE] +username=Administrator +password=YOURPASSWORD +type=BMC +``` + +**Note**: if no ``url`` parameter is provided, it will be automatically detected at runtime via ipmitool tool (or ilorest on HPE systems). + +A single BMC configuration will be used per server, and a vendor-specific section will be selected first if it matches the running hardware. + +The BMC code is using `redfish` endpoints to monitor the server. Vendor-specific endpoints can be used in addition to the generic ones to get all meaningful metrics. + +Hwbench monitoring code requires the BMC to be reachable from the host. + +## PDU +When defining a PDU, +- a `section_name`, a user-defined value to represent this PDU +- the ``type`` must be set to `PDU`. +- a ``driver`` must be chosen +- a ``url`` is required and must be an `https://` url +- the ``outlet`` port must be selected + + A typical example looks like : + +``` +[myPDU] +username=admin +password=admin +type=PDU +driver=raritan +url=https://mypdu/ +outlet=21 +``` + +**Note**: Several PDU configurations can be defined and used simultaneously. + + +### Driver +There are many PDU providers, and both the software quality and the protocols vary a lot between them. To ensure a good compatibility with them, drivers can be added to hwbench. + +For this release, only the **raritan** driver exists but as it uses some redfish endpoints, it might work on other products. + +If you have tested it on some other PDUs or have created a custom driver, feel free to push a PR for review. + +**Note**: The Raritan driver only exports the power in Watts but can be expanded easily to get more metrics. 
+ + + +## URL +The url cannot be automatically detected so it must be provided to hwbench. + +## Outlet +This directive selects the physical outlet where the server is connected. + +## Outletgroup +Some products support outlet groups where outlets from different PDUs are grouped in a single `outletgroup`. + +If the PDU supports it, the `outletgroup` can be used to specify which one to use. +A typical example looks like : + +``` +[PDU_with_grouped_outlets] +username=admin +password=admin +type=PDU +driver=raritan +url=https://mypdu/ +outletgroup=1 +``` + +**Note**: ``outlet`` and ``outletgroup`` are mutually exclusive. + +# Turbostat +Turbostat will be automatically used on x86_64 systems if already installed on the server with release >= 2022.04.16. No configuration is required. \ No newline at end of file diff --git a/graph/hwgraph.py b/graph/hwgraph.py index 46fd1e7..798a8d3 100755 --- a/graph/hwgraph.py +++ b/graph/hwgraph.py @@ -90,7 +90,7 @@ def compare_traces(args) -> None: def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) -> int: rendered_graphs = 0 bench = trace.bench(bench_name) - for metric_name in ["BMC", "CPU"]: + for metric_name in ["BMC", "CPU", "PDU"]: metrics = bench.get_component(Metrics.MONITOR, metric_name) if metrics: for metric in metrics: @@ -148,6 +148,28 @@ def graph_cpu(args, trace: Trace, bench_name: str, output_dir) -> int: return rendered_graphs +def graph_pdu(args, trace: Trace, bench_name: str, output_dir) -> int: + rendered_graphs = 0 + bench = trace.bench(bench_name) + pdu_graphs = {} + pdu_graphs["PDU power reporting"] = {Metrics.POWER_CONSUMPTION: "PDU"} + for graph_name in pdu_graphs: + # Let's render the performance, perf_per_temp, perf_per_watt graphs + for metric, filter in pdu_graphs[graph_name].items(): + for second_axis in [None, Metrics.THERMAL, Metrics.POWER_CONSUMPTION]: + rendered_graphs += generic_graph( + args, + output_dir, + bench, + metric, + graph_name, + second_axis, + filter=filter, + ) 
+ + return rendered_graphs + + def graph_thermal(args, trace: Trace, bench_name: str, output_dir) -> int: rendered_graphs = 0 rendered_graphs += generic_graph( @@ -219,6 +241,7 @@ def valid_traces(args): ) rendered_graphs += graph_fans(args, trace, bench_name, output_dir) rendered_graphs += graph_cpu(args, trace, bench_name, output_dir) + rendered_graphs += graph_pdu(args, trace, bench_name, output_dir) rendered_graphs += graph_thermal(args, trace, bench_name, output_dir) return rendered_graphs diff --git a/hwbench/bench/benchmarks.py b/hwbench/bench/benchmarks.py index f4b87bf..4e6194f 100644 --- a/hwbench/bench/benchmarks.py +++ b/hwbench/bench/benchmarks.py @@ -158,6 +158,10 @@ def __schedule_benchmark( # If job needs monitoring, let's create it if monitoring_config != "none" and not self.monitoring: self.hardware.vendor.get_bmc().connect_redfish() + self.hardware.vendor.get_bmc().detect() + for pdu in self.hardware.vendor.get_pdus(): + pdu.connect_redfish() + pdu.detect() self.monitoring = Monitoring(self.out_dir, self.jobs_config, self.hardware) # For each stressor, add a benchmark object to the list diff --git a/hwbench/bench/monitoring.py b/hwbench/bench/monitoring.py index ee27d68..9507180 100644 --- a/hwbench/bench/monitoring.py +++ b/hwbench/bench/monitoring.py @@ -59,6 +59,7 @@ def prepare(self): """Preparing the monitoring""" v = self.vendor bmc = self.vendor.get_bmc() + pdus = self.vendor.get_pdus() def check_monitoring(source: str, metric: Metrics): data = self.get_metric(metric) @@ -83,9 +84,14 @@ def check_monitoring(source: str, metric: Metrics): check_monitoring("turbostat", Metrics.FREQ) print( - f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} driver @ {bmc.get_ip()}" + f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} {bmc.get_detect_string()}" ) + for pdu in pdus: + print( + f"Monitoring/PDU: initialize {pdu.get_name()} with {pdu.get_driver_name()} {pdu.get_detect_string()}" + ) + # - 
checking if the bmc monitoring works # These calls will also initialize the datastructures out of the monitoring loop self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL)) @@ -104,6 +110,12 @@ def check_monitoring(source: str, metric: Metrics): ) check_monitoring("BMC", Metrics.POWER_SUPPLIES) + # - checking if pdu monitoring works + if pdus: + for pdu in pdus: + pdu.read_power_consumption(self.get_metric(Metrics.POWER_CONSUMPTION)) + check_monitoring("PDU", Metrics.POWER_CONSUMPTION) + def __monitor_bmc(self): """Monitor the bmc metrics""" self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL)) @@ -115,6 +127,11 @@ def __monitor_bmc(self): self.get_metric(Metrics.POWER_SUPPLIES) ) + def __monitor_pdus(self): + """Monitor the PDU metrics""" + for pdu in self.vendor.get_pdus(): + pdu.read_power_consumption(self.get_metric(Metrics.POWER_CONSUMPTION)) + def __compact(self): """Compute statistics""" for metric_name, metric_type in self.metrics.items(): @@ -158,6 +175,7 @@ def __monitor(self, precision: int, frequency: int, duration: int): self.metrics[str(MonitoringMetadata.ITERATION_TIME)] = frequency * precision self.metrics[str(Metrics.MONITOR)] = { "BMC": {"Polling": MonitorMetric("Polling", "ms")}, + "PDU": {"Polling": MonitorMetric("Polling", "ms")}, "CPU": {"Polling": MonitorMetric("Polling", "ms")}, } # When will we hit "duration" ? 
@@ -187,14 +205,23 @@ def next_iter(): start_bmc = self.get_monotonic_clock() self.__monitor_bmc() - end_bmc = self.get_monotonic_clock() + end_monitoring = self.get_monotonic_clock() # Let's monitor the time spent at monitoring the BMC self.get_metric(Metrics.MONITOR)["BMC"]["Polling"].add( - (end_bmc - start_bmc) * 1e-6 + (end_monitoring - start_bmc) * 1e-6 ) + if self.vendor.get_pdus(): + start_pdu = self.get_monotonic_clock() + self.__monitor_pdus() + end_monitoring = self.get_monotonic_clock() + # Let's monitor the time spent at monitoring the PDUs + self.get_metric(Metrics.MONITOR)["PDU"]["Polling"].add( + (end_monitoring - start_pdu) * 1e-6 + ) + # We compute the time spent since we started this iteration - monitoring_duration = end_bmc - start_time + monitoring_duration = end_monitoring - start_time # Based on the time passed, let's compute the amount of sleep time # to keep in sync with the expected precision @@ -202,7 +229,7 @@ def next_iter(): sleep_time = sleep_time_ns / 1e9 # If the the current time + sleep_time is above the total duration (we accept up to 500ms overdue) - if (end_bmc + monitoring_duration + sleep_time_ns) > ( + if (end_monitoring + monitoring_duration + sleep_time_ns) > ( end_of_run + 0.5 * 1e9 ): # We can stop the monitoring, no more measures will be done diff --git a/hwbench/bench/monitoring_structs.py b/hwbench/bench/monitoring_structs.py index 56aaae3..9997dab 100644 --- a/hwbench/bench/monitoring_structs.py +++ b/hwbench/bench/monitoring_structs.py @@ -150,6 +150,7 @@ def __str__(self) -> str: class PowerContext(Enum): BMC = "BMC" + PDU = "PDU" CPU = "CPU" def __str__(self) -> str: @@ -166,6 +167,7 @@ class PowerCategories(Enum): INFRASTRUCTURE = "Infrastructure" # = Chassis - servers (fans, pdb, ..) 
SERVERINCHASSIS = "ServerInChassis" # One server + its part of the chassis SERVER = "Server" # One server + PDU = "Pdu" def __str__(self) -> str: return str(self.value) diff --git a/hwbench/environment/hardware.py b/hwbench/environment/hardware.py index 7520905..5ef94b3 100644 --- a/hwbench/environment/hardware.py +++ b/hwbench/environment/hardware.py @@ -49,11 +49,15 @@ def __init__(self, out_dir: pathlib.Path, monitoring_config): External_Simple(self.out_dir, ["ipmitool", "sdr"], "ipmitool-sdr") def dump(self) -> dict[str, Optional[str | int] | dict]: - return { + dump = { "dmi": self.dmi.dump(), "cpu": self.cpu.dump(), "bmc": self.vendor.get_bmc().dump(), + "pdu": {}, } + for pdu in self.vendor.get_pdus(): + dump["pdu"][pdu.get_name()] = pdu.dump() + return dump def cpu_flags(self) -> list[str]: return self.cpu.get_flags() diff --git a/hwbench/environment/test_dell.py b/hwbench/environment/test_dell.py index 73d35cd..ea79b3a 100644 --- a/hwbench/environment/test_dell.py +++ b/hwbench/environment/test_dell.py @@ -16,7 +16,7 @@ class TestDell(TestVendors): def __init__(self, *args, **kwargs): - super().__init__(Dell("", None, None), *args, **kwargs) + super().__init__(Dell("", None, "tests/mocked_monitoring.cfg"), *args, **kwargs) self.path = "tests/vendors/Dell/C6615/" def setUp(self): diff --git a/hwbench/environment/test_hpe.py b/hwbench/environment/test_hpe.py index 9faf764..7587b7b 100644 --- a/hwbench/environment/test_hpe.py +++ b/hwbench/environment/test_hpe.py @@ -16,7 +16,7 @@ class TestGenericHpe(TestVendors): def __init__(self, path: str, *args, **kwargs): - super().__init__(Hpe("", None, None), *args, **kwargs) + super().__init__(Hpe("", None, "tests/mocked_monitoring.cfg"), *args, **kwargs) self.path = path def setUp(self): diff --git a/hwbench/environment/test_parse.py b/hwbench/environment/test_parse.py index b40c21f..469e613 100644 --- a/hwbench/environment/test_parse.py +++ b/hwbench/environment/test_parse.py @@ -259,4 +259,4 @@ def 
test_ipmitool_parsing(self): stdout = (d / "stdout").read_bytes() stderr = (d / "stderr").read_bytes() test_target.parse_cmd(stdout, stderr) - assert test_target.get_ip() == "10.168.97.137" + assert test_target.get_url() == "https://10.168.97.137" diff --git a/hwbench/environment/test_vendors.py b/hwbench/environment/test_vendors.py index c44eb72..1c6743e 100644 --- a/hwbench/environment/test_vendors.py +++ b/hwbench/environment/test_vendors.py @@ -63,10 +63,15 @@ def setUp(self): # Vendors will override this function to add their specifics # Once done, they will this helper self.install_patch( - "hwbench.environment.vendors.vendor.BMC.connect_redfish", + "hwbench.environment.vendors.bmc.BMC.connect_redfish", PATCH_TYPES.RETURN_VALUE, None, ) + self.install_patch( + "hwbench.environment.vendors.vendor.Vendor.find_monitoring_sections", + PATCH_TYPES.RETURN_VALUE, + [], + ) self.get_vendor().prepare() # tearDown is called at the end of the test diff --git a/hwbench/environment/vendors/bmc.py b/hwbench/environment/vendors/bmc.py new file mode 100644 index 0000000..a3bdfd4 --- /dev/null +++ b/hwbench/environment/vendors/bmc.py @@ -0,0 +1,153 @@ +import pathlib +from .monitoring_device import MonitoringDevice +from ...utils import helpers as h +from ...utils.external import External +from ...bench.monitoring_structs import ( + FanContext, + Power, + PowerCategories, + PowerContext, + MonitorMetric, + Temperature, +) + + +class BMC(MonitoringDevice, External): + def __init__(self, out_dir: pathlib.Path, vendor): + MonitoringDevice.__init__(self, vendor) + External.__init__(self, out_dir) + self.bmc = {} # type: dict[str, str] + self.bmc_section = None + + # For testing purposes, vendor can be None + if self.vendor: + bmc_sections = vendor.find_monitoring_sections("BMC") + if bmc_sections: + self.bmc_section = bmc_sections[0] + + def run_cmd(self) -> list[str]: + return ["ipmitool", "lan", "print"] + + def parse_cmd(self, stdout: bytes, _stderr: bytes): + for row in 
stdout.split(b"\n"): + if b": " in row: + key, value = row.split(b": ", 1) + if key.strip(): + self.bmc[key.strip().decode("utf-8")] = value.strip().decode( + "utf-8" + ) + return self.bmc + + def run_cmd_version(self) -> list[str]: + return ["ipmitool", "-V"] + + def parse_version(self, stdout: bytes, _stderr: bytes) -> bytes: + self.version = stdout.split()[2] + return self.version + + @property + def name(self) -> str: + return "ipmitool-lan-print" + + def get_url(self) -> str: + """Extract the BMC url.""" + # For testing purposes, vendor can be None + if self.vendor: + # If the configuration file provides and url, let's use it + url = self.vendor.monitoring_config_file.get( + self.bmc_section, "url", fallback="" + ) + if url: + return url + + # If no url provided, let's use the ipmi address + + try: + return f"https://{self.bmc['IP Address']}" + except KeyError: + h.fatal("Cannot detect BMC url") + + def connect_redfish(self): + """Connect to the BMC using Redfish.""" + sections = self.vendor.find_monitoring_sections( + "BMC", [self.vendor.name(), "default"], max_sections=1 + ) + if not sections: + h.fatal( + "Cannot find any valid BMC entry of the monitoring configuration file" + ) + + bmc_username = self.vendor.monitoring_config_file.get(sections[0], "username") + bmc_password = self.vendor.monitoring_config_file.get(sections[0], "password") + return super().connect_redfish(bmc_username, bmc_password, self.get_url()) + + def get_thermal(self): + return {} + + def read_thermals( + self, thermals: dict[str, dict[str, Temperature]] = {} + ) -> dict[str, dict[str, Temperature]]: + """Return thermals from server""" + # To be implemented by vendors + return {} + + def read_fans( + self, fans: dict[str, dict[str, MonitorMetric]] = {} + ) -> dict[str, dict[str, MonitorMetric]]: + """Return fans from server""" + # Generic for now, could be override by vendors + if str(FanContext.FAN) not in fans: + fans[str(FanContext.FAN)] = {} # type: ignore[no-redef] + for f in 
self.get_thermal().get("Fans"): + name = f["Name"] + if name not in fans[str(FanContext.FAN)]: + fans[str(FanContext.FAN)][name] = MonitorMetric( + f["Name"], f["ReadingUnits"] + ) + fans[str(FanContext.FAN)][name].add(f["Reading"]) + return fans + + def get_power(self): + """Return the power metrics.""" + return {} + + def read_power_consumption( + self, power_consumption: dict[str, dict[str, Power]] = {} + ) -> dict[str, dict[str, Power]]: + """Return power consumption from server""" + # Generic for now, could be override by vendors + if str(PowerContext.BMC) not in power_consumption: + power_consumption[str(PowerContext.BMC)] = { + str(PowerCategories.SERVER): Power(str(PowerCategories.SERVER)) + } # type: ignore[no-redef] + + power_consumption[str(PowerContext.BMC)][str(PowerCategories.SERVER)].add( + self.get_power().get("PowerControl")[0]["PowerConsumedWatts"] + ) + return power_consumption + + def read_power_supplies( + self, power_supplies: dict[str, dict[str, Power]] = {} + ) -> dict[str, dict[str, Power]]: + """Return power supplies power from server""" + # Generic for now, could be override by vendors + if str(PowerContext.BMC) not in power_supplies: + power_supplies[str(PowerContext.BMC)] = {} # type: ignore[no-redef] + for psu in self.get_power().get("PowerSupplies"): + psu_name = psu["Name"].split()[0] + if psu["Name"] not in power_supplies[str(PowerContext.BMC)]: + power_supplies[str(PowerContext.BMC)][psu["Name"]] = Power(psu_name) + power_supplies[str(PowerContext.BMC)][psu["Name"]].add( + psu["PowerInputWatts"] + ) + return power_supplies + + def detect(self): + """Detect monitoring device""" + bmc_info = self.get_redfish_url("/redfish/v1/Managers/") + members = bmc_info.get("Members") + if not members: + h.fatal("BMC: No member detected in 'Managers' endpoint") + bmc_info = self.get_redfish_url(members[0]["@odata.id"]) + self.firmware_version = bmc_info.get("FirmwareVersion") + self.model = bmc_info.get("Model") diff --git 
a/hwbench/environment/vendors/hpe/hpe.py b/hwbench/environment/vendors/hpe/hpe.py index ed6bff4..c40aa35 100644 --- a/hwbench/environment/vendors/hpe/hpe.py +++ b/hwbench/environment/vendors/hpe/hpe.py @@ -11,6 +11,7 @@ ) from ..vendor import Vendor, BMC from .ilorest import Ilorest, IlorestServerclone, ILOREST +from ....utils import helpers as h class ILO(BMC): @@ -18,8 +19,19 @@ def __init__(self, out_dir: pathlib.Path, vendor: Vendor, ilo: ILOREST): super().__init__(out_dir, vendor) self.ilo = ilo - def get_ip(self) -> str: - return self.ilo.get_ip() + def get_url(self) -> str: + # If the configuration file provides and url, let's use it + url = self.vendor.monitoring_config_file.get( + self.bmc_section, "url", fallback="" + ) + if url: + return url + + ipv4 = self.ilo.get_bmc_ipv4() + if ipv4: + return f"https://{ipv4}" + + h.fatal("Cannot detect BMC url") def get_thermal(self): return self.get_redfish_url("/redfish/v1/Chassis/1/Thermal") @@ -139,11 +151,9 @@ def get_oem_chassis(self): class Hpe(Vendor): def __init__(self, out_dir, dmi, monitoring_config_filename): - self.out_dir = out_dir - self.dmi = dmi + super().__init__(out_dir, dmi, monitoring_config_filename) self.bmc: ILO = None self.ilo = None - self.monitoring_config_filename = monitoring_config_filename def detect(self) -> bool: return self.dmi.info("sys_vendor") == "HPE" diff --git a/hwbench/environment/vendors/hpe/ilorest.py b/hwbench/environment/vendors/hpe/ilorest.py index bc572fc..e458988 100644 --- a/hwbench/environment/vendors/hpe/ilorest.py +++ b/hwbench/environment/vendors/hpe/ilorest.py @@ -133,8 +133,9 @@ def list(self, select, filter=None, to_json=False): return get return json.loads(get) - def get_ip(self): + def get_bmc_ipv4(self): """Return the BMC IPV4 address""" + # If no url provided in the configuration file, let's detect it via ilorest bmc_netconfig = self.list( select="ethernetinterface", filter="id=1", to_json=True ) @@ -144,5 +145,3 @@ def get_ip(self): ipv4 = 
nc.get("IPv4Addresses") if ipv4: return ipv4[0].get("Address") - - h.fatal("Cannot detect BMC ip") diff --git a/hwbench/environment/vendors/mock.py b/hwbench/environment/vendors/mock.py index ea6e8e8..b97a316 100644 --- a/hwbench/environment/vendors/mock.py +++ b/hwbench/environment/vendors/mock.py @@ -12,8 +12,12 @@ class MockedBMC(BMC): - def get_ip(self) -> str: - return "1.2.3.4" + def get_url(self) -> str: + return "https://1.2.3.4" + + def detect(self): + self.firmware_version = "1.0.0" + self.model = "MockedBMC" def read_thermals( self, thermals: dict[str, dict[str, Temperature]] = {} @@ -78,10 +82,10 @@ def connect_redfish(self): class MockVendor(Vendor): - def __init__(self, out_dir, dmi, monitoring_config_filename=None): - self.out_dir = out_dir - self.dmi = dmi - self.monitoring_config_filename = monitoring_config_filename + def __init__( + self, out_dir, dmi, monitoring_config_filename="tests/mocked_monitoring.cfg" + ): + super().__init__(out_dir, dmi, monitoring_config_filename) self.bmc = MockedBMC(self.out_dir, self) def detect(self) -> bool: diff --git a/hwbench/environment/vendors/monitoring_device.py b/hwbench/environment/vendors/monitoring_device.py new file mode 100644 index 0000000..3fb5dec --- /dev/null +++ b/hwbench/environment/vendors/monitoring_device.py @@ -0,0 +1,133 @@ +import cachetools.func +import json +import logging +import redfish # type: ignore +from ...utils import helpers as h +from ...bench.monitoring_structs import ( + MonitorMetric, +) +from typing import Any + + +class MonitoringDevice: + def __init__(self, vendor): + self.vendor = vendor + self.redfish_obj = None + self.logged = False + self.firmware_version = "" + self.model = "" + self.serialnumber = "" + + def __del__(self): + if self.logged: + self.redfish_obj.logout() + + def get_firmware_version(self): + return self.firmware_version + + def get_model(self): + return self.model + + def get_serialnumber(self): + return self.serialnumber + + def get_url(self): + return 
self.vendor.monitoring_config_file.get( + self.pdu_section, "url", fallback="" + ) + + def detect(self): + """Detect monitoring device""" + self.firmware_version = "" + self.model = "" + self.serialnumber = "" + + def get_detect_string(self): + details = f"driver @ {self.get_url()} " + if self.get_model(): + details += f"Model: '{self.get_model()}' " + + if self.get_firmware_version(): + details += f"FW: '{self.get_firmware_version()}' " + + if self.get_serialnumber(): + details += f"Serial: '{self.get_serialnumber()}' " + return details.strip() + + def add_monitoring_value( + self, + monitoring_struct: dict[str, dict[str, MonitorMetric]], + context: Any, + metric: MonitorMetric, + name: str, + value: float, + ) -> dict[str, dict[str, MonitorMetric]]: + """This function add a new in the monitoring data structure.""" + if str(context) not in monitoring_struct: + monitoring_struct[str(context)] = {} + if name not in monitoring_struct[str(context)]: + monitoring_struct[str(context)][name] = metric + monitoring_struct[str(context)][name].add(value) + return monitoring_struct + + def get_driver_name(self) -> str: + """Return the driver name""" + return type(self).__name__ + + def dump(self) -> dict[str, str]: + """Return the dump of the drive""" + dump = {"driver": self.get_driver_name()} + if self.firmware_version: + dump["firmware_version"] = self.firmware_version + if self.model: + dump["model"] = self.model + if self.serialnumber: + dump["serial_number"] = self.serialnumber + if self.get_url(): + dump["url"] = self.get_url() + return dump + + def connect_redfish(self, username: str, password: str, device_url: str): + """Connect to the device using Redfish.""" + try: + if not device_url.startswith("https://"): + h.fatal("redfish url '{device_url}' must be an https url") + self.redfish_obj = redfish.redfish_client( + base_url=device_url, + username=username, + password=password, + default_prefix="/redfish/v1", + timeout=10, + ) + self.redfish_obj.login() + self.logged 
= True + except json.decoder.JSONDecodeError: + h.fatal("JSONDecodeError on {}".format(device_url)) + except redfish.rest.v1.RetriesExhaustedError: + h.fatal("RetriesExhaustedError on {}".format(device_url)) + except redfish.rest.v1.BadRequestError: + h.fatal("BadRequestError on {}".format(device_url)) + except redfish.rest.v1.InvalidCredentialsError: + h.fatal("Invalid credentials for {}".format(device_url)) + except Exception as exception: + h.fatal(type(exception)) + + @cachetools.func.ttl_cache(maxsize=128, ttl=1.5) + def get_redfish_url(self, url): + """Return the content of a Redfish url.""" + # The same url can be called several times like read_thermals() and read_fans() consuming the same redfish endpoint. + # To avoid multiplicating identical redfish calls, a ttl cache is implemented to avoid multiple redfish calls in a row. + # As we want to keep a possible high frequency (< 5sec) precision, let's consider the cache must live up to 1.5 seconds + try: + redfish = self.redfish_obj.get(url, None).dict + # Let's ignore errors and return empty objects + # It will be up to the caller to see there is no answer and process this + # {'error': {'code': 'iLO.0.10.ExtendedInfo', 'message': 'See @Message.ExtendedInfo for more information.', '@Message.ExtendedInfo': [{'MessageArgs': ['/redfish/v1/Chassis/enclosurechassis/'], 'MessageId': 'Base.1.4.ResourceMissingAtURI'}]}} + if redfish and "error" in redfish: + logging.error(f"Parsing redfish url {url} failed : {redfish}") + return {} + return redfish + except redfish.rest.v1.RetriesExhaustedError: + return None + except json.decoder.JSONDecodeError: + return None diff --git a/hwbench/environment/vendors/pdu.py b/hwbench/environment/vendors/pdu.py new file mode 100644 index 0000000..9ea93cd --- /dev/null +++ b/hwbench/environment/vendors/pdu.py @@ -0,0 +1,60 @@ +from .monitoring_device import MonitoringDevice +from ...utils import helpers as h +from ...bench.monitoring_structs import ( + Power, + PowerContext, +) + + 
+class PDU(MonitoringDevice): + def __init__(self, vendor, pdu_section): + super().__init__(vendor) + self.pdu_section = pdu_section + self.outlet = self.vendor.monitoring_config_file.get( + self.pdu_section, "outlet", fallback="" + ) + + def get_url(self): + url = super().get_url() + if not url: + h.fatal(f"Cannot find url for PDU {self.pdu_section}") + return url + + def get_name(self) -> str: + """Return the pdu name.""" + return self.pdu_section + + def connect_redfish(self): + """Connect to the PDU using Redfish.""" + username = self.vendor.monitoring_config_file.get( + self.pdu_section, "username", fallback="" + ) + if not username: + h.fatal(f"Cannot find a username for PDU {self.pdu_section}") + + password = self.vendor.monitoring_config_file.get( + self.pdu_section, "password", fallback="" + ) + if not password: + h.fatal(f"Cannot find a password for PDU {self.pdu_section}") + return super().connect_redfish(username, password, self.get_url()) + + def get_power(self): + """Return the power metrics.""" + return {} + + def read_power_consumption( + self, power_consumption: dict[str, dict[str, Power]] = {} + ) -> dict[str, dict[str, Power]]: + """Return power consumption from server""" + # Generic for now, could be override by vendors + if str(PowerContext.PDU) not in power_consumption: + power_consumption[str(PowerContext.PDU)] = {} # type: ignore[no-redef] + + if self.get_name() not in power_consumption[str(PowerContext.PDU)]: + power_consumption[str(PowerContext.PDU)][self.get_name()] = Power( + self.get_name() + ) + + # To be completed by drivers + return power_consumption diff --git a/hwbench/environment/vendors/pdus/raritan.py b/hwbench/environment/vendors/pdus/raritan.py new file mode 100644 index 0000000..ae0d1fb --- /dev/null +++ b/hwbench/environment/vendors/pdus/raritan.py @@ -0,0 +1,47 @@ +from ....bench.monitoring_structs import Power, PowerContext +from ....utils import helpers as h +from ..pdu import PDU + + +def init(vendor, pdu_section): + 
return Raritan(vendor, pdu_section) + + +class Raritan(PDU): + def __init__(self, vendor, pdu_section): + super().__init__(vendor, pdu_section) + self.outletgroup = self.vendor.monitoring_config_file.get( + self.pdu_section, "outletgroup", fallback="" + ) + if not self.outlet and not self.outletgroup: + h.fatal("PDU/Raritan: An outlet or an outletgroup must be defined.") + + if self.outlet and self.outletgroup: + h.fatal("PDU/Raritan: outlet and outletgroup are mutually exclusive.") + + def detect(self): + """Detect monitoring device""" + pdu_info = self.get_redfish_url("/redfish/v1/PowerEquipment/RackPDUs/1/") + self.firmware_version = pdu_info.get("FirmwareVersion") + self.model = pdu_info.get("Model") + self.serialnumber = pdu_info.get("SerialNumber") + + def get_power(self): + if self.outletgroup: + return self.get_redfish_url( + f"/redfish/v1/PowerEquipment/RackPDUs/1/OutletGroups/{self.outletgroup}/" + ) + else: + return self.get_redfish_url( + f"/redfish/v1/PowerEquipment/RackPDUs/1/Outlets/{self.outlet}/" + ) + + def read_power_consumption( + self, power_consumption: dict[str, dict[str, Power]] = {} + ) -> dict[str, dict[str, Power]]: + """Return power consumption from pdu""" + power_consumption = super().read_power_consumption(power_consumption) + power_consumption[str(PowerContext.PDU)][self.get_name()].add( + self.get_power().get("PowerWatts")["Reading"] + ) + return power_consumption diff --git a/hwbench/environment/vendors/vendor.py b/hwbench/environment/vendors/vendor.py index 50f2c87..a8f4555 100644 --- a/hwbench/environment/vendors/vendor.py +++ b/hwbench/environment/vendors/vendor.py @@ -1,222 +1,9 @@ import configparser -import cachetools.func -import json -import logging import os -import pathlib -import redfish # type: ignore from abc import ABC, abstractmethod -from typing import Any +from .bmc import BMC +from .pdu import PDU from ...utils import helpers as h -from ...utils.external import External -from ...bench.monitoring_structs import ( - 
FanContext, - Power, - PowerCategories, - PowerContext, - MonitorMetric, - Temperature, -) - - -class BMC(External): - def __init__(self, out_dir: pathlib.Path, vendor): - super().__init__(out_dir) - self.bmc = {} # type: dict[str, str] - self.monitoring_config_file: configparser.ConfigParser - self.redfish_obj = None - self.vendor = vendor - self.logged = False - - def __del__(self): - if self.logged: - self.redfish_obj.logout() - - def add_monitoring_value( - self, - monitoring_struct: dict[str, dict[str, MonitorMetric]], - context: Any, - metric: MonitorMetric, - name: str, - value: float, - ) -> dict[str, dict[str, MonitorMetric]]: - """This function add a new in the monitoring data structure.""" - if str(context) not in monitoring_struct: - monitoring_struct[str(context)] = {} - if name not in monitoring_struct[str(context)]: - monitoring_struct[str(context)][name] = metric - monitoring_struct[str(context)][name].add(value) - return monitoring_struct - - def run_cmd(self) -> list[str]: - return ["ipmitool", "lan", "print"] - - def parse_cmd(self, stdout: bytes, _stderr: bytes): - for row in stdout.split(b"\n"): - if b": " in row: - key, value = row.split(b": ", 1) - if key.strip(): - self.bmc[key.strip().decode("utf-8")] = value.strip().decode( - "utf-8" - ) - return self.bmc - - def run_cmd_version(self) -> list[str]: - return ["ipmitool", "-V"] - - def parse_version(self, stdout: bytes, _stderr: bytes) -> bytes: - self.version = stdout.split()[2] - return self.version - - @property - def name(self) -> str: - return "ipmitool-lan-print" - - def get_ip(self) -> str: - """Extract the BMC IP.""" - try: - ip = self.bmc["IP Address"] - except KeyError: - h.fatal("Cannot detect BMC ip") - - return ip - - def get_driver_name(self) -> str: - """Return the BMC driver name""" - return type(self).__name__ - - def dump(self) -> dict[str, str]: - """Return the dump of the BMC""" - return {"driver": self.get_driver_name()} - - def connect_redfish(self): - """Connect to the 
BMC using Redfish.""" - if not self.vendor.get_monitoring_config_filename(): - h.fatal("Missing monitoring configuration file, please use -m option.") - - if not os.path.isfile(self.vendor.get_monitoring_config_filename()): - h.fatal( - f"Monitoring configuration option ({self.vendor.get_monitoring_config_filename()}) is not a file or does not exists." - ) - self.monitoring_config_file = configparser.ConfigParser(allow_no_value=True) - self.monitoring_config_file.read(self.vendor.get_monitoring_config_filename()) - section_name = "" - sections = [self.vendor.name(), "default"] - for section in sections: - if section in self.monitoring_config_file.sections(): - section_name = section - break - if not section_name: - h.fatal( - f"Cannot find any section of {sections} in monitoring configuration file" - ) - - bmc_username = self.monitoring_config_file.get(section_name, "username") - bmc_password = self.monitoring_config_file.get(section_name, "password") - server_url = self.get_ip() - try: - if "https://" not in server_url: - server_url = "https://{}".format(server_url) - self.redfish_obj = redfish.redfish_client( - base_url=server_url, - username=bmc_username, - password=bmc_password, - default_prefix="/redfish/v1", - timeout=10, - ) - self.redfish_obj.login() - self.logged = True - except json.decoder.JSONDecodeError: - h.fatal("JSONDecodeError on {}".format(server_url)) - except redfish.rest.v1.RetriesExhaustedError: - h.fatal("RetriesExhaustedError on {}".format(server_url)) - except redfish.rest.v1.BadRequestError: - h.fatal("BadRequestError on {}".format(server_url)) - except redfish.rest.v1.InvalidCredentialsError: - h.fatal("Invalid credentials for {}".format(server_url)) - except Exception as exception: - h.fatal(type(exception)) - - @cachetools.func.ttl_cache(maxsize=128, ttl=1.5) - def get_redfish_url(self, url): - """Return the content of a Redfish url.""" - # The same url can be called several times like read_thermals() and read_fans() consuming the same 
redfish endpoint. - # To avoid multiplicating identical redfish calls, a ttl cache is implemented to avoid multiple redfish calls in a row. - # As we want to keep a possible high frequency (< 5sec) precision, let's consider the cache must live up to 1.5 seconds - try: - redfish = self.redfish_obj.get(url, None).dict - # Let's ignore errors and return empty objects - # It will be up to the caller to see there is no answer and process this - # {'error': {'code': 'iLO.0.10.ExtendedInfo', 'message': 'See @Message.ExtendedInfo for more information.', '@Message.ExtendedInfo': [{'MessageArgs': ['/redfish/v1/Chassis/enclosurechassis/'], 'MessageId': 'Base.1.4.ResourceMissingAtURI'}]}} - if redfish and "error" in redfish: - logging.error(f"Parsing redfish url {url} failed : {redfish}") - return {} - return redfish - except redfish.rest.v1.RetriesExhaustedError: - return None - except json.decoder.JSONDecodeError: - return None - - def get_thermal(self): - return {} - - def read_thermals( - self, thermals: dict[str, dict[str, Temperature]] = {} - ) -> dict[str, dict[str, Temperature]]: - """Return thermals from server""" - # To be implemented by vendors - return {} - - def read_fans( - self, fans: dict[str, dict[str, MonitorMetric]] = {} - ) -> dict[str, dict[str, MonitorMetric]]: - """Return fans from server""" - # Generic for now, could be override by vendors - if str(FanContext.FAN) not in fans: - fans[str(FanContext.FAN)] = {} # type: ignore[no-redef] - for f in self.get_thermal().get("Fans"): - name = f["Name"] - if name not in fans[str(FanContext.FAN)]: - fans[str(FanContext.FAN)][name] = MonitorMetric( - f["Name"], f["ReadingUnits"] - ) - fans[str(FanContext.FAN)][name].add(f["Reading"]) - return fans - - def get_power(self): - """Return the power metrics.""" - return {} - - def read_power_consumption( - self, power_consumption: dict[str, dict[str, Power]] = {} - ) -> dict[str, dict[str, Power]]: - """Return power consumption from server""" - # Generic for now, could 
be override by vendors - if str(PowerContext.BMC) not in power_consumption: - power_consumption[str(PowerContext.BMC)] = { - str(PowerCategories.SERVER): Power(str(PowerCategories.SERVER)) - } # type: ignore[no-redef] - - power_consumption[str(PowerContext.BMC)][str(PowerCategories.SERVER)].add( - self.get_power().get("PowerControl")[0]["PowerConsumedWatts"] - ) - return power_consumption - - def read_power_supplies( - self, power_supplies: dict[str, dict[str, Power]] = {} - ) -> dict[str, dict[str, Power]]: - """Return power supplies power from server""" - # Generic for now, could be override by vendors - if str(PowerContext.BMC) not in power_supplies: - power_supplies[str(PowerContext.BMC)] = {} # type: ignore[no-redef] - for psu in self.get_power().get("PowerSupplies"): - psu_name = psu["Name"].split()[0] - if psu["Name"] not in power_supplies[str(PowerContext.BMC)]: - power_supplies[str(PowerContext.BMC)][psu["Name"]] = Power(psu_name) - power_supplies[str(PowerContext.BMC)][psu["Name"]].add( - psu["PowerInputWatts"] - ) - return power_supplies class Vendor(ABC): @@ -224,6 +11,7 @@ def __init__(self, out_dir, dmi, monitoring_config_filename): self.out_dir = out_dir self.dmi = dmi self.bmc: BMC = None + self.pdus: list[PDU] = [] self.monitoring_config_filename = monitoring_config_filename @abstractmethod @@ -245,12 +33,71 @@ def name(self) -> str: def get_monitoring_config_filename(self): return self.monitoring_config_filename + def _load_vendor(self, directory: str, vendor: str): + """Load the vendors//check module.""" + from importlib import import_module + from importlib.util import find_spec + + vendor_modulename = f"hwbench.environment.vendors.{directory}.{vendor}" + if not find_spec(vendor_modulename): + h.fatal("cannot_find vendor module {}".format(vendor_modulename)) + + return import_module(vendor_modulename) + def prepare(self): """If the vendor needs some specific code to init itself.""" if not self.bmc: - self.bmc = BMC(self.out_dir, self) + self.bmc 
= BMC(self.out_dir, self, self.find_monitoring_sections("BMC")) self.bmc.run() + if not self.pdus: + pdu_sections = self.find_monitoring_sections("PDU") + for pdu_section in pdu_sections: + pdu_driver_name = self.monitoring_config_file.get( + pdu_section, "driver", fallback="" + ) + if not pdu_driver_name: + h.fatal("PDU configuration requires a driver.") + pdu_driver = self._load_vendor("pdus", pdu_driver_name.lower()).init( + self, pdu_section + ) + self.pdus.append(pdu_driver) def get_bmc(self) -> BMC: """Return the BMC object""" return self.bmc + + def get_pdus(self) -> list[PDU]: + """Return a list of PDUs object""" + return self.pdus + + def find_monitoring_sections( + self, section_type: str, sections_list=[], max_sections=0 + ): + """Return sections of section_type from the monitoring configuration file""" + sections = [] + if not self.get_monitoring_config_filename(): + h.fatal("Missing monitoring configuration file, please use -m option.") + + if not os.path.isfile(self.get_monitoring_config_filename()): + h.fatal( + f"Monitoring configuration option ({self.get_monitoring_config_filename()}) is not a file or does not exists." 
+ ) + self.monitoring_config_file = configparser.ConfigParser(allow_no_value=True) + self.monitoring_config_file.read(self.get_monitoring_config_filename()) + + # If no sections list is provided, let's consider all of them + if not len(sections_list): + sections_list = self.monitoring_config_file.sections() + + for section in sections_list: + if section in self.monitoring_config_file.sections(): + if ( + self.monitoring_config_file.get(section, "type", fallback="") + != section_type + ): + continue + sections.append(section) + if len(sections) == max_sections: + break + + return sections diff --git a/hwbench/tests/mocked_monitoring.cfg b/hwbench/tests/mocked_monitoring.cfg new file mode 100644 index 0000000..c808fc5 --- /dev/null +++ b/hwbench/tests/mocked_monitoring.cfg @@ -0,0 +1,4 @@ +[default] +username=Administrator +password=mocked +type=BMC \ No newline at end of file diff --git a/monitoring.cfg b/monitoring.cfg index c810be5..6c044f4 100644 --- a/monitoring.cfg +++ b/monitoring.cfg @@ -5,11 +5,31 @@ #[HPE] #username=Administrator #password=YOURPASSWORD +#type=BMC #[DELL] #username=Administrator #password=YOURPASSWORD +#type=BMC [default] username=Administrator password=YOURPASSWORD +type=BMC + +# This part is used to define PDUs +#[PDU_1] +#username=admin +#password=admin +#type=PDU +#driver=raritan +#url=http://mypdu/ +#outlet=21 + +#[PDU_with_grouped_outlets] +#username=admin +#password=admin +#type=PDU +#driver=raritan +#url=https://mypdu/ +#outletgroup=1 \ No newline at end of file