Adding PDU support #27

Merged: 6 commits, Jul 15, 2024
8 changes: 7 additions & 1 deletion README.md
@@ -40,7 +40,13 @@ This release supports Dell and HPE servers and collects:

This feature uses [Redfish](https://www.dmtf.org/standards/redfish) protocol with both generic and OEM-specific endpoints.

For more details and usage, see the specific documentation.
If the server is connected to a [PDU](https://en.wikipedia.org/wiki/Power_distribution_unit) and the monitoring feature is enabled,
hwbench can collect power metrics from it.

This release supports the following brands:
- Raritan

For more details and usage, see the specific [documentation](./documentation/monitoring.md).

# How can results be analyzed?
**hwgraph** tool, bundled in the same repository, generates graphs from **hwbench** output files.
113 changes: 113 additions & 0 deletions documentation/monitoring.md
@@ -0,0 +1,113 @@
# Monitoring

hwbench uses a dedicated monitoring engine to collect data from several sources:
- BMC
- PDU
- Turbostat

Depending on the source, power, frequency, thermal, or cooling metrics are collected.

# Concept
During each benchmark, if monitoring is enabled, metrics are collected every 2 seconds and aggregated every 10 seconds to produce statistics over that period of time.

At the end of the benchmark, the monitoring metrics are added to the result file. hwgraph uses them to plot how these components behaved during the benchmark.
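The aggregation step can be sketched as follows (a hypothetical helper, not hwbench's actual implementation): with a 2-second polling period, every 5 raw samples are reduced to min/mean/max statistics covering one 10-second window.

```python
from statistics import mean


def aggregate(samples: list[float], window: int = 5) -> list[dict]:
    """Reduce raw samples into per-window statistics.

    With a 2-second polling period, window=5 covers 10 seconds of data.
    """
    stats = []
    for i in range(0, len(samples), window):
        chunk = samples[i : i + window]
        stats.append({"min": min(chunk), "mean": mean(chunk), "max": max(chunk)})
    return stats


# Ten power readings (watts) polled every 2 seconds -> two 10-second windows
readings = [250.0, 252.0, 251.0, 249.0, 253.0, 260.0, 258.0, 261.0, 259.0, 262.0]
print(aggregate(readings))
```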

# Usage
To enable the monitoring feature, just set the `monitor` directive in the configuration file.

In this release, only `monitor=all` is supported. Future releases will allow listing the sources to be considered.

# Configuration file
When monitoring is enabled, the `-m <config_file>` option must be used to describe the server's configuration.
This file is kept separate from the job file since it may be specific to each host.

Each source is defined as follows:

```
[section_name]
username=<username>
password=<password>
type=<type>
url=<url>
```
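This format is plain INI, so it can be read with Python's standard `configparser`; a minimal sketch (the loader below is an illustration, not hwbench's actual code):

```python
import configparser

# A sample configuration in the format described above
SAMPLE = """
[myPDU]
username=admin
password=admin
type=PDU
driver=raritan
url=http://mypdu/
outlet=21
"""


def load_monitoring_config(text: str) -> dict[str, dict[str, str]]:
    """Parse monitoring configuration text into a dict of sections."""
    parser = configparser.ConfigParser()
    parser.read_string(text)
    return {section: dict(parser[section]) for section in parser.sections()}


config = load_monitoring_config(SAMPLE)
print(config["myPDU"]["driver"])  # raritan
```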

## BMC
When defining a BMC, the `type` must be set to `BMC`. The `section_name` can be the hardware vendor name, like `DELL`, `HPE`, or `default`.

A typical example looks like:

```
[HPE]
username=Administrator
password=YOURPASSWORD
type=BMC
```

**Note**: if no `url` parameter is provided, it will be automatically detected at runtime via ipmitool (or ilorest on HPE systems).

A single BMC configuration is used per server; a vendor-specific section is selected first if it matches the running hardware.

The BMC code uses Redfish endpoints to monitor the server. Vendor-specific endpoints can be used in addition to the generic ones to collect all meaningful metrics.

The hwbench monitoring code requires the BMC to be reachable from the host.

## PDU
When defining a PDU:
- `section_name` is a user-defined value representing this PDU
- the `type` must be set to `PDU`
- a `driver` must be chosen
- a `url` is required
- the `outlet` port must be selected

A typical example looks like:

```
[myPDU]
username=admin
password=admin
type=PDU
driver=raritan
url=http://mypdu/
outlet=21
```

**Note**: Several PDU configurations can be defined and used simultaneously.


### Driver
There are many PDU vendors, and both software quality and protocols vary widely. To ensure good compatibility, drivers can be added to hwbench.

In this release, only the **raritan** driver exists, but since it uses some Redfish endpoints, it might work on other products.

If you have tested it on some other PDUs or have created a custom driver, feel free to push a PR for review.

**Note**: The Raritan driver only exports the power in Watts, but it can easily be extended to collect more metrics.
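A Redfish-style outlet power read can be sketched like this. This is a hypothetical illustration, not the actual driver: the endpoint path (`/redfish/v1/PowerEquipment/RackPDUs/1/Outlets/<outlet>`) and payload layout follow the standard Redfish Outlet schema, and real Raritan firmware may differ.

```python
import base64
import json
from urllib.request import Request, urlopen


def parse_outlet_power(payload: dict) -> float:
    """Extract watts from a Redfish Outlet resource (PowerWatts sensor excerpt)."""
    return float(payload["PowerWatts"]["Reading"])


def read_outlet_power(base_url: str, outlet: int, user: str, password: str) -> float:
    """Fetch the instantaneous power draw of one PDU outlet.

    The endpoint path is an assumption based on the standard Redfish
    Outlet schema; actual firmware may expose a different path.
    """
    url = f"{base_url.rstrip('/')}/redfish/v1/PowerEquipment/RackPDUs/1/Outlets/{outlet}"
    token = base64.b64encode(f"{user}:{password}".encode()).decode()
    req = Request(url, headers={"Authorization": f"Basic {token}"})
    with urlopen(req) as resp:
        return parse_outlet_power(json.load(resp))
```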



## URL
The URL cannot be automatically detected, so it must be provided to hwbench.

## Outlet
This directive selects the physical outlet where the server is connected.

## Outletgroup
Some products support outlet groups where outlets from different PDUs are grouped in a single `outletgroup`.

If the PDU supports it, the `outletgroup` can be used to specify which one to use.
A typical example looks like:

```
[PDU_with_grouped_outlets]
username=admin
password=admin
type=PDU
driver=raritan
url=https://mypdu/
outletgroup=1
```

**Note**: `outlet` and `outletgroup` are mutually exclusive.

# Turbostat
Turbostat is used automatically on x86_64 systems if a release >= 2022.04.16 is already installed on the server. No configuration is required.
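Since turbostat versions are dates, the version gate can be sketched as a simple numeric comparison of the dotted fields (the helper below is an illustration, not hwbench's actual detection code):

```python
def turbostat_is_supported(version: str, minimum: str = "2022.04.16") -> bool:
    """Compare dotted date versions (e.g. '2022.04.16') field by field."""
    def as_tuple(v: str) -> tuple[int, ...]:
        return tuple(int(part) for part in v.split("."))

    return as_tuple(version) >= as_tuple(minimum)


print(turbostat_is_supported("2023.03.17"))  # True
print(turbostat_is_supported("2021.10.04"))  # False
```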
25 changes: 24 additions & 1 deletion graph/hwgraph.py
@@ -90,7 +90,7 @@ def compare_traces(args) -> None:
def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) -> int:
rendered_graphs = 0
bench = trace.bench(bench_name)
for metric_name in ["BMC", "CPU"]:
for metric_name in ["BMC", "CPU", "PDU"]:
metrics = bench.get_component(Metrics.MONITOR, metric_name)
if metrics:
for metric in metrics:
@@ -148,6 +148,28 @@ def graph_cpu(args, trace: Trace, bench_name: str, output_dir) -> int:
return rendered_graphs


def graph_pdu(args, trace: Trace, bench_name: str, output_dir) -> int:
rendered_graphs = 0
bench = trace.bench(bench_name)
pdu_graphs = {}
pdu_graphs["PDU power reporting"] = {Metrics.POWER_CONSUMPTION: "PDU"}
for graph_name in pdu_graphs:
# Let's render the performance, perf_per_temp, perf_per_watt graphs
for metric, filter in pdu_graphs[graph_name].items():
for second_axis in [None, Metrics.THERMAL, Metrics.POWER_CONSUMPTION]:
rendered_graphs += generic_graph(
args,
output_dir,
bench,
metric,
graph_name,
second_axis,
filter=filter,
)

return rendered_graphs


def graph_thermal(args, trace: Trace, bench_name: str, output_dir) -> int:
rendered_graphs = 0
rendered_graphs += generic_graph(
@@ -219,6 +241,7 @@ def valid_traces(args):
)
rendered_graphs += graph_fans(args, trace, bench_name, output_dir)
rendered_graphs += graph_cpu(args, trace, bench_name, output_dir)
rendered_graphs += graph_pdu(args, trace, bench_name, output_dir)
rendered_graphs += graph_thermal(args, trace, bench_name, output_dir)

return rendered_graphs
4 changes: 4 additions & 0 deletions hwbench/bench/benchmarks.py
@@ -158,6 +158,10 @@ def __schedule_benchmark(
# If job needs monitoring, let's create it
if monitoring_config != "none" and not self.monitoring:
self.hardware.vendor.get_bmc().connect_redfish()
self.hardware.vendor.get_bmc().detect()
for pdu in self.hardware.vendor.get_pdus():
pdu.connect_redfish()
pdu.detect()
self.monitoring = Monitoring(self.out_dir, self.jobs_config, self.hardware)

# For each stressor, add a benchmark object to the list
37 changes: 32 additions & 5 deletions hwbench/bench/monitoring.py
@@ -59,6 +59,7 @@ def prepare(self):
"""Preparing the monitoring"""
v = self.vendor
bmc = self.vendor.get_bmc()
pdus = self.vendor.get_pdus()

def check_monitoring(source: str, metric: Metrics):
data = self.get_metric(metric)
@@ -83,9 +84,14 @@ def check_monitoring(source: str, metric: Metrics):
check_monitoring("turbostat", Metrics.FREQ)

print(
f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} driver @ {bmc.get_ip()}"
f"Monitoring/BMC: initialize {v.name()} vendor with {bmc.get_driver_name()} {bmc.get_detect_string()}"
)

for pdu in pdus:
print(
f"Monitoring/PDU: initialize {pdu.get_name()} with {pdu.get_driver_name()} {pdu.get_detect_string()}"
)

# - checking if the bmc monitoring works
# These calls will also initialize the datastructures out of the monitoring loop
self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL))
@@ -104,6 +110,12 @@ def check_monitoring(source: str, metric: Metrics):
)
check_monitoring("BMC", Metrics.POWER_SUPPLIES)

# - checking if pdu monitoring works
if pdus:
for pdu in pdus:
pdu.read_power_consumption(self.get_metric(Metrics.POWER_CONSUMPTION))
check_monitoring("PDU", Metrics.POWER_CONSUMPTION)

def __monitor_bmc(self):
"""Monitor the bmc metrics"""
self.vendor.get_bmc().read_thermals(self.get_metric(Metrics.THERMAL))
@@ -115,6 +127,11 @@ def __monitor_bmc(self):
self.get_metric(Metrics.POWER_SUPPLIES)
)

def __monitor_pdus(self):
"""Monitor the PDU metrics"""
for pdu in self.vendor.get_pdus():
pdu.read_power_consumption(self.get_metric(Metrics.POWER_CONSUMPTION))

def __compact(self):
"""Compute statistics"""
for metric_name, metric_type in self.metrics.items():
@@ -158,6 +175,7 @@ def __monitor(self, precision: int, frequency: int, duration: int):
self.metrics[str(MonitoringMetadata.ITERATION_TIME)] = frequency * precision
self.metrics[str(Metrics.MONITOR)] = {
"BMC": {"Polling": MonitorMetric("Polling", "ms")},
"PDU": {"Polling": MonitorMetric("Polling", "ms")},
"CPU": {"Polling": MonitorMetric("Polling", "ms")},
}
# When will we hit "duration" ?
@@ -187,22 +205,31 @@ def next_iter():

start_bmc = self.get_monotonic_clock()
self.__monitor_bmc()
end_bmc = self.get_monotonic_clock()
end_monitoring = self.get_monotonic_clock()
# Let's monitor the time spent at monitoring the BMC
self.get_metric(Metrics.MONITOR)["BMC"]["Polling"].add(
(end_bmc - start_bmc) * 1e-6
(end_monitoring - start_bmc) * 1e-6
)

if self.vendor.get_pdus():
start_pdu = self.get_monotonic_clock()
self.__monitor_pdus()
end_monitoring = self.get_monotonic_clock()
# Let's monitor the time spent at monitoring the PDUs
self.get_metric(Metrics.MONITOR)["PDU"]["Polling"].add(
(end_monitoring - start_pdu) * 1e-6
)

# We compute the time spent since we started this iteration
monitoring_duration = end_bmc - start_time
monitoring_duration = end_monitoring - start_time

# Based on the time passed, let's compute the amount of sleep time
# to keep in sync with the expected precision
sleep_time_ns = next_iter() - self.get_monotonic_clock() # stime
sleep_time = sleep_time_ns / 1e9

# If the current time + sleep_time is above the total duration (we accept up to 500ms overdue)
if (end_bmc + monitoring_duration + sleep_time_ns) > (
if (end_monitoring + monitoring_duration + sleep_time_ns) > (
end_of_run + 0.5 * 1e9
):
# We can stop the monitoring, no more measures will be done
2 changes: 2 additions & 0 deletions hwbench/bench/monitoring_structs.py
@@ -150,6 +150,7 @@ def __str__(self) -> str:

class PowerContext(Enum):
BMC = "BMC"
PDU = "PDU"
CPU = "CPU"

def __str__(self) -> str:
@@ -166,6 +167,7 @@ class PowerCategories(Enum):
INFRASTRUCTURE = "Infrastructure" # = Chassis - servers (fans, pdb, ..)
SERVERINCHASSIS = "ServerInChassis" # One server + its part of the chassis
SERVER = "Server" # One server
PDU = "Pdu"

def __str__(self) -> str:
return str(self.value)
6 changes: 5 additions & 1 deletion hwbench/environment/hardware.py
@@ -49,11 +49,15 @@ def __init__(self, out_dir: pathlib.Path, monitoring_config):
External_Simple(self.out_dir, ["ipmitool", "sdr"], "ipmitool-sdr")

def dump(self) -> dict[str, Optional[str | int] | dict]:
return {
dump = {
"dmi": self.dmi.dump(),
"cpu": self.cpu.dump(),
"bmc": self.vendor.get_bmc().dump(),
"pdu": {},
}
for pdu in self.vendor.get_pdus():
dump["pdu"][pdu.get_name()] = pdu.dump()
return dump

def cpu_flags(self) -> list[str]:
return self.cpu.get_flags()
2 changes: 1 addition & 1 deletion hwbench/environment/test_dell.py
@@ -16,7 +16,7 @@

class TestDell(TestVendors):
def __init__(self, *args, **kwargs):
super().__init__(Dell("", None, None), *args, **kwargs)
super().__init__(Dell("", None, "tests/mocked_monitoring.cfg"), *args, **kwargs)
self.path = "tests/vendors/Dell/C6615/"

def setUp(self):
2 changes: 1 addition & 1 deletion hwbench/environment/test_hpe.py
@@ -16,7 +16,7 @@

class TestGenericHpe(TestVendors):
def __init__(self, path: str, *args, **kwargs):
super().__init__(Hpe("", None, None), *args, **kwargs)
super().__init__(Hpe("", None, "tests/mocked_monitoring.cfg"), *args, **kwargs)
self.path = path

def setUp(self):
2 changes: 1 addition & 1 deletion hwbench/environment/test_parse.py
@@ -259,4 +259,4 @@ def test_ipmitool_parsing(self):
stdout = (d / "stdout").read_bytes()
stderr = (d / "stderr").read_bytes()
test_target.parse_cmd(stdout, stderr)
assert test_target.get_ip() == "10.168.97.137"
assert test_target.get_url() == "https://10.168.97.137"
7 changes: 6 additions & 1 deletion hwbench/environment/test_vendors.py
@@ -63,10 +63,15 @@ def setUp(self):
# Vendors will override this function to add their specifics
# Once done, they will call this helper
self.install_patch(
"hwbench.environment.vendors.vendor.BMC.connect_redfish",
"hwbench.environment.vendors.bmc.BMC.connect_redfish",
PATCH_TYPES.RETURN_VALUE,
None,
)
self.install_patch(
"hwbench.environment.vendors.vendor.Vendor.find_monitoring_sections",
PATCH_TYPES.RETURN_VALUE,
[],
)
self.get_vendor().prepare()

# tearDown is called at the end of the test