
Add docstrings
ZhengyanZhu committed Mar 22, 2024
1 parent 757d9b6 commit 2b417dd
Showing 3 changed files with 49 additions and 48 deletions.
82 changes: 42 additions & 40 deletions rul_datasets/reader/ncmapss.py
@@ -123,42 +123,42 @@ def __init__(
truncate_degraded_only: bool = False,
resolution_seconds: int = 1,
padding_value: float = 0.0,
- scaling_range: [float, float] = None,
+ scaling_range: Optional[Tuple[int, int]] = (0, 1),
) -> None:
"""
Create a new reader for the New C-MAPSS dataset. The maximum RUL value is set
to 65 by default. The default channels are the four operating conditions, the
14 physical sensors, and the 14 virtual sensors, in this order.

The default window size is the longest flight cycle in the sub-dataset.
Shorter cycles are padded on the left. The default padding value is zero but
can be overridden, e.g., as -1, to make filtering for padding easier later on.

The default `run_split_dist` is the same as in the original dataset, but with
the last unit of the original train split designated for validation.

If the features are downsampled in time, the default window size is
automatically adjusted to `window_size // resolution_seconds`. Any manually
set `window_size` needs to take this into account, as it is applied after
downsampling.

For more information about using readers, refer to the [reader]
[rul_datasets.reader] module page.

Args:
    fd: The sub-dataset to use. Must be in `[1, 7]`.
    max_rul: The maximum RUL value.
    percent_broken: The maximum relative degradation per unit.
    percent_fail_runs: The percentage or index list of available units.
    feature_select: The indices of the features to use.
    truncate_val: Truncate the validation data with `percent_broken`, too.
    run_split_dist: The assignment of units to each split.
    truncate_degraded_only: Only truncate the degraded part of the data
        (< max RUL).
    resolution_seconds: The number of consecutive seconds to average over for
        downsampling.
    padding_value: The value to use for padding the flight cycles.
    scaling_range: The range to which the features are scaled by the min-max
        scaler.
"""
super().__init__(
fd,
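For orientation, here is a minimal usage sketch of the reader as configured by this constructor, following the calls that appear in the tests changed by this commit. It assumes `NCmapssReader` is importable from `rul_datasets.reader`, as the module path suggests; the `fd`, `scaling_range`, and `resolution_seconds` values are illustrative only.

```python
from rul_datasets.reader import NCmapssReader

# Sub-dataset FD1, features scaled into [-1, 1] instead of the default (0, 1).
reader = NCmapssReader(fd=1, scaling_range=(-1, 1))
reader.prepare_data()  # downloads the data and fits/caches the min-max scaler
features, targets = reader.load_split("dev")

# Downsampling by 10 also shrinks the default window size to
# window_size // 10, as described in the docstring above.
reader_coarse = NCmapssReader(fd=1, resolution_seconds=10)
```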
@@ -217,13 +217,15 @@ def prepare_data(self) -> None:
"""
if not os.path.exists(self._NCMAPSS_ROOT):
_download_ncmapss(self._NCMAPSS_ROOT)
- #if not os.path.exists(self._get_scaler_path()):
- features, _, _ = self._load_data("dev")
- scaler = scaling.fit_scaler(features, MinMaxScaler())
- scaling.save_scaler(scaler, self._get_scaler_path())
+ if not os.path.exists(self._get_scaler_path()):
+     features, _, _ = self._load_data("dev")
+     scaler = scaling.fit_scaler(features, MinMaxScaler(self.scaling_range))
+     scaling.save_scaler(scaler, self._get_scaler_path())

def _get_scaler_path(self):
file_name = f"scaler_{self.fd}_{self.run_split_dist['dev']}.pkl"
file_name = (
f"scaler_{self.fd}_{self.run_split_dist['dev']}_{self.scaling_range}.pkl"
)
file_path = os.path.join(self._NCMAPSS_ROOT, file_name)

return file_path
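Two details are worth noting in this hunk: `MinMaxScaler`'s first positional argument is `feature_range`, so `MinMaxScaler(self.scaling_range)` scales each feature into the configured range; and the range now appears in the scaler's cache filename, presumably so that scalers fitted for different ranges do not overwrite each other. A minimal standalone sketch of the scaling behavior, using synthetic data for illustration:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
features = rng.normal(size=(100, 3))  # 100 samples, 3 feature channels

# Positional feature_range, matching MinMaxScaler(self.scaling_range) above.
scaler = MinMaxScaler((-1, 1))
scaled = scaler.fit_transform(features)

# Every column is mapped into the configured range.
assert scaled.min() >= -1.0 and scaled.max() <= 1.0
```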
@@ -304,7 +306,7 @@ def _select_units(self, units, split):
return [units[i] for i in self.run_split_dist[split]]

def _window_by_cycle(
self, features: np.ndarray, targets: np.ndarray, auxiliary: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
cycle_end_idx = self._get_end_idx(auxiliary[:, 1])
split_features = np.split(features, cycle_end_idx[:-1])
Empty file removed tests/reader/test_ncmapps.py
15 changes: 7 additions & 8 deletions tests/reader/test_ncmapss.py
@@ -47,20 +47,19 @@ def test_prepare_data(should_run, mocker):
mock_save_scaler.assert_not_called()



@pytest.mark.needs_data
@pytest.mark.parametrize("scaling_range", [(-1.0, 1.0), (0.0, 2.0)])
@pytest.mark.parametrize("scaling_range", [(-1, 1), (0, 1)])
def test_scaling_range(scaling_range):
reader = NCmapssReader(fd=1, scaling_range=scaling_range)
reader.prepare_data()
features, _ = reader.load_split("dev")

reader = NCmapssReader(fd=1, scaling_range=(0, 1))
reader.prepare_data()
features_default, _ = reader.load_split("dev")

assert not np.array_equal(features[0][:, :, 1], features_default[0][:, :, 1])

min_val, max_val = scaling_range
for feature in features:
    flat_features = feature.flatten()
    np.testing.assert_almost_equal(
        flat_features, np.clip(flat_features, min_val, max_val)
    )
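The clip-based assertion is a compact range check: an array equals its clipped copy exactly when every element already lies within `[min_val, max_val]`. A tiny standalone illustration:

```python
import numpy as np

values = np.array([-0.5, 0.0, 0.99])
# Passes: nothing is clipped, so the array equals its clipped copy.
np.testing.assert_almost_equal(values, np.clip(values, -1.0, 1.0))
```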


@pytest.mark.needs_data
