Commit

Some small bugfixes, documentation update
MStarmans91 committed Nov 25, 2024
1 parent f47d3f7 commit 461b744
Showing 8 changed files with 100 additions and 53 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG
@@ -10,7 +10,13 @@ and this project adheres to `Semantic Versioning <http://semver.org/>`_
Unreleased
------------------

Changed
~~~~~~~
- Removed the HDF5 validity check, as it took too much memory.

Fixed
~~~~~
- Minor bug in statistical testing for GLCMMS features.

3.6.3 - 2023-08-15
------------------
7 changes: 4 additions & 3 deletions WORC/classification/crossval.py
@@ -865,7 +865,8 @@ def test_RS_Ensemble(estimator_input, X_train, Y_train, X_test, Y_test,
# Get the mean performances and get new ranking
F1_validation = estimator.cv_results_['mean_test_score']
F1_validation = [F1_validation[i] for i in selected_workflows]
- workflow_ranking = np.argsort(np.asarray(F1_validation)).tolist()[::-1] # Normally, rank from smallest to largest, so reverse
+ # workflow_ranking = np.argsort(np.asarray(F1_validation)).tolist()[::-1] # Outdated
+ workflow_ranking = np.argsort(np.where(np.isnan(F1_validation), -np.inf, F1_validation)).tolist()[::-1] # Normally, rank from smallest to largest, so reverse
workflow_ranking = workflow_ranking[0:maxlen] # only maxlen estimators needed for ensembling tests
F1_validation = [F1_validation[i] for i in workflow_ranking]
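
For illustration, a minimal standalone sketch (with made-up scores, not WORC data) of why the NaN replacement in the
new ranking line matters: ``np.argsort`` sorts NaN values to the end in ascending order, so after reversing, a workflow
whose validation score is NaN would be ranked first; mapping NaNs to ``-inf`` pushes such workflows to the bottom instead.

    import numpy as np

    # Toy validation scores for five workflows; the second one failed and is NaN.
    F1_validation = [0.71, np.nan, 0.85, 0.64, 0.79]

    # Old ranking: NaN sorts to the end in ascending order, so reversing puts it on top.
    old_ranking = np.argsort(np.asarray(F1_validation)).tolist()[::-1]
    print(old_ranking)  # [1, 2, 4, 0, 3] -> the failed workflow (index 1) is ranked best

    # New ranking: NaN is first replaced by -inf, so the failed workflow is ranked last.
    scores = np.where(np.isnan(F1_validation), -np.inf, F1_validation)
    new_ranking = np.argsort(scores).tolist()[::-1]
    print(new_ranking)  # [2, 4, 0, 3, 1] -> the failed workflow (index 1) is ranked last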

@@ -917,8 +918,8 @@ def fitvalidationestimator(parameters, train, test):
F1_training = [F1_training[i] for i in selected_workflows]
F1_training = [F1_training[i] for i in workflow_ranking]

- performances[f'Mean training F1-score {key} top {maxlen}'] = F1_validation
- performances[f'Mean validation F1-score {key} top {maxlen}'] = F1_training
+ performances[f'Mean training F1-score {key} top {maxlen}'] = F1_training
+ performances[f'Mean validation F1-score {key} top {maxlen}'] = F1_validation

for ensemble in ensembles:
if isinstance(ensemble, int):
37 changes: 36 additions & 1 deletion WORC/doc/static/developerdocumentation.rst
@@ -1,6 +1,29 @@
Developer documentation
=======================

Information on the `fastr` workflow engine
---------------------------------------------
The `WORC` toolbox makes use of the `fastr` package [1]_, an automated workflow engine.
`fastr` does not provide any implementation of the required (radiomics) algorithms itself,
but purely serves as the computational workflow engine, which has several advantages.

First, `fastr` requires workflows to be modular and split into standardized components
or *tools*, with standardized inputs and outputs. This fits the modular design of `WORC` well; we therefore
wrapped each `WORC` component as a tool in `fastr`. Switching between feature extraction toolboxes, for example,
only requires changing a single field in the `WORC` configuration.

Second, provenance is automatically tracked by `fastr` to facilitate repeatability and reproducibility.

Third, `fastr` offers support for multiple execution plugins in order to be able to
execute the same workflow on different computational resources or clusters. Examples
include linear execution, local threading on multiple CPUs, and SLURM [2]_.

Fourth, `fastr` is agnostic to software language. Hence, instead of restricting the
user to a single programming language, algorithms (e.g., feature toolboxes) can be
supplied in a variety of languages, such as `Python`, `Matlab`, and `R`, or as command line executables.

Fifth, `fastr` provides a variety of import and export plugins for loading and saving
data. Besides local file storage, these include `XNAT` [3]_.
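
To give a rough impression of what such a `fastr` workflow looks like in code, the sketch below builds a trivial
network with one source, one tool node, and one sink using the `fastr` 3.x Python API. The tool id, its input and
output names, the datatypes, and the ``vfs://`` paths are placeholders for illustration and are not part of `WORC`:

.. code-block:: python

    import fastr

    # Build a minimal network: source -> tool -> sink.
    network = fastr.create_network(id='minimal_example')

    # 'ITKImageFile' and 'CSVFile' are example datatypes; the tool id is a placeholder.
    source = network.create_source('ITKImageFile', id='images')
    node = network.create_node('example/my_feature_tool:1.0', tool_version='1.0', id='calcfeatures')
    sink = network.create_sink('CSVFile', id='features_out')

    # Wire the data flow between the components.
    network.create_link(source.output, node.inputs['image'])
    network.create_link(node.outputs['features'], sink.inputs['input'])

    # Execute locally; other execution plugins (e.g., SLURM) can be selected in the fastr configuration.
    network.execute(
        {'images': {'patient_001': 'vfs://tmp/image.nii.gz'}},
        {'features_out': 'vfs://tmp/results/features_{sample_id}{ext}'},
    )

In `WORC` itself, the network, including all sources and sinks, is constructed for you; the sketch only illustrates
the source/tool/sink structure described above.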


Adding a feature processing toolbox
-----------------------------------
We suggest using the wrapping we created around the PyRadiomics toolbox as an example.
@@ -114,4 +137,16 @@ to follow or even copy-paste this example to add your own tools.
`the fastr documentation <https://fastr.readthedocs.io/en/stable/static/quick_start.html#creating-a-simple-network/>`_,
but in principle you can again copy-paste the parts that plot the ROC curve. Make sure you add: the additional
sources (inputs) your tool requires, if they are not already present elsewhere in WORC; the actual tool you made;
and sinks (outputs), so that the output is actually stored in an output folder when your tool is done.


.. _references:

References
==========

.. [1] Achterberg, H. C., Koek, M., & Niessen, W. J. (2016). *Fastr: A Workflow Engine for Advanced Data Flows in Medical Image Analysis*. Frontiers in ICT, 3, 15. https://doi.org/10.3389/fict.2016.00015
.. [2] Yoo, A. B., Jette, M. A., & Grondona, M. (2003). *SLURM: Simple Linux Utility for Resource Management*. Job Scheduling Strategies for Parallel Processing, Lecture Notes in Computer Science, 2862, 44–60. https://doi.org/10.1007/10968987_3
.. [3] Marcus, D. S., Olsen, T. R., Ramaratnam, M., & Buckner, R. L. (2007). *The extensible neuroimaging archive toolkit*. Neuroinformatics, 5(1), 11–33. https://doi.org/10.1385/NI:5:1:11
51 changes: 51 additions & 0 deletions WORC/doc/static/faq.rst
@@ -78,6 +78,55 @@ for the NGTDM:
See also my fork of PyRadiomics, which you can install to fix the issue:
https://github.com/MStarmans91/pyradiomics.

I get (many) errors related to PyRadiomics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Based on our own experience, feedback from WORC users, and the GitHub issues, PyRadiomics 3.1.0 is extremely buggy.
If you are using this version, the errors you get may be related to this. We therefore recommend using the previous
stable version, 3.0.1.
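
A quick way to check which PyRadiomics version is actually installed in your environment (a small sketch using only
the Python standard library; ``importlib.metadata`` requires Python 3.8 or newer):

.. code-block:: python

    from importlib.metadata import version

    # The distribution name on PyPI is 'pyradiomics'.
    installed = version('pyradiomics')
    print(installed)

    if installed.startswith('3.1'):
        print('This is the PyRadiomics release reported as buggy above; '
              'consider installing 3.0.1 instead.')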

Error: ``ValueError: Image/Mask geometry mismatch. Potential fix: increase tolerance using geometryTolerance, see Documentation:Usage:Customizing the Extraction:Settings:geometryTolerance for more information``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The full error will be similar to the following:

.. code-block:: python

    Traceback (most recent call last):
      File "...\lib\site-packages\radiomics\imageoperations.py", line 228, in checkMask
        lsif.Execute(imageNode, maskNode)
      File "...\lib\site-packages\SimpleITK\SimpleITK.py", line 16078, in Execute
        return _SimpleITK.LabelStatisticsImageFilter_Execute(self, image, labelImage)
    RuntimeError: Exception thrown in SimpleITK LabelStatisticsImageFilter_Execute: d:\a\1\sitk-build\itk-prefix\include\itk-5.1\itkImageSink.hxx:242:
    itk::ERROR: itk::ERROR: LabelStatisticsImageFilter(00000280C42E6A10): Inputs do not occupy the same physical space!
    InputImage Origin: [-1.7624083e+01, 9.7990314e+00, -5.3576663e+01], InputImagePrimary Origin: [-1.7623698e+01, 9.7988536e+00, -5.3576664e+01]
    Tolerance: 1.0000000e-04

    During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      File "...\lib\site-packages\radiomics\scripts\segment.py", line 70, in _extractFeatures
        feature_vector.update(extractor.execute(imageFilepath, maskFilepath, label, label_channel))
      File "...\lib\site-packages\radiomics\featureextractor.py", line 276, in execute
        boundingBox, correctedMask = imageoperations.checkMask(image, mask, **_settings)
      File "...\lib\site-packages\radiomics\imageoperations.py", line 243, in checkMask
        raise ValueError('Image/Mask geometry mismatch. Potential fix: increase tolerance using geometryTolerance, '
    ValueError: Image/Mask geometry mismatch. Potential fix: increase tolerance using geometryTolerance, see Documentation:Usage:Customizing the Extraction:Settings:geometryTolerance for more information
Your image and mask do not have exactly the same geometry, i.e., pixel spacing and/or origin, and the difference exceeds
the tolerance PyRadiomics applies, see also https://pyradiomics.readthedocs.io/en/latest/faq.html?highlight=resample#geometry-mismatch-between-image-and-mask.
It is up to you to inspect why this happened and whether it is correct or not. In ``WORC``, to fix this issue, you can simply set the
``["General"]["AssumeSameImageAndMaskMetadata"]`` parameter to ``True``: in that case, during the preprocessing step, ``WORC`` will simply
copy the metadata from the image to your segmentation to ensure they are the same. If you are using ``BasicWORC`` or ``SimpleWORC``,
simply add the following override:

.. code-block:: python

    # WORC configuration values are strings, hence 'True' rather than True.
    overrides = {
        'General': {
            'AssumeSameImageAndMaskMetadata': 'True',
        },
    }
    experiment.add_config_overrides(overrides)

Other
-----
@@ -208,3 +257,5 @@ My jobs on the BIGR cluster get cancelled due to memory errors
You can adjust the memory for various jobs by changing the values in the ``WORC.fastr_memory_parameters`` dictionary
(accessible in ``SimpleWORC`` and ``BasicWORC`` through ``_worc.fastr_memory_parameters``). The fit_and_score job
memory can be adjusted through the WORC HyperOptimization config, see :ref:`Configuration chapter <config-chapter>`.
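
For example, increasing the memory reserved for one of these jobs could look roughly like the sketch below. The
experiment name and the ``'FeatureCalculator'`` key are illustrative assumptions; print the dictionary first to see
which keys exist in your installation:

.. code-block:: python

    from WORC import SimpleWORC

    experiment = SimpleWORC('example_experiment')

    # Inspect which job types have a configurable memory limit.
    print(experiment._worc.fastr_memory_parameters)

    # Raise the limit for one job type; the key name here is an assumed example.
    experiment._worc.fastr_memory_parameters['FeatureCalculator'] = '16G'
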
2 changes: 1 addition & 1 deletion WORC/featureprocessing/StatisticalTestFeatures.py
@@ -274,7 +274,7 @@ def StatisticalTestFeatures(features, patientinfo, config, output_csv=None,
labels.append(1)
elif 'of_' in o.lower():
labels.append(2)
- elif 'glcm_' in o or 'glcmms_' in o.lower():
+ elif 'glcm_' in o.lower() or 'glcmms_' in o.lower():
labels.append(3)
elif 'glrlm_' in o.lower():
labels.append(4)
2 changes: 1 addition & 1 deletion WORC/processing/label_processing.py
@@ -62,7 +62,7 @@ def load_labels(label_file, label_type=None):
for i_label in label_type:
label_index = np.where(label_names == i_label)[0]
if label_index.size == 0:
- raise ae.WORCValueError('Could not find label: ' + str(i_label))
+ raise ae.WORCValueError(f'Could not find label: {i_label}: only label names present are {label_names}.')
else:
labels.append(label_status[:, label_index])

9 changes: 1 addition & 8 deletions WORC/resources/fastr_types/HDF5.py
@@ -33,12 +33,5 @@ def _validate(self):
if not os.path.isfile(parsed_value):
return False

- try:
-     # Read the file and extract features
-     data = pd.read_hdf(parsed_value)
-     return True
-
- except HDF5ExtError:
-     # Not a valid hdf5 file
-     return False
+ return True

39 changes: 0 additions & 39 deletions WORC/tools/Inference.py

This file was deleted.
