[DOC] improve documentation of return_id, set return_id to True as default addresses #140
MannLabs · Feb 15, 2025 · c4a19e8 · c4a19e8
1 parent 7bd6790
commit c4a19e8
Showing 1 changed file with 15 additions and 8 deletions.
diff --git a/src/scportrait/tools/ml/datasets.py b/src/scportrait/tools/ml/datasets.py
@@ -25,7 +25,7 @@ def __init__(
         index_list: list[list[int]] | None = None,
         select_channel: list[int] | int | None = None,
         transform=None,
-        return_id: bool = False,
+        return_id: bool = True,
         max_level: int = 5,
     ):
         """
@@ -34,7 +34,9 @@ def __init__(
             index_list: List of cell indices to select from the dataset. If set to None all cells are taken. Default is None.
             select_channel: Specify a specific channel or selection of channels to select from the data. Default is None, which returns all channels. Using this operation is more efficient than if this selection occurs via a passed transform.
             transform: An optional user-defined function to apply transformations to the data. Default is None.
-            return_id: Whether to return the index of the cell with the data. Default is False.
+            return_id: Whether to return the unique cell-id of the cell along with the data. Default is `True`.
+                For training purposes this can be set to `False`, but for dataset inference it is generally recommended to set this to `True`,
+                otherwise you can no longer identify the source cell returning a specific result.
             max_level: Maximum levels of directory to search for hdf5 files in the passed paths. Default is 5.
         """
         self.dir_list = dir_list
@@ -383,9 +385,12 @@ class HDF5SingleCellDataset(_HDF5SingleCellDataset):
         dir_labels: List of bulk labels applied to all cells within each dataset in `dir_list`.
         index_list: List of indices to select from the dataset. If `None`, all cells are included. Default is `None`.
         select_channel: Specific channel or list of channels to retrieve from the data. Default is `None`, which returns all channels.
-            This is more efficient than performing selection via a transform function.
+            This is more efficient than performing selection via a transform function as the data is never read in the first place.
         transform: User-defined function to apply transformations to the data. Default is `None`.
         return_id: Whether to return the unique cell-id of the cell along with the data. Default is `False`.
+        return_id: Whether to return the unique cell-id of the cell along with the data. Default is `True`.
+            For training purposes this can be set to `False`, but for dataset inference it is generally recommended to set this to `True`, otherwise
+            you can no longer identify the source cell returning a specific result.
         max_level (int, optional):
             Maximum number of directory levels to search for HDF5 files within the provided paths. Default is `5`.
 
@@ -417,7 +422,7 @@ def __init__(
         index_list: list[list[int]] | None = None,  # list of indices to select from the index
         transform=None,
         max_level: int = 5,
-        return_id: bool = False,
+        return_id: bool = True,
         select_channel: int | list[int] | None = None,
     ):
         super().__init__(
@@ -452,10 +457,12 @@ class LabelledHDF5SingleCellDataset(_HDF5SingleCellDataset):
         label_column_transform: Optional function to apply a mathematical transformation to the read labels.
             For example, if the labels are stored as seconds in the HDF5 dataset, set this value to `lambda x: x / 3600` to return labels in hours.
         index_list: List of indices to select from the dataset. If `None`, all cells are included. Default is `None`.
-        select_channel: Specific channel or selection of channels to retrieve from the data. Default is `None`, which returns all channels.
-            This is more efficient than performing selection via a transform function.
+        select_channel: Specific channel or list of channels to retrieve from the data. Default is `None`, which returns all channels.
+            This is more efficient than performing selection via a transform function as the data is never read in the first place.
         transform: Optional user-defined function to apply transformations to the data. Default is `None`.
-        return_id: Whether to return the unique cell-id of the cell along with the data. Default is `False`.
+        return_id: Whether to return the unique cell-id of the cell along with the data. Default is `True`.
+            For training purposes this can be set to `False`, but for dataset inference it is generally recommended to set this to `True`, otherwise
+            you can no longer identify the source cell returning a specific result.
         max_level: Maximum number of directory levels to search for HDF5 files within the provided paths. Default is `5`.
 
     Methods:
@@ -488,7 +495,7 @@ def __init__(
         index_list: list[list[int]] | None = None,  # list of indices to select from the index
         transform: Callable | None = None,
         max_level: int = 5,
-        return_id: bool = False,
+        return_id: bool = True,
         select_channel: list[int] | None | int = None,
     ):
         super().__init__(